allpy: 48dc5974eead lib/project.py

allpy

view lib/project.py @ 122:48dc5974eead

lib::block::geometrical_core fixes 1. gap treatment 2. only one (or zero) distance case

author	boris <bnagaev@gmail.com>
date	Sat, 23 Oct 2010 23:16:38 +0400
parents	62e38970ffa2
children	0b366ce3257f

line source

1 #!/usr/bin/python

3 """

4 "I will not use abbrev."

5 "I will always finish what I st"

6 - Bart Simpson

8 """

10 import sequence

11 from monomer import AminoAcidType

12 import allpy_data

13 import os

14 from tempfile import mkstemp

15 import block

17 Block = block.Block

19 class Project(object):

20 """

21 Mandatory data:

22 * sequences -- list of Sequence objects. Sequences don't contain gaps

23 - see sequence.py module

24 * alignment -- dict

25 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}

26 keys are the Sequence objects, values are the lists, which

27 contain monomers of those sequences or None for gaps in the

28 corresponding sequence of

29 alignment

31 """

32 def __init__(self, *args):

33 """overloaded constructor

35 Project() -> new empty Project

36 Project(sequences, alignment) -> new Project with sequences and

37 alignment initialized from arguments

38 Project(fasta_file) -> new Project, read alignment and sequences

39 from fasta file

41 """

42 if len(args)>1:#overloaded constructor

43 self.sequences=args[0]

44 self.alignment=args[1]

45 elif len(args)==0:

46 self.sequences=[]

47 self.alignment={}

48 else:

49 self.sequences,self.alignment=Project.from_fasta(args[0])

51 def __len__(self):

52 return max([len(line) for line in self.alignment.values()])

54 def thickness(self):

55 """ The number of sequences in alignment (it's thickness).

56 """

57 return len(self.alignment)

59 def calc_identity(self):

60 """ Calculate the identity of alignment positions for colouring.

62 For every (row, column) in alignment the percentage of the exactly

63 same residue in the same column in the alignment is calculated.

64 The data structure is just like the Project.alignment, but istead of

65 monomers it contains float percentages.

66 """

67 # Oh, God, that's awful! Absolutely not understandable.

68 # First, calculate percentages of amino acids in every column

69 contribution = 1.0 / len(self.sequences)

70 all_columns = []

71 for position in range(len(self)):

72 column_percentage = {}

73 for seq in self.alignment:

74 if self.alignment[seq][position] is not None:

75 aa = self.alignment[seq][position].code

76 else:

77 aa = None

78 if aa in allpy_data.amino_acids:

79 if aa in column_percentage.keys():

80 column_percentage[aa] += contribution

81 else:

82 column_percentage[aa] = contribution

83 all_columns.append(column_percentage)

84 # Second, map these percentages onto the alignment

85 self.identity_percentages = {}

86 for seq in self.sequences:

87 self.identity_percentages[seq] = []

88 for seq in self.identity_percentages:

89 line = self.identity_percentages[seq]

90 for position in range(len(self)):

91 if self.alignment[seq][position] is not None:

92 aa = self.alignment[seq][position].code

93 else:

94 aa = None

95 line.append(all_columns[position].get(aa))

96 return self.identity_percentages

98 @staticmethod

99 def from_fasta(file, monomer_kind=AminoAcidType):

100 """

101 >>> import project

102 >>> sequences,alignment=project.Project.from_fasta(open("test.fasta"))

103 """

104 import re

105

106 sequences=[]

107 alignment={}

108

109 content=file.read()

110 raw_sequences=content.split(">")[1:]#ignore everything before the first >

111 for raw in raw_sequences:

112 parsed_raw_sequence = raw.split("\n")

113 for counter,piece in enumerate(parsed_raw_sequence):

114 parsed_raw_sequence[counter]=piece.strip()#cut \r or whitespaces

115 name_and_description = parsed_raw_sequence[0]

116 if len(name_and_description.split(" ",1))==2:

117 name, description = name_and_description.split(" ",1)

118 elif len(name_and_description.split(" ", 1)) == 1:#if there is description

119 name = name_and_description

120 else:

121 raise "Wrong name of sequence in fasta file"

122 string=""

123 for piece in parsed_raw_sequence[1:]:

124 piece_without_whitespace_chars=re.sub("\s","",piece)

125 string+=piece_without_whitespace_chars

126 monomers=[]#convert into Monomer objects

127 alignment_list=[]#create the respective list in alignment dict

128 for current_monomer in string:

129 if current_monomer!="-" and current_monomer!="." and current_monomer!="~":

130 monomers.append(monomer_kind.from_code1(current_monomer).instance())

131 alignment_list.append(monomers[-1])

132 else:

133 alignment_list.append(None)

134 if "description" in vars():#if there's no description

135 sequences.append(sequence.Sequence(monomers,name,description))

136 else:

137 sequences.append(sequence.Sequence(monomers,name))

138 alignment[sequences[-1]]=alignment_list

139 return sequences,alignment

140

141

142 @staticmethod

143 def from_sequences(*sequences):

144 """

145 Constructs new alignment from sequences

146 Add gaps to right end to make equal lengthes of alignment sequences

147 """

148 project = Project()

149 project.sequences = sequences

150 max_length = max(len(sequence) for sequence in sequences)

151 for sequence in sequences:

152 gaps_count = max_length - len(sequence)

153 project.alignment[sequence] = sequence.monomers + [None] * gaps_count

154 return project

155

156 def save_fasta(self, out_file, long_line=60):

157 """

158 Saves alignment to given file

159 Splits long lines to substrings of length=long_line

160 To prevent this, set long_line=None

161 """

162 Block(self).save_fasta(out_file, long_line)

163

164 def muscle_align(self):

165 """

166 Simple align ths alignment using sequences (muscle)

167 uses old Monomers and Sequences objects

168 """

169 tmp_file, tmp_filename = mkstemp()

170 os.close(tmp_file) # this is file descriptor, not normal file object. security issue )

171 tmp_file = open(tmp_filename, 'w')

172 self.save_fasta(tmp_file)

173 tmp_file.close()

174 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_filename})

175 sequences, alignment = Project.from_fasta(open(tmp_filename))

176 for sequence in self.sequences:

177 try:

178 new_sequence = [i for i in sequences if sequence==i][0]

179 except:

180 raise Exception("Align: Cann't find sequence %s in muscle output" % sequence.name)

181 old_monomers = iter(sequence.monomers)

182 self.alignment[sequence] = []

183 for monomer in alignment[new_sequence]:

184 if not monomer:

185 self.alignment[sequence].append(monomer)

186 else:

187 old_monomer = old_monomers.next()

188 if monomer != old_monomer:

189 raise Exception("Align: alignment errors")

190 self.alignment[sequence].append(old_monomer)

191

192 def column(self, sequence=None, sequences=None, original=None):

193 """

194 returns list of columns of alignment

195 sequence or sequences:

196 if sequence is given, then column is (original_monomer, monomer)

197 if sequences is given, then column is dict (original_monomer, {sequence: monomer})

198 if both of them are given, it is an error

199 original (Sequence type):

200 if given, this filters only columns represented by original sequence

201 """

202 if sequence and sequences:

203 raise Exception("Wrong usage. read help")

204 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])

205 alignment = self.alignment.items()

206 alignment.sort(key=lambda i: indexes[i[0]])

207 alignment = [monomers for seq, monomers in alignment]

208 for column in zip(*alignment):

209 if not original or column[indexes[original]]:

210 if sequence:

211 yield (column[indexes[original]], column[indexes[sequence]])

212 else:

213 yield (column[indexes[original]], dict([(s, column[indexes[s]]) for s in sequences]))

214