Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/e10310ed076c/lib/project.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 23:31:41 2013
Кодировка:
allpy: e10310ed076c lib/project.py

allpy

view lib/project.py @ 140:e10310ed076c

geometrical_core.py: run building of GCs
author boris <bnagaev@gmail.com>
date Sun, 24 Oct 2010 18:15:41 +0400
parents 32b9f4fadd35
children 004c2f6c45ac
line source
1 #!/usr/bin/python
3 """
4 "I will not use abbrev."
5 "I will always finish what I st"
6 - Bart Simpson
8 """
10 import sequence
11 from monomer import AminoAcidType
12 import allpy_data
13 import os
14 from tempfile import mkstemp
15 import block
17 Block = block.Block
19 class Project(object):
20 """
21 Mandatory data:
22 * sequences -- list of Sequence objects. Sequences don't contain gaps
23 - see sequence.py module
24 * alignment -- dict
25 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}
26 keys are the Sequence objects, values are the lists, which
27 contain monomers of those sequences or None for gaps in the
28 corresponding sequence of
29 alignment
31 """
32 def __init__(self, *args):
33 """overloaded constructor
35 Project() -> new empty Project
36 Project(sequences, alignment) -> new Project with sequences and
37 alignment initialized from arguments
38 Project(fasta_file) -> new Project, read alignment and sequences
39 from fasta file
41 """
42 if len(args)>1:#overloaded constructor
43 self.sequences=args[0]
44 self.alignment=args[1]
45 elif len(args)==0:
46 self.sequences=[]
47 self.alignment={}
48 else:
49 self.sequences,self.alignment=Project.from_fasta(args[0])
51 def __len__(self):
52 return max([len(line) for line in self.alignment.values()])
54 def thickness(self):
55 """ The number of sequences in alignment (it's thickness).
56 """
57 return len(self.alignment)
59 def calc_identity(self):
60 """ Calculate the identity of alignment positions for colouring.
62 For every (row, column) in alignment the percentage of the exactly
63 same residue in the same column in the alignment is calculated.
64 The data structure is just like the Project.alignment, but istead of
65 monomers it contains float percentages.
66 """
67 # Oh, God, that's awful! Absolutely not understandable.
68 # First, calculate percentages of amino acids in every column
69 contribution = 1.0 / len(self.sequences)
70 all_columns = []
71 for position in range(len(self)):
72 column_percentage = {}
73 for seq in self.alignment:
74 if self.alignment[seq][position] is not None:
75 aa = self.alignment[seq][position].code
76 else:
77 aa = None
78 if aa in allpy_data.amino_acids:
79 if aa in column_percentage.keys():
80 column_percentage[aa] += contribution
81 else:
82 column_percentage[aa] = contribution
83 all_columns.append(column_percentage)
84 # Second, map these percentages onto the alignment
85 self.identity_percentages = {}
86 for seq in self.sequences:
87 self.identity_percentages[seq] = []
88 for seq in self.identity_percentages:
89 line = self.identity_percentages[seq]
90 for position in range(len(self)):
91 if self.alignment[seq][position] is not None:
92 aa = self.alignment[seq][position].code
93 else:
94 aa = None
95 line.append(all_columns[position].get(aa))
96 return self.identity_percentages
98 @staticmethod
99 def from_fasta(file, monomer_kind=AminoAcidType):
100 """
101 >>> import project
102 >>> sequences,alignment=project.Project.from_fasta(open("test.fasta"))
103 """
104 import re
106 sequences=[]
107 alignment={}
109 content=file.read()
110 raw_sequences=content.split(">")[1:]#ignore everything before the first >
111 for raw in raw_sequences:
112 parsed_raw_sequence = raw.split("\n")
113 for counter,piece in enumerate(parsed_raw_sequence):
114 parsed_raw_sequence[counter]=piece.strip()#cut \r or whitespaces
115 name_and_description = parsed_raw_sequence[0]
116 if len(name_and_description.split(" ",1))==2:
117 name, description = name_and_description.split(" ",1)
118 elif len(name_and_description.split(" ", 1)) == 1:#if there is description
119 name = name_and_description
120 else:
121 raise "Wrong name of sequence in fasta file"
122 string=""
123 for piece in parsed_raw_sequence[1:]:
124 piece_without_whitespace_chars=re.sub("\s","",piece)
125 string+=piece_without_whitespace_chars
126 monomers=[]#convert into Monomer objects
127 alignment_list=[]#create the respective list in alignment dict
128 for current_monomer in string:
129 if current_monomer!="-" and current_monomer!="." and current_monomer!="~":
130 monomers.append(monomer_kind.from_code1(current_monomer).instance())
131 alignment_list.append(monomers[-1])
132 else:
133 alignment_list.append(None)
134 if "description" in vars():#if there's no description
135 sequences.append(sequence.Sequence(monomers,name,description))
136 else:
137 sequences.append(sequence.Sequence(monomers,name))
138 alignment[sequences[-1]]=alignment_list
139 return sequences,alignment
142 @staticmethod
143 def from_sequences(*sequences):
144 """
145 Constructs new alignment from sequences
146 Add gaps to right end to make equal lengthes of alignment sequences
147 """
148 project = Project()
149 project.sequences = sequences
150 max_length = max(len(sequence) for sequence in sequences)
151 for sequence in sequences:
152 gaps_count = max_length - len(sequence)
153 project.alignment[sequence] = sequence.monomers + [None] * gaps_count
154 return project
156 def save_fasta(self, out_file, long_line=60, gap='-'):
157 """
158 Saves alignment to given file
159 Splits long lines to substrings of length=long_line
160 To prevent this, set long_line=None
161 """
162 Block(self).save_fasta(out_file, long_line=long_line, gap=gap)
164 def muscle_align(self):
165 """
166 Simple align ths alignment using sequences (muscle)
167 uses old Monomers and Sequences objects
168 """
169 tmp_file, tmp_filename = mkstemp()
170 os.close(tmp_file) # this is file descriptor, not normal file object.
171 tmp_file = open(tmp_filename, 'w')
172 self.save_fasta(tmp_file)
173 tmp_file.close()
174 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_filename})
175 sequences, alignment = Project.from_fasta(open(tmp_filename))
176 for sequence in self.sequences:
177 try:
178 new_sequence = [i for i in sequences if sequence==i][0]
179 except:
180 raise Exception("Align: Cann't find sequence %s in muscle output" % \
181 sequence.name)
182 old_monomers = iter(sequence.monomers)
183 self.alignment[sequence] = []
184 for monomer in alignment[new_sequence]:
185 if not monomer:
186 self.alignment[sequence].append(monomer)
187 else:
188 old_monomer = old_monomers.next()
189 if monomer != old_monomer:
190 raise Exception("Align: alignment errors")
191 self.alignment[sequence].append(old_monomer)
193 def column(self, sequence=None, sequences=None, original=None):
194 """
195 returns list of columns of alignment
196 sequence or sequences:
197 if sequence is given, then column is (original_monomer, monomer)
198 if sequences is given, then column is (original_monomer, {sequence: monomer})
199 if both of them are given, it is an error
200 original (Sequence type):
201 if given, this filters only columns represented by original sequence
202 """
203 if sequence and sequences:
204 raise Exception("Wrong usage. read help")
205 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)])
206 alignment = self.alignment.items()
207 alignment.sort(key=lambda i: indexes[i[0]])
208 alignment = [monomers for seq, monomers in alignment]
209 for column in zip(*alignment):
210 if not original or column[indexes[original]]:
211 if sequence:
212 yield (column[indexes[original]], column[indexes[sequence]])
213 else:
214 yield (column[indexes[original]],
215 dict([(s, column[indexes[s]]) for s in sequences]))
217 def pdb_auto_add(self, conformity_file=None):
218 """
219 Adds pdb information to each sequence
221 TODO: conformity_file
222 """
223 conformity = {}
225 for sequence in self.sequences:
226 sequence.pdb_auto_add(conformity.get(sequence.name, None))