rev |
line source |
me@261
|
1 import sys |
me@261
|
2 import os |
me@262
|
3 import os.path |
me@261
|
4 from tempfile import NamedTemporaryFile |
me@262
|
5 import urllib2 |
me@261
|
6 |
me@261
|
7 import config |
me@261
|
8 from graph import Graph |
me@262
|
9 from Bio.PDB import Superimposer, CaPPBuilder, PDBIO |
me@262
|
10 from Bio.PDB.DSSP import make_dssp_dict |
me@262
|
11 from allpy.pdb import std_id, pdb_id_parse, get_structure |
bnagaev@249
|
12 from fasta import save_fasta |
me@260
|
13 import data.codes |
me@260
|
14 |
me@260
|
15 class MonomerType(object): |
me@260
|
16 """Class of monomer types. |
me@260
|
17 |
me@260
|
18 Each MonomerType object represents a known monomer type, e.g. Valine, |
me@260
|
19 and is referenced to by each instance of monomer in a given sequence. |
me@260
|
20 |
me@260
|
21 - `name`: full name of monomer type |
me@260
|
22 - `code1`: one-letter code |
me@260
|
23 - `code3`: three-letter code |
me@260
|
24 - `is_modified`: either of True or False |
me@260
|
25 |
me@260
|
26 class atributes: |
me@260
|
27 |
me@260
|
28 - `by_code1`: a mapping from one-letter code to MonomerType object |
me@260
|
29 - `by_code3`: a mapping from three-letter code to MonomerType object |
me@260
|
30 - `by_name`: a mapping from monomer name to MonomerType object |
me@260
|
31 - `instance_type`: class of Monomer objects to use when creating new |
me@260
|
32 objects; this must be redefined in descendent classes |
me@260
|
33 |
me@260
|
34 All of the class attributes MUST be redefined when subclassing. |
me@260
|
35 """ |
me@260
|
36 |
me@260
|
37 by_code1 = {} |
me@260
|
38 by_code3 = {} |
me@260
|
39 by_name = {} |
me@260
|
40 instance_type = None |
me@260
|
41 |
me@260
|
42 def __init__(self, name="", code1="", code3="", is_modified=False): |
me@260
|
43 self.name = name.capitalize() |
me@260
|
44 self.code1 = code1.upper() |
me@260
|
45 self.code3 = code3.upper() |
me@260
|
46 self.is_modified = bool(is_modified) |
me@260
|
47 if not is_modified: |
me@260
|
48 self.by_code1[self.code1] = self |
me@260
|
49 self.by_code3[code3] = self |
me@260
|
50 self.by_name[name] = self |
me@260
|
51 # We duplicate distinguished long names into MonomerType itself, |
me@260
|
52 # so that we can use MonomerType.from_code3 to create the relevant |
me@260
|
53 # type of monomer. |
me@260
|
54 MonomerType.by_code3[code3] = self |
me@260
|
55 MonomerType.by_name[name] = self |
me@260
|
56 |
me@260
|
57 @classmethod |
me@260
|
58 def _initialize(cls, type_letter, codes=data.codes.codes): |
me@260
|
59 """Create all relevant instances of MonomerType. |
me@260
|
60 |
me@260
|
61 `type_letter` is either of: |
me@260
|
62 |
me@260
|
63 - 'p' for protein |
me@260
|
64 - 'd' for DNA |
me@260
|
65 - 'r' for RNA |
me@260
|
66 |
me@260
|
67 `codes` is a table of monomer codes |
me@260
|
68 """ |
me@260
|
69 for type, code1, is_modified, code3, name in codes: |
me@260
|
70 if type == type_letter: |
me@260
|
71 cls(name, code1, code3, is_modified) |
me@260
|
72 |
me@260
|
73 @classmethod |
me@260
|
74 def from_code1(cls, code1): |
me@260
|
75 """Return monomer type by one-letter code.""" |
me@260
|
76 return cls.by_code1[code1.upper()] |
me@260
|
77 |
me@260
|
78 @classmethod |
me@260
|
79 def from_code3(cls, code3): |
me@260
|
80 """Return monomer type by three-letter code.""" |
me@260
|
81 return cls.by_code3[code3.upper()] |
me@260
|
82 |
me@260
|
83 @classmethod |
me@260
|
84 def from_name(cls, name): |
me@260
|
85 """Return monomer type by name.""" |
me@260
|
86 return cls.by_name[name.capitalize()] |
me@260
|
87 |
me@260
|
88 def instance(self): |
me@260
|
89 """Create a new monomer of given type.""" |
me@260
|
90 return self.instance_type(self) |
me@260
|
91 |
me@260
|
92 def __eq__(self, other): |
me@260
|
93 if hasattr(other, "type"): |
me@260
|
94 return self is other.type |
me@260
|
95 return self is other |
me@260
|
96 |
me@260
|
97 class Monomer(object): |
me@260
|
98 """Monomer object. |
me@260
|
99 |
me@260
|
100 attributes: |
me@260
|
101 |
me@260
|
102 - `type`: type of monomer (a MonomerType object) |
me@260
|
103 |
me@260
|
104 class attribute `monomer_type` is MonomerType or either of it's subclasses, |
me@260
|
105 it is used when creating new monomers. It MUST be redefined when subclassing Monomer. |
me@260
|
106 """ |
me@260
|
107 monomer_type = MonomerType |
me@260
|
108 |
me@260
|
109 def __init__(self, type): |
me@260
|
110 self.type = type |
me@260
|
111 |
me@260
|
112 @classmethod |
me@260
|
113 def from_code1(cls, code1): |
me@260
|
114 return cls(cls.monomer_type.by_code1[code1.upper()]) |
me@260
|
115 |
me@260
|
116 @classmethod |
me@260
|
117 def from_code3(cls, code3): |
me@260
|
118 return cls(cls.monomer_type.by_code3[code3.upper()]) |
me@260
|
119 |
me@260
|
120 @classmethod |
me@260
|
121 def from_name(cls, name): |
me@260
|
122 return cls(cls.monomer_type.by_name[name.capitalize()]) |
me@260
|
123 |
me@260
|
124 def __eq__(self, other): |
me@260
|
125 if hasattr(other, "type"): |
me@260
|
126 return self.type is other.type |
me@260
|
127 return self.type is other |
bnagaev@239
|
128 |
bnagaev@239
|
129 class Sequence(list): |
bnagaev@243
|
130 """ Sequence of Monomers |
bnagaev@243
|
131 |
me@262
|
132 list of monomer objects (aminoacids or nucleotides) |
bnagaev@243
|
133 |
bnagaev@243
|
134 Mandatory data: |
me@266
|
135 |
bnagaev@243
|
136 * name -- str with the name of sequence |
bnagaev@243
|
137 * description -- str with description of the sequence |
me@262
|
138 |
me@262
|
139 Optional (may be empty): |
me@266
|
140 |
me@262
|
141 * source -- source of sequence |
me@262
|
142 * pdb_chain -- Bio.PDB.Chain |
me@262
|
143 * pdb_file -- file object |
me@262
|
144 |
me@262
|
145 * pdb_residues -- {Monomer: Bio.PDB.Residue} |
me@262
|
146 * pdb_secstr -- {Monomer: 'Secondary structure'} |
me@262
|
147 Code Secondary structure |
me@262
|
148 H alpha-helix |
me@262
|
149 B Isolated beta-bridge residue |
me@262
|
150 E Strand |
me@262
|
151 G 3-10 helix |
me@262
|
152 I pi-helix |
me@262
|
153 T Turn |
me@262
|
154 S Bend |
me@262
|
155 - Other |
me@262
|
156 |
me@262
|
157 |
me@262
|
158 ?TODO: global pdb_structures |
bnagaev@243
|
159 """ |
me@262
|
160 def __init__(self, monomers=None, name='', description=""): |
me@262
|
161 if not monomers: |
me@262
|
162 monomers = [] |
me@262
|
163 self.name = name |
me@262
|
164 self.description = description |
me@262
|
165 self.monomers = monomers |
me@262
|
166 self.pdb_chains = [] |
me@262
|
167 self.pdb_files = {} |
me@262
|
168 self.pdb_residues = {} |
me@262
|
169 self.pdb_secstr = {} |
me@262
|
170 |
me@262
|
171 def __len__(self): |
me@262
|
172 return len(self.monomers) |
me@262
|
173 |
me@262
|
174 def __str__(self): |
me@262
|
175 """ Returns sequence in one-letter code """ |
me@262
|
176 return ''.join([monomer.type.code1 for monomer in self.monomers]) |
me@262
|
177 |
me@262
|
178 def __eq__(self, other): |
me@262
|
179 """ Returns if all corresponding monomers of this sequences are equal |
me@262
|
180 |
me@262
|
181 If lengths of sequences are not equal, returns False |
me@262
|
182 """ |
me@262
|
183 return len(self) == len(other) and \ |
me@262
|
184 all([a==b for a, b in zip(self.monomers, other.monomers)]) |
me@262
|
185 |
me@262
|
186 def __ne__(self, other): |
me@262
|
187 return not (self == other) |
me@262
|
188 |
me@262
|
189 def set_pdb_chain(self, pdb_file, pdb_id, pdb_chain, pdb_model=0): |
me@262
|
190 """ Reads Pdb chain from file |
me@262
|
191 |
me@262
|
192 and align each Monomer with PDB.Residue (TODO) |
me@262
|
193 """ |
me@262
|
194 name = std_id(pdb_id, pdb_chain, pdb_model) |
me@262
|
195 structure = get_structure(pdb_file, name) |
me@262
|
196 chain = structure[pdb_model][pdb_chain] |
me@262
|
197 self.pdb_chains.append(chain) |
me@262
|
198 self.pdb_residues[chain] = {} |
me@262
|
199 self.pdb_secstr[chain] = {} |
me@262
|
200 pdb_sequence = Sequence.from_pdb_chain(chain) |
me@262
|
201 a = alignment.Alignment.from_sequences(self, pdb_sequence) |
me@262
|
202 a.muscle_align() |
me@262
|
203 for monomer, pdb_monomer in a.column(sequence=pdb_sequence, original=self): |
me@262
|
204 if pdb_sequence.pdb_has(chain, pdb_monomer): |
me@262
|
205 residue = pdb_sequence.pdb_residues[chain][pdb_monomer] |
me@262
|
206 self.pdb_residues[chain][monomer] = residue |
me@262
|
207 self.pdb_files[chain] = pdb_file |
me@262
|
208 |
me@262
|
209 def pdb_unload(self): |
me@262
|
210 """ Delete all pdb-connected links """ |
me@262
|
211 #~ gc.get_referrers(self.pdb_chains[0]) |
me@262
|
212 self.pdb_chains = [] |
me@262
|
213 self.pdb_residues = {} |
me@262
|
214 self.pdb_secstr = {} # FIXME |
me@262
|
215 self.pdb_files = {} # FIXME |
me@262
|
216 |
me@262
|
217 @staticmethod |
me@262
|
218 def from_str(fasta_str, name='', description='', monomer_kind=AminoAcidType): |
me@262
|
219 """ Import data from one-letter code |
me@262
|
220 |
me@262
|
221 monomer_kind is class, inherited from MonomerType |
me@262
|
222 """ |
me@262
|
223 monomers = [monomer_kind.from_code1(aa).instance() for aa in fasta_str] |
me@262
|
224 return Sequence(monomers, name, description) |
me@262
|
225 |
me@262
|
226 @staticmethod |
me@262
|
227 def from_pdb_chain(chain): |
me@262
|
228 """ Returns Sequence with Monomers with link to Bio.PDB.Residue |
me@262
|
229 |
me@262
|
230 chain is Bio.PDB.Chain |
me@262
|
231 """ |
me@262
|
232 cappbuilder = CaPPBuilder() |
me@262
|
233 peptides = cappbuilder.build_peptides(chain) |
me@262
|
234 sequence = Sequence() |
me@262
|
235 sequence.pdb_chains = [chain] |
me@262
|
236 sequence.pdb_residues[chain] = {} |
me@262
|
237 sequence.pdb_secstr[chain] = {} |
me@262
|
238 for peptide in peptides: |
me@262
|
239 for ca_atom in peptide.get_ca_list(): |
me@262
|
240 residue = ca_atom.get_parent() |
me@262
|
241 monomer = AminoAcidType.from_pdb_residue(residue).instance() |
me@262
|
242 sequence.pdb_residues[chain][monomer] = residue |
me@262
|
243 sequence.monomers.append(monomer) |
me@262
|
244 return sequence |
me@262
|
245 |
me@262
|
246 def pdb_auto_add(self, conformity_info=None, pdb_directory='./tmp'): |
me@262
|
247 """ Adds pdb information to each monomer |
me@262
|
248 |
me@262
|
249 Returns if information has been successfully added |
me@262
|
250 TODO: conformity_file |
me@262
|
251 |
me@262
|
252 id-format lava flow |
me@262
|
253 """ |
me@262
|
254 if not conformity_info: |
me@262
|
255 path = os.path.join(pdb_directory, self.name) |
me@262
|
256 if os.path.exists(path) and os.path.getsize(path): |
me@262
|
257 match = pdb_id_parse(self.name) |
me@262
|
258 self.pdb_chain_add(open(path), match['code'], |
me@262
|
259 match['chain'], match['model']) |
me@262
|
260 else: |
me@262
|
261 match = pdb_id_parse(self.name) |
me@262
|
262 if match: |
me@262
|
263 code = match['code'] |
me@262
|
264 pdb_filename = config.pdb_dir % code |
me@262
|
265 if not os.path.exists(pdb_filename) or not os.path.getsize(pdb_filename): |
me@262
|
266 url = config.pdb_url % code |
me@262
|
267 print "Download %s" % url |
me@262
|
268 pdb_file = open(pdb_filename, 'w') |
me@262
|
269 data = urllib2.urlopen(url).read() |
me@262
|
270 pdb_file.write(data) |
me@262
|
271 pdb_file.close() |
me@262
|
272 print "Save %s" % pdb_filename |
me@262
|
273 pdb_file = open(pdb_filename) |
me@262
|
274 self.pdb_chain_add(pdb_file, code, match['chain'], match['model']) |
me@262
|
275 |
me@262
|
276 def pdb_save(self, out_filename, pdb_chain): |
me@262
|
277 """ Saves pdb_chain to out_file """ |
me@262
|
278 class GlySelect(Select): |
me@262
|
279 def accept_chain(self, chain): |
me@262
|
280 if chain == pdb_chain: |
me@262
|
281 return 1 |
me@262
|
282 else: |
me@262
|
283 return 0 |
me@262
|
284 io = PDBIO() |
me@262
|
285 structure = chain.get_parent() |
me@262
|
286 io.set_structure(structure) |
me@262
|
287 io.save(out_filename, GlySelect()) |
me@262
|
288 |
me@262
|
289 |
me@262
|
290 def pdb_add_sec_str(self, pdb_chain): |
me@262
|
291 """ Add secondary structure data """ |
me@262
|
292 tmp_file = NamedTemporaryFile(delete=False) |
me@262
|
293 tmp_file.close() |
me@262
|
294 pdb_file = self.pdb_files[pdb_chain].name |
me@262
|
295 os.system("dsspcmbi %(pdb)s %(tmp)s" % {'pdb': pdb_file, 'tmp': tmp_file.name}) |
me@262
|
296 dssp, keys = make_dssp_dict(tmp_file.name) |
me@262
|
297 for monomer in self.monomers: |
me@262
|
298 if self.pdb_has(pdb_chain, monomer): |
me@262
|
299 residue = self.pdb_residues[pdb_chain][monomer] |
me@262
|
300 try: |
me@262
|
301 d = dssp[(pdb_chain.get_id(), residue.get_id())] |
me@262
|
302 self.pdb_secstr[pdb_chain][monomer] = d[1] |
me@262
|
303 except: |
me@262
|
304 print "No dssp information about %s at %s" % (monomer, pdb_chain) |
me@262
|
305 os.unlink(tmp_file.name) |
me@262
|
306 |
me@262
|
307 def pdb_has(self, chain, monomer): |
me@262
|
308 return chain in self.pdb_residues and monomer in self.pdb_residues[chain] |
me@262
|
309 |
me@262
|
310 def secstr_has(self, chain, monomer): |
me@262
|
311 return chain in self.pdb_secstr and monomer in self.pdb_secstr[chain] |
me@262
|
312 |
me@262
|
313 @staticmethod |
me@262
|
314 def file_slice(file, n_from, n_to, fasta_name='', name='', description='', monomer_kind=AminoAcidType): |
me@262
|
315 """ Build and return sequence, consisting of part of sequence from file |
me@262
|
316 |
me@262
|
317 Does not control gaps |
me@262
|
318 """ |
me@262
|
319 inside = False |
me@262
|
320 number_used = 0 |
me@262
|
321 s = '' |
me@262
|
322 for line in file: |
me@262
|
323 line = line.split() |
me@262
|
324 if not inside: |
me@262
|
325 if line.startswith('>%s' % fasta_name): |
me@262
|
326 inside = True |
me@262
|
327 else: |
me@262
|
328 n = len(line) |
me@262
|
329 s += line[(n_from - number_user):(n_to - number_user)] |
me@262
|
330 return Sequence.from_str(s, name, description, monomer_kind) |
bnagaev@243
|
331 |
bnagaev@249
|
332 class Alignment(dict): |
bnagaev@249
|
333 """ Alignment |
bnagaev@249
|
334 |
bnagaev@249
|
335 {<Sequence object>:[<Monomer object>,None,<Monomer object>]} |
me@266
|
336 |
bnagaev@249
|
337 keys are the Sequence objects, values are the lists, which |
bnagaev@249
|
338 contain monomers of those sequences or None for gaps in the |
bnagaev@249
|
339 corresponding sequence of alignment |
bnagaev@249
|
340 """ |
bnagaev@249
|
341 # _sequences -- list of Sequence objects. Sequences don't contain gaps |
bnagaev@249
|
342 # - see sequence.py module |
bnagaev@249
|
343 |
bnagaev@249
|
344 def __init__(self, *args): |
bnagaev@249
|
345 """overloaded constructor |
bnagaev@249
|
346 |
bnagaev@249
|
347 Alignment() -> new empty Alignment |
bnagaev@249
|
348 Alignment(sequences, body) -> new Alignment with sequences and |
bnagaev@249
|
349 body initialized from arguments |
bnagaev@249
|
350 Alignment(fasta_file) -> new Alignment, read body and sequences |
bnagaev@249
|
351 from fasta file |
bnagaev@249
|
352 |
bnagaev@249
|
353 """ |
bnagaev@249
|
354 if len(args)>1:#overloaded constructor |
bnagaev@249
|
355 self.sequences=args[0] |
bnagaev@249
|
356 self.body=args[1] |
bnagaev@249
|
357 elif len(args)==0: |
bnagaev@249
|
358 self.sequences=[] |
bnagaev@249
|
359 self.body={} |
bnagaev@249
|
360 else: |
bnagaev@249
|
361 self.sequences, self.body = Alignment.from_fasta(args[0]) |
bnagaev@249
|
362 |
bnagaev@249
|
363 def length(self): |
bnagaev@249
|
364 """ Returns width, ie length of each sequence with gaps """ |
bnagaev@249
|
365 return max([len(line) for line in self.body.values()]) |
bnagaev@249
|
366 |
bnagaev@249
|
367 def height(self): |
bnagaev@249
|
368 """ The number of sequences in alignment (it's thickness). """ |
bnagaev@249
|
369 return len(self.body) |
bnagaev@249
|
370 |
bnagaev@249
|
371 def identity(self): |
bnagaev@249
|
372 """ Calculate the identity of alignment positions for colouring. |
bnagaev@249
|
373 |
bnagaev@249
|
374 For every (row, column) in alignment the percentage of the exactly |
bnagaev@249
|
375 same residue in the same column in the alignment is calculated. |
bnagaev@249
|
376 The data structure is just like the Alignment.body, but istead of |
bnagaev@249
|
377 monomers it contains float percentages. |
bnagaev@249
|
378 """ |
bnagaev@249
|
379 # Oh, God, that's awful! Absolutely not understandable. |
bnagaev@249
|
380 # First, calculate percentages of amino acids in every column |
bnagaev@249
|
381 contribution = 1.0 / len(self.sequences) |
bnagaev@249
|
382 all_columns = [] |
bnagaev@249
|
383 for position in range(len(self)): |
bnagaev@249
|
384 column_percentage = {} |
bnagaev@249
|
385 for seq in self.body: |
bnagaev@249
|
386 if self.body[seq][position] is not None: |
bnagaev@249
|
387 aa = self.body[seq][position].code |
bnagaev@249
|
388 else: |
bnagaev@249
|
389 aa = None |
bnagaev@249
|
390 if aa in allpy.data.amino_acids: |
bnagaev@249
|
391 if aa in column_percentage.keys(): |
bnagaev@249
|
392 column_percentage[aa] += contribution |
bnagaev@249
|
393 else: |
bnagaev@249
|
394 column_percentage[aa] = contribution |
bnagaev@249
|
395 all_columns.append(column_percentage) |
bnagaev@249
|
396 # Second, map these percentages onto the alignment |
bnagaev@249
|
397 self.identity_percentages = {} |
bnagaev@249
|
398 for seq in self.sequences: |
bnagaev@249
|
399 self.identity_percentages[seq] = [] |
bnagaev@249
|
400 for seq in self.identity_percentages: |
bnagaev@249
|
401 line = self.identity_percentages[seq] |
bnagaev@249
|
402 for position in range(len(self)): |
bnagaev@249
|
403 if self.body[seq][position] is not None: |
bnagaev@249
|
404 aa = self.body[seq][position].code |
bnagaev@249
|
405 else: |
bnagaev@249
|
406 aa = None |
bnagaev@249
|
407 line.append(all_columns[position].get(aa)) |
bnagaev@249
|
408 return self.identity_percentages |
bnagaev@249
|
409 |
bnagaev@249
|
410 @staticmethod |
bnagaev@249
|
411 def from_fasta(file, monomer_kind=AminoAcidType): |
bnagaev@249
|
412 """ Import data from fasta file |
bnagaev@249
|
413 |
bnagaev@249
|
414 monomer_kind is class, inherited from MonomerType |
bnagaev@249
|
415 |
bnagaev@249
|
416 >>> import alignment |
bnagaev@249
|
417 >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta")) |
bnagaev@249
|
418 """ |
bnagaev@249
|
419 import re |
bnagaev@249
|
420 |
bnagaev@249
|
421 sequences = [] |
bnagaev@249
|
422 body = {} |
bnagaev@249
|
423 |
bnagaev@249
|
424 raw_sequences = file.read().split(">") |
bnagaev@249
|
425 if len(raw_sequences) <= 1: |
bnagaev@249
|
426 raise Exception("Wrong format of fasta-file %s" % file.name) |
bnagaev@249
|
427 |
bnagaev@249
|
428 raw_sequences = raw_sequences[1:] #ignore everything before the first > |
bnagaev@249
|
429 for raw in raw_sequences: |
bnagaev@249
|
430 parsed_raw_sequence = raw.split("\n") |
bnagaev@249
|
431 parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence] |
bnagaev@249
|
432 name_and_description = parsed_raw_sequence[0] |
bnagaev@249
|
433 name_and_description = name_and_description.split(" ",1) |
bnagaev@249
|
434 if len(name_and_description) == 2: |
bnagaev@249
|
435 name, description = name_and_description |
bnagaev@249
|
436 elif len(name_and_description) == 1: |
bnagaev@249
|
437 #if there is description |
bnagaev@249
|
438 name = name_and_description[0] |
bnagaev@249
|
439 description = '' |
bnagaev@249
|
440 else: |
bnagaev@249
|
441 raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \ |
bnagaev@249
|
442 {'name': name, 'file': file.name}) |
bnagaev@249
|
443 |
bnagaev@249
|
444 if len(parsed_raw_sequence) <= 1: |
bnagaev@249
|
445 raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \ |
bnagaev@249
|
446 {'name': name, 'file': file.name}) |
bnagaev@249
|
447 string = "" |
bnagaev@249
|
448 for piece in parsed_raw_sequence[1:]: |
bnagaev@249
|
449 piece_without_whitespace_chars = re.sub("\s", "", piece) |
bnagaev@249
|
450 string += piece_without_whitespace_chars |
bnagaev@249
|
451 monomers = [] #convert into Monomer objects |
bnagaev@249
|
452 body_list = [] #create the respective list in body dict |
bnagaev@249
|
453 for current_monomer in string: |
bnagaev@249
|
454 if current_monomer not in ["-", ".", "~"]: |
bnagaev@249
|
455 monomers.append(monomer_kind.from_code1(current_monomer).instance()) |
bnagaev@249
|
456 body_list.append(monomers[-1]) |
bnagaev@249
|
457 else: |
bnagaev@249
|
458 body_list.append(None) |
bnagaev@249
|
459 s = sequence.Sequence(monomers, name, description) |
bnagaev@249
|
460 sequences.append(s) |
bnagaev@249
|
461 body[s] = body_list |
bnagaev@249
|
462 return sequences, body |
bnagaev@249
|
463 |
bnagaev@249
|
464 @staticmethod |
bnagaev@249
|
465 def from_sequences(*sequences): |
bnagaev@249
|
466 """ Constructs new alignment from sequences |
bnagaev@249
|
467 |
bnagaev@249
|
468 Add None's to right end to make equal lengthes of alignment sequences |
bnagaev@249
|
469 """ |
bnagaev@249
|
470 alignment = Alignment() |
bnagaev@249
|
471 alignment.sequences = sequences |
bnagaev@249
|
472 max_length = max(len(sequence) for sequence in sequences) |
bnagaev@249
|
473 for sequence in sequences: |
bnagaev@249
|
474 gaps_count = max_length - len(sequence) |
bnagaev@249
|
475 alignment.body[sequence] = sequence.monomers + [None] * gaps_count |
bnagaev@249
|
476 return alignment |
bnagaev@249
|
477 |
bnagaev@249
|
478 def save_fasta(self, out_file, long_line=70, gap='-'): |
bnagaev@249
|
479 """ Saves alignment to given file |
bnagaev@249
|
480 |
bnagaev@249
|
481 Splits long lines to substrings of length=long_line |
bnagaev@249
|
482 To prevent this, set long_line=None |
bnagaev@249
|
483 """ |
bnagaev@249
|
484 block.Block(self).save_fasta(out_file, long_line=long_line, gap=gap) |
bnagaev@249
|
485 |
bnagaev@249
|
486 def muscle_align(self): |
bnagaev@249
|
487 """ Simple align ths alignment using sequences (muscle) |
bnagaev@249
|
488 |
bnagaev@249
|
489 uses old Monomers and Sequences objects |
bnagaev@249
|
490 """ |
bnagaev@249
|
491 tmp_file = NamedTemporaryFile(delete=False) |
bnagaev@249
|
492 self.save_fasta(tmp_file) |
bnagaev@249
|
493 tmp_file.close() |
bnagaev@249
|
494 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_file.name}) |
bnagaev@249
|
495 sequences, body = Alignment.from_fasta(open(tmp_file.name)) |
bnagaev@249
|
496 for sequence in self.sequences: |
bnagaev@249
|
497 try: |
bnagaev@249
|
498 new_sequence = [i for i in sequences if sequence==i][0] |
bnagaev@249
|
499 except: |
bnagaev@249
|
500 raise Exception("Align: Cann't find sequence %s in muscle output" % \ |
bnagaev@249
|
501 sequence.name) |
bnagaev@249
|
502 old_monomers = iter(sequence.monomers) |
bnagaev@249
|
503 self.body[sequence] = [] |
bnagaev@249
|
504 for monomer in body[new_sequence]: |
bnagaev@249
|
505 if not monomer: |
bnagaev@249
|
506 self.body[sequence].append(monomer) |
bnagaev@249
|
507 else: |
bnagaev@249
|
508 old_monomer = old_monomers.next() |
bnagaev@249
|
509 if monomer != old_monomer: |
bnagaev@249
|
510 raise Exception("Align: alignment errors") |
bnagaev@249
|
511 self.body[sequence].append(old_monomer) |
bnagaev@249
|
512 os.unlink(tmp_file.name) |
bnagaev@249
|
513 |
bnagaev@249
|
514 def column(self, sequence=None, sequences=None, original=None): |
bnagaev@249
|
515 """ returns list of columns of alignment |
bnagaev@249
|
516 |
bnagaev@249
|
517 sequence or sequences: |
bnagaev@249
|
518 if sequence is given, then column is (original_monomer, monomer) |
bnagaev@249
|
519 if sequences is given, then column is (original_monomer, {sequence: monomer}) |
bnagaev@249
|
520 if both of them are given, it is an error |
bnagaev@249
|
521 original (Sequence type): |
bnagaev@249
|
522 if given, this filters only columns represented by original sequence |
bnagaev@249
|
523 """ |
bnagaev@249
|
524 if sequence and sequences: |
bnagaev@249
|
525 raise Exception("Wrong usage. read help") |
bnagaev@249
|
526 indexes = dict([(v, k) for( k, v) in enumerate(self.sequences)]) |
bnagaev@249
|
527 alignment = self.body.items() |
bnagaev@249
|
528 alignment.sort(key=lambda i: indexes[i[0]]) |
bnagaev@249
|
529 alignment = [monomers for seq, monomers in alignment] |
bnagaev@249
|
530 for column in zip(*alignment): |
bnagaev@249
|
531 if not original or column[indexes[original]]: |
bnagaev@249
|
532 if sequence: |
bnagaev@249
|
533 yield (column[indexes[original]], column[indexes[sequence]]) |
bnagaev@249
|
534 else: |
bnagaev@249
|
535 yield (column[indexes[original]], |
bnagaev@249
|
536 dict([(s, column[indexes[s]]) for s in sequences])) |
bnagaev@249
|
537 |
bnagaev@249
|
538 def secstr(self, sequence, pdb_chain, gap='-'): |
bnagaev@249
|
539 """ Returns string representing secondary structure """ |
bnagaev@249
|
540 return ''.join([ |
bnagaev@249
|
541 (sequence.pdb_secstr[pdb_chain][m] if sequence.secstr_has(pdb_chain, m) else gap) |
bnagaev@249
|
542 for m in self.body[sequence]]) |
bnagaev@249
|
543 |
bnagaev@249
|
544 class Block(object): |
me@261
|
545 """ Block of alignment |
me@261
|
546 |
me@261
|
547 Mandatory data: |
me@266
|
548 |
me@261
|
549 * self.alignment -- alignment object, which the block belongs to |
me@261
|
550 * self.sequences - set of sequence objects that contain monomers |
me@261
|
551 and/or gaps, that constitute the block |
me@261
|
552 * self.positions -- list of positions of the alignment.body that |
me@261
|
553 are included in the block; position[i+1] is always to the right from position[i] |
me@261
|
554 |
me@261
|
555 Don't change self.sequences -- it may be a link to other block.sequences |
me@261
|
556 |
me@261
|
557 How to create a new block: |
me@261
|
558 >>> import alignment |
me@261
|
559 >>> import block |
me@261
|
560 >>> proj = alignment.Alignment(open("test.fasta")) |
me@261
|
561 >>> block1 = block.Block(proj) |
me@261
|
562 """ |
me@261
|
563 |
me@261
|
564 def __init__(self, alignment, sequences=None, positions=None): |
me@261
|
565 """ Builds new block from alignment |
me@261
|
566 |
me@261
|
567 if sequences==None, all sequences are used |
me@261
|
568 if positions==None, all positions are used |
me@261
|
569 """ |
me@261
|
570 if sequences == None: |
me@261
|
571 sequences = set(alignment.sequences) # copy |
me@261
|
572 if positions == None: |
me@261
|
573 positions = range(len(alignment)) |
me@261
|
574 self.alignment = alignment |
me@261
|
575 self.sequences = sequences |
me@261
|
576 self.positions = positions |
me@261
|
577 |
me@261
|
578 def save_fasta(self, out_file, long_line=70, gap='-'): |
me@261
|
579 """ Saves alignment to given file in fasta-format |
me@261
|
580 |
me@261
|
581 No changes in the names, descriptions or order of the sequences |
me@261
|
582 are made. |
me@261
|
583 """ |
me@261
|
584 for sequence in self.sequences: |
me@261
|
585 alignment_monomers = self.alignment.body[sequence] |
me@261
|
586 block_monomers = [alignment_monomers[i] for i in self.positions] |
me@261
|
587 string = ''.join([m.type.code1 if m else '-' for m in block_monomers]) |
me@261
|
588 save_fasta(out_file, string, sequence.name, sequence.description, long_line) |
me@261
|
589 |
me@261
|
590 def geometrical_cores(self, max_delta=config.delta, |
me@261
|
591 timeout=config.timeout, minsize=config.minsize, |
me@261
|
592 ac_new_atoms=config.ac_new_atoms, |
me@261
|
593 ac_count=config.ac_count): |
me@261
|
594 """ Returns length-sorted list of blocks, representing GCs |
me@261
|
595 |
me@261
|
596 max_delta -- threshold of distance spreading |
me@261
|
597 timeout -- Bron-Kerbosh timeout (then fast O(n ln n) algorithm) |
me@261
|
598 minsize -- min size of each core |
me@261
|
599 ac_new_atoms -- min part or new atoms in new alternative core |
me@261
|
600 current GC is compared with each of already selected GCs |
me@261
|
601 if difference is less then ac_new_atoms, current GC is skipped |
me@261
|
602 difference = part of new atoms in current core |
me@261
|
603 ac_count -- max number of cores (including main core) |
me@261
|
604 -1 means infinity |
me@261
|
605 If more than one pdb chain for some sequence provided, consider all of them |
me@261
|
606 cost is calculated as 1 / (delta + 1) |
me@261
|
607 delta in [0, +inf) => cost in (0, 1] |
me@261
|
608 """ |
me@261
|
609 nodes = self.positions |
me@261
|
610 lines = {} |
me@261
|
611 for i in self.positions: |
me@261
|
612 for j in self.positions: |
me@261
|
613 if i < j: |
me@261
|
614 distances = [] |
me@261
|
615 for sequence in self.sequences: |
me@261
|
616 for chain in sequence.pdb_chains: |
me@261
|
617 m1 = self.alignment.body[sequence][i] |
me@261
|
618 m2 = self.alignment.body[sequence][j] |
me@261
|
619 if m1 and m2: |
me@261
|
620 r1 = sequence.pdb_residues[chain][m1] |
me@261
|
621 r2 = sequence.pdb_residues[chain][m2] |
me@261
|
622 ca1 = r1['CA'] |
me@261
|
623 ca2 = r2['CA'] |
me@261
|
624 d = ca1 - ca2 # Bio.PDB feature |
me@261
|
625 distances.append(d) |
me@261
|
626 if len(distances) >= 2: |
me@261
|
627 delta = max(distances) - min(distances) |
me@261
|
628 if delta <= max_delta: |
me@261
|
629 lines[Graph.line(i, j)] = 1.0 / (1.0 + max_delta) |
me@261
|
630 graph = Graph(nodes, lines) |
me@261
|
631 cliques = graph.cliques(timeout=timeout, minsize=minsize) |
me@261
|
632 GCs = [] |
me@261
|
633 for clique in cliques: |
me@261
|
634 for GC in GCs: |
me@261
|
635 if len(clique - set(GC.positions)) < ac_new_atoms * len(clique): |
me@261
|
636 break |
me@261
|
637 else: |
me@261
|
638 GCs.append(Block(self.alignment, self.sequences, clique)) |
me@261
|
639 if ac_count != -1 and len(GCs) >= ac_count: |
me@261
|
640 break |
me@261
|
641 return GCs |
me@261
|
642 |
me@261
|
643 def xstring(self, x='X', gap='-'): |
me@261
|
644 """ Returns string consisting of gap chars and chars x at self.positions |
me@261
|
645 |
me@261
|
646 Length of returning string = length of alignment |
me@261
|
647 """ |
me@261
|
648 monomers = [False] * len(self.alignment) |
me@261
|
649 for i in self.positions: |
me@261
|
650 monomers[i] = True |
me@261
|
651 return ''.join([x if m else gap for m in monomers]) |
me@261
|
652 |
me@261
|
653 def save_xstring(self, out_file, name, description='', x='X', gap='-', long_line=70): |
me@261
|
654 """ Save xstring and name in fasta format """ |
me@261
|
655 save_fasta(out_file, self.xstring(x=x, gap=gap), name, description, long_line) |
me@261
|
656 |
me@261
|
657 def monomers(self, sequence): |
me@261
|
658 """ Iterates monomers of this sequence from this block """ |
me@261
|
659 alignment_sequence = self.alignment.body[sequence] |
me@261
|
660 return (alignment_sequence[i] for i in self.positions) |
me@261
|
661 |
me@261
|
662 def ca_atoms(self, sequence, pdb_chain): |
me@261
|
663 """ Iterates Ca-atom of monomers of this sequence from this block """ |
me@261
|
664 return (sequence.pdb_residues[pdb_chain][monomer] for monomer in self.monomers()) |
me@261
|
665 |
me@261
|
666 def sequences_chains(self): |
me@261
|
667 """ Iterates pairs (sequence, chain) """ |
me@261
|
668 for sequence in self.alignment.sequences: |
me@261
|
669 if sequence in self.sequences: |
me@261
|
670 for chain in sequence.pdb_chains: |
me@261
|
671 yield (sequence, chain) |
me@261
|
672 |
me@261
|
673 def superimpose(self): |
me@261
|
674 """ Superimpose all pdb_chains in this block """ |
me@261
|
675 sequences_chains = list(self.sequences_chains()) |
me@261
|
676 if len(sequences_chains) >= 1: |
me@261
|
677 sup = Superimposer() |
me@261
|
678 fixed_sequence, fixed_chain = sequences_chains.pop() |
me@261
|
679 fixed_atoms = self.ca_atoms(fixed_sequence, fixed_chain) |
me@261
|
680 for sequence, chain in sequences_chains: |
me@261
|
681 moving_atoms = self.ca_atoms(sequence, chain) |
me@261
|
682 sup.set_atoms(fixed_atoms, moving_atoms) |
me@261
|
683 # Apply rotation/translation to the moving atoms |
me@261
|
684 sup.apply(moving_atoms) |
me@261
|
685 |
me@261
|
686 def pdb_save(self, out_file): |
me@261
|
687 """ Save all sequences |
me@261
|
688 |
me@261
|
689 Returns {(sequence, chain): CHAIN} |
me@261
|
690 CHAIN is chain letter in new file |
me@261
|
691 """ |
me@261
|
692 tmp_file = NamedTemporaryFile(delete=False) |
me@261
|
693 tmp_file.close() |
me@261
|
694 |
me@261
|
695 for sequence, chain in self.sequences_chains(): |
me@261
|
696 sequence.pdb_save(tmp_file.name, chain) |
me@261
|
697 # TODO: read from tmp_file.name |
me@261
|
698 # change CHAIN |
me@261
|
699 # add to out_file |
me@261
|
700 |
me@261
|
701 os.unlink(NamedTemporaryFile) |
bnagaev@239
|
702 |
me@260
|
703 # vim: set ts=4 sts=4 sw=4 et: |