rev |
line source |
me@261
|
1 import sys |
me@261
|
2 |
me@315
|
3 import util |
me@284
|
4 import fasta |
me@260
|
5 import data.codes |
me@260
|
6 |
me@306
|
7 default_gaps = set((".", "-", "~")) |
me@306
|
8 """Set of characters to recoginze as gaps when parsing alignment.""" |
me@306
|
9 |
me@260
|
10 class MonomerType(object): |
me@260
|
11 """Class of monomer types. |
me@260
|
12 |
me@260
|
13 Each MonomerType object represents a known monomer type, e.g. Valine, |
me@260
|
14 and is referenced to by each instance of monomer in a given sequence. |
me@260
|
15 |
me@260
|
16 - `name`: full name of monomer type |
me@260
|
17 - `code1`: one-letter code |
me@260
|
18 - `code3`: three-letter code |
me@260
|
19 - `is_modified`: either of True or False |
me@260
|
20 |
me@260
|
21 class atributes: |
me@260
|
22 |
me@260
|
23 - `by_code1`: a mapping from one-letter code to MonomerType object |
me@260
|
24 - `by_code3`: a mapping from three-letter code to MonomerType object |
me@260
|
25 - `by_name`: a mapping from monomer name to MonomerType object |
me@260
|
26 - `instance_type`: class of Monomer objects to use when creating new |
me@260
|
27 objects; this must be redefined in descendent classes |
me@260
|
28 |
me@260
|
29 All of the class attributes MUST be redefined when subclassing. |
me@260
|
30 """ |
me@260
|
31 |
me@260
|
32 by_code1 = {} |
me@260
|
33 by_code3 = {} |
me@260
|
34 by_name = {} |
me@260
|
35 instance_type = None |
me@260
|
36 |
me@260
|
37 def __init__(self, name="", code1="", code3="", is_modified=False): |
me@310
|
38 super(MonomerType, self).__init__() |
me@260
|
39 self.name = name.capitalize() |
me@260
|
40 self.code1 = code1.upper() |
me@260
|
41 self.code3 = code3.upper() |
me@260
|
42 self.is_modified = bool(is_modified) |
me@260
|
43 if not is_modified: |
me@260
|
44 self.by_code1[self.code1] = self |
me@260
|
45 self.by_code3[code3] = self |
me@260
|
46 self.by_name[name] = self |
me@260
|
47 # We duplicate distinguished long names into MonomerType itself, |
me@260
|
48 # so that we can use MonomerType.from_code3 to create the relevant |
me@260
|
49 # type of monomer. |
me@260
|
50 MonomerType.by_code3[code3] = self |
me@260
|
51 MonomerType.by_name[name] = self |
me@260
|
52 |
me@260
|
53 @classmethod |
me@260
|
54 def _initialize(cls, type_letter, codes=data.codes.codes): |
me@260
|
55 """Create all relevant instances of MonomerType. |
me@260
|
56 |
me@260
|
57 `type_letter` is either of: |
me@260
|
58 |
me@260
|
59 - 'p' for protein |
me@260
|
60 - 'd' for DNA |
me@260
|
61 - 'r' for RNA |
me@260
|
62 |
me@260
|
63 `codes` is a table of monomer codes |
me@260
|
64 """ |
me@260
|
65 for type, code1, is_modified, code3, name in codes: |
me@260
|
66 if type == type_letter: |
me@260
|
67 cls(name, code1, code3, is_modified) |
me@260
|
68 |
me@260
|
69 @classmethod |
me@260
|
70 def from_code1(cls, code1): |
me@260
|
71 """Return monomer type by one-letter code.""" |
me@260
|
72 return cls.by_code1[code1.upper()] |
me@260
|
73 |
me@260
|
74 @classmethod |
me@260
|
75 def from_code3(cls, code3): |
me@260
|
76 """Return monomer type by three-letter code.""" |
me@260
|
77 return cls.by_code3[code3.upper()] |
me@260
|
78 |
me@260
|
79 @classmethod |
me@260
|
80 def from_name(cls, name): |
me@260
|
81 """Return monomer type by name.""" |
me@260
|
82 return cls.by_name[name.capitalize()] |
me@260
|
83 |
me@260
|
84 def instance(self): |
me@260
|
85 """Create a new monomer of given type.""" |
me@260
|
86 return self.instance_type(self) |
me@260
|
87 |
me@260
|
88 def __eq__(self, other): |
me@260
|
89 if hasattr(other, "type"): |
me@260
|
90 return self is other.type |
me@260
|
91 return self is other |
me@260
|
92 |
me@260
|
93 class Monomer(object): |
me@260
|
94 """Monomer object. |
me@260
|
95 |
me@260
|
96 attributes: |
me@260
|
97 |
me@260
|
98 - `type`: type of monomer (a MonomerType object) |
me@260
|
99 |
me@282
|
100 class attributes: |
me@282
|
101 |
me@282
|
102 - `monomer_type`: either MonomerType or one of it's subclasses, it is used |
me@282
|
103 when creating new monomers. It SHOULD be redefined when subclassing |
me@282
|
104 Monomer. |
me@260
|
105 """ |
me@260
|
106 monomer_type = MonomerType |
me@260
|
107 |
me@260
|
108 def __init__(self, type): |
me@310
|
109 super(Monomer, self).__init__() |
me@260
|
110 self.type = type |
me@260
|
111 |
me@260
|
112 @classmethod |
me@260
|
113 def from_code1(cls, code1): |
me@260
|
114 return cls(cls.monomer_type.by_code1[code1.upper()]) |
me@260
|
115 |
me@260
|
116 @classmethod |
me@260
|
117 def from_code3(cls, code3): |
me@260
|
118 return cls(cls.monomer_type.by_code3[code3.upper()]) |
me@260
|
119 |
me@260
|
120 @classmethod |
me@260
|
121 def from_name(cls, name): |
me@260
|
122 return cls(cls.monomer_type.by_name[name.capitalize()]) |
me@260
|
123 |
me@322
|
124 @property |
me@322
|
125 def code1(self): |
me@322
|
126 return self.type.code1 |
me@322
|
127 |
me@322
|
128 @property |
me@322
|
129 def code3(self): |
me@322
|
130 return self.type.code3 |
me@322
|
131 |
me@322
|
132 @property |
me@322
|
133 def name(self): |
me@322
|
134 return self.type.name |
me@322
|
135 |
me@322
|
136 @property |
me@322
|
137 def is_modified(self): |
me@322
|
138 return self.type.is_modified |
me@322
|
139 |
me@260
|
140 def __eq__(self, other): |
me@260
|
141 if hasattr(other, "type"): |
me@260
|
142 return self.type is other.type |
me@260
|
143 return self.type is other |
bnagaev@239
|
144 |
bnagaev@239
|
145 class Sequence(list): |
me@274
|
146 """Sequence of Monomers. |
bnagaev@243
|
147 |
me@274
|
148 This behaves like list of monomer objects. In addition to standard list |
me@274
|
149 behaviour, Sequence has the following attributes: |
me@270
|
150 |
me@274
|
151 * name -- str with the name of the sequence |
me@274
|
152 * description -- str with description of the sequence |
me@274
|
153 * source -- str denoting source of the sequence |
me@266
|
154 |
me@274
|
155 Any of them may be empty (i.e. hold empty string) |
me@275
|
156 |
me@275
|
157 Class attributes: |
me@282
|
158 |
me@275
|
159 * monomer_type -- type of monomers in sequence, must be redefined when |
me@275
|
160 subclassing |
me@274
|
161 """ |
me@270
|
162 |
me@275
|
163 monomer_type = Monomer |
me@270
|
164 |
me@275
|
165 name = '' |
me@275
|
166 description = '' |
me@275
|
167 source = '' |
me@275
|
168 |
me@275
|
169 def __init__(self, sequence=[], name=None, description=None, source=None): |
me@275
|
170 super(Sequence, self).__init__(sequence) |
me@275
|
171 if hasattr(sequence, 'name'): |
me@275
|
172 vars(self).update(vars(sequence)) |
me@275
|
173 if name: |
me@275
|
174 self.name = name |
me@275
|
175 if description: |
me@275
|
176 self.description = description |
me@275
|
177 if source: |
me@275
|
178 self.source = source |
me@270
|
179 |
me@262
|
180 def __str__(self): |
me@275
|
181 """Returns sequence in one-letter code.""" |
me@275
|
182 return ''.join(monomer.code1 for monomer in self) |
me@270
|
183 |
me@316
|
184 def __hash__(self): |
me@316
|
185 """Hash sequence by identity.""" |
me@316
|
186 return id(self) |
me@316
|
187 |
me@273
|
188 @classmethod |
me@321
|
189 def from_string(cls, string, name='', description='', source=''): |
me@273
|
190 """Create sequences from string of one-letter codes.""" |
me@273
|
191 monomer = cls.monomer_type.from_code1 |
me@273
|
192 monomers = [monomer(letter) for letter in string] |
me@321
|
193 return cls(monomers, name, description, source) |
me@262
|
194 |
me@284
|
195 @classmethod |
me@284
|
196 def from_fasta(cls, file): |
me@284
|
197 """Read sequence from FASTA file. |
me@286
|
198 |
me@284
|
199 File must contain exactly one sequence. |
me@284
|
200 """ |
me@313
|
201 sequence, = fasta.parse_file(file) |
me@313
|
202 name, description, body = sequence |
me@321
|
203 return cls.from_string(body, name, description, file.name) |
me@284
|
204 |
me@295
|
205 class Alignment(object): |
me@295
|
206 """Alignment. It is a list of Columns.""" |
bnagaev@249
|
207 |
me@287
|
208 sequence_type = Sequence |
me@289
|
209 """Type of sequences in alignment. SHOULD be redefined when subclassing.""" |
me@288
|
210 |
me@289
|
211 sequences = None |
me@289
|
212 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!""" |
bnagaev@249
|
213 |
me@287
|
214 def __init__(self): |
me@287
|
215 """Initialize empty alignment.""" |
me@287
|
216 super(Alignment, self).__init__() |
me@287
|
217 self.sequences = [] |
me@295
|
218 self.columns = [] |
me@282
|
219 |
me@299
|
220 # Alignment modification methods |
me@299
|
221 # ============================== |
me@299
|
222 |
me@294
|
223 def append_sequence(self, sequence): |
me@294
|
224 """Add sequence to alignment. |
me@294
|
225 |
me@294
|
226 If sequence is too short, pad it with gaps on the right. |
me@294
|
227 """ |
me@294
|
228 self.sequences.append(sequence) |
me@294
|
229 for i, monomer in enumerate(sequence): |
me@302
|
230 self.column_at(i)[sequence] = monomer |
me@294
|
231 |
me@306
|
232 def append_gapped_line(self, line, name='', description='', source='', |
me@306
|
233 gaps=default_gaps): |
me@287
|
234 """Add row from a line of one-letter codes and gaps.""" |
me@313
|
235 Sequence = self.sequence_type |
me@306
|
236 not_gap = lambda (i, char): char not in gaps |
me@306
|
237 without_gaps = util.remove_each(line, gaps) |
me@321
|
238 sequence = Sequence.from_string(without_gaps, name, description, source) |
me@303
|
239 # The following line has some simple magic: |
me@303
|
240 # 1. attach natural numbers to monomers |
me@303
|
241 # 2. delete gaps |
me@303
|
242 # 3. attach numbers again |
me@303
|
243 # This way we have a pair of numbers attached to monomer: |
me@303
|
244 # - it's position in alignment (the first attached number, j) |
me@303
|
245 # - it's position in sequence (the second attached number, i) |
me@287
|
246 for i, (j, char) in enumerate(filter(not_gap, enumerate(line))): |
me@313
|
247 self.column_at(j)[sequence] = sequence[i] |
me@287
|
248 self.sequences.append(sequence) |
me@287
|
249 |
me@302
|
250 def column_at(self, n): |
me@302
|
251 """Return column by index. Create required new columns if required. |
me@302
|
252 |
me@302
|
253 Do NOT use this method, unless you are sure it is what you want. |
me@302
|
254 """ |
me@302
|
255 for i in range(len(self.columns), n + 1): |
me@302
|
256 self.columns.append(Column()) |
me@302
|
257 return self.columns[n] |
me@302
|
258 |
me@299
|
259 # Alignment IO methods |
me@299
|
260 # ==================== |
me@299
|
261 |
me@287
|
262 @classmethod |
me@306
|
263 def from_fasta(cls, file, gaps=default_gaps): |
me@287
|
264 """Create new alignment from FASTA file.""" |
me@287
|
265 self = cls() |
me@313
|
266 for (name, description, body) in fasta.parse_file(file): |
me@321
|
267 self.append_gapped_line(body, name, description, file.name, gaps) |
me@287
|
268 return self |
bnagaev@249
|
269 |
me@292
|
270 def to_fasta(self, file): |
me@292
|
271 """Write alignment in FASTA file as sequences with gaps.""" |
me@292
|
272 def char(monomer): |
me@292
|
273 if monomer: |
me@292
|
274 return monomer.code1 |
me@292
|
275 return "-" |
me@292
|
276 for row in self.rows_as_lists(): |
me@292
|
277 seq = row.sequence |
me@292
|
278 line = "".join(map(char, row)) |
me@292
|
279 fasta.save_file(file, line, seq.name, seq.description) |
me@292
|
280 |
me@299
|
281 # Data access methods for alignment |
me@299
|
282 # ================================= |
me@299
|
283 |
me@299
|
284 def rows(self): |
me@299
|
285 """Return list of rows (temporary objects) in alignment. |
me@299
|
286 |
me@299
|
287 Each row is a dictionary of { column : monomer }. |
me@299
|
288 |
me@299
|
289 For gap positions there is no key for the column in row. |
me@299
|
290 |
me@299
|
291 Each row has attribute `sequence` pointing to the sequence the row is |
me@299
|
292 describing. |
me@299
|
293 |
me@299
|
294 Modifications of row have no effect on the alignment. |
me@299
|
295 """ |
me@299
|
296 # For now, the function returns a list rather than iterator. |
me@299
|
297 # It is yet to see, whether memory performance here becomes critical, |
me@299
|
298 # or is random access useful. |
me@299
|
299 rows = [] |
me@299
|
300 for sequence in self.sequences: |
me@299
|
301 row = util.UserDict() |
me@299
|
302 row.sequence = sequence |
me@299
|
303 for column in self.columns: |
me@299
|
304 if sequence in column: |
me@299
|
305 row[column] = column[sequence] |
me@299
|
306 rows.append(row) |
me@299
|
307 return rows |
me@299
|
308 |
me@299
|
309 def rows_as_lists(self): |
me@299
|
310 """Return list of rows (temporary objects) in alignment. |
me@299
|
311 |
me@299
|
312 Each row here is a list of either monomer or None (for gaps). |
me@299
|
313 |
me@299
|
314 Each row has attribute `sequence` pointing to the sequence of row. |
me@299
|
315 |
me@299
|
316 Modifications of row have no effect on the alignment. |
me@299
|
317 """ |
me@299
|
318 rows = [] |
me@299
|
319 for sequence in self.sequences: |
me@299
|
320 row = util.UserList() |
me@299
|
321 row.sequence = sequence |
me@299
|
322 for column in self.columns: |
me@299
|
323 row.append(column.get(sequence)) |
me@299
|
324 rows.append(row) |
me@299
|
325 return rows |
me@299
|
326 |
me@299
|
327 def columns_as_lists(self): |
me@299
|
328 """Return list of columns (temorary objects) in alignment. |
me@299
|
329 |
me@299
|
330 Each column here is a list of either monomer or None (for gaps). |
me@299
|
331 |
me@299
|
332 Items of column are sorted in the same way as alignment.sequences. |
me@299
|
333 |
me@299
|
334 Modifications of column have no effect on the alignment. |
me@299
|
335 """ |
me@299
|
336 columns = [] |
me@299
|
337 for column in self.columns: |
me@299
|
338 col = [] |
me@299
|
339 for sequence in self.sequences: |
me@299
|
340 col.append(column.get(sequence)) |
me@299
|
341 columns.append(col) |
me@299
|
342 return columns |
me@299
|
343 |
me@300
|
344 class Column(dict): |
me@300
|
345 """Column of alignment. |
me@300
|
346 |
me@300
|
347 Column is a dict of { sequence : monomer }. |
me@300
|
348 |
me@300
|
349 For sequences that have gaps in current row, given key is not present in |
me@300
|
350 the column. |
me@300
|
351 """ |
me@300
|
352 pass |
me@300
|
353 |
me@317
|
354 class Block(Alignment): |
me@307
|
355 """Block of alignment. |
me@301
|
356 |
me@307
|
357 Block is intersection of a set of columns & a set of rows. Most of blocks |
me@307
|
358 look like rectangular part of alignment if you shuffle alignment rows the |
me@307
|
359 right way. |
me@261
|
360 """ |
me@270
|
361 |
me@307
|
362 alignment = None |
me@307
|
363 """Alignment the block belongs to.""" |
me@270
|
364 |
me@307
|
365 sequences = () |
me@307
|
366 """List of sequences in block.""" |
me@307
|
367 |
me@307
|
368 columns = () |
me@307
|
369 """List of columns in block.""" |
me@307
|
370 |
me@317
|
371 @classmethod |
me@317
|
372 def from_alignment(cls, alignment, sequences=None, columns=None): |
me@307
|
373 """Build new block from alignment. |
me@307
|
374 |
me@307
|
375 If sequences are not given, the block uses all sequences in alignment. |
me@307
|
376 |
me@307
|
377 If columns are not given, the block uses all columns in alignment. |
me@307
|
378 |
me@307
|
379 In both cases we use exactly the list used in alignment, thus, if new |
me@307
|
380 sequences or columns are added to alignment, the block tracks this too. |
me@261
|
381 """ |
me@307
|
382 if sequences is None: |
me@307
|
383 sequences = alignment.sequences |
me@318
|
384 if columns is None: |
me@307
|
385 columns = alignment.columns |
me@320
|
386 block = cls() |
me@320
|
387 block.alignment = alignment |
me@320
|
388 block.sequences = sequences |
me@320
|
389 block.columns = columns |
me@320
|
390 return block |
me@270
|
391 |
me@312
|
392 def flush_left(self): |
me@312
|
393 """Move all monomers to the left, gaps to the right within block.""" |
me@312
|
394 padding = [None] * len(self.columns) |
me@312
|
395 for row in self.rows_as_lists(): |
me@312
|
396 sequence = row.sequence |
me@312
|
397 row = filter(None, row) + padding |
me@312
|
398 for monomer, column in zip(row, self.columns): |
me@312
|
399 if monomer: |
me@312
|
400 column[sequence] = monomer |
me@312
|
401 elif sequence in column: |
me@312
|
402 del column[sequence] |
me@312
|
403 |
me@312
|
404 |
me@260
|
405 # vim: set ts=4 sts=4 sw=4 et: |