Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/0e41dc9c60bc/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 21:57:39 2013
Кодировка:
allpy: 0e41dc9c60bc allpy/base.py

allpy

view allpy/base.py @ 694:0e41dc9c60bc

homology endowed with column_number in all methods and passed through tests
author Andrei <aba@belozersky.msu.ru>
date Tue, 05 Jul 2011 21:54:10 +0400
parents 0f3a4e3e2c4f
children f8518f3822b8
line source
1 import sys
2 import re
4 import util
5 import fileio
6 import data.monomers
8 # import this very module as means of having all related classes in one place
9 import base
11 default_gaps = set((".", "-", "~"))
12 """Set of characters to recoginze as gaps when parsing alignment."""
14 class Monomer(object):
15 """Monomer object."""
17 type = None
18 """Either of 'dna', 'rna', 'protein'."""
20 types = base
21 """Mapping of related types. SHOULD be redefined in subclasses."""
23 by_code1 = {}
24 """A mapping from 1-letter code to Monomer subclass."""
26 by_code3 = {}
27 """A mapping from 3-letter code to Monomer subclass."""
29 by_name = {}
30 """A mapping from full monomer name to Monomer subclass."""
32 @classmethod
33 def _subclass(cls, name='', code1='', code3='', is_modified=False):
34 """Create new subclass of Monomer for given monomer type."""
35 class TheMonomer(cls):
36 pass
37 name = name.strip().capitalize()
38 code1 = code1.upper()
39 code3 = code3.upper()
40 module = vars(data.monomers)[cls.type]
41 TheMonomer.__name__ = re.sub(r"\W", "_", name)
42 TheMonomer.__module__ = module.__name__
43 TheMonomer.name = name
44 TheMonomer.code1 = code1
45 TheMonomer.code3 = code3
46 TheMonomer.is_modified = is_modified
47 # Save the class in data.monomers so that it can be pickled
48 # Some names are not unique, we append underscores to them
49 # in order to fix it.
50 while TheMonomer.__name__ in vars(module):
51 TheMonomer.__name__ += "_"
52 vars(module)[TheMonomer.__name__] = TheMonomer
53 if not is_modified:
54 cls.by_code1[code1] = TheMonomer
55 cls.by_code3[code3] = TheMonomer
56 cls.by_name[name] = TheMonomer
57 # We duplicate distinguished long names into Monomer itself, so that we
58 # can use Monomer.from_code3 to create the relevant type of monomer.
59 Monomer.by_code3[code3] = TheMonomer
60 Monomer.by_name[name] = TheMonomer
62 @classmethod
63 def _initialize(cls, codes=None):
64 """Create all relevant subclasses of Monomer."""
65 for code1, is_modified, code3, name in codes:
66 cls._subclass(name, code1, code3, is_modified)
68 @classmethod
69 def from_code1(cls, code1):
70 """Create new monomer from 1-letter code."""
71 return cls.by_code1[code1.upper()]()
73 @classmethod
74 def from_code3(cls, code3):
75 """Create new monomer from 3-letter code."""
76 return cls.by_code3[code3.upper()]()
78 @classmethod
79 def from_name(cls, name):
80 """Create new monomer from full name."""
81 return cls.by_name[name.strip().capitalize()]()
83 def __repr__(self):
84 return "<Monomer %s>" % str(self.code1)
86 def __str__(self):
87 """Returns one-letter code"""
88 return self.code1
90 def __eq__(self, other):
91 """Monomers within same monomer type are compared by code1."""
92 if not other:
93 return False
94 assert self.type == other.type
95 return self.code1 == other.code1
97 def __ne__(self, other):
98 return not (self == other)
100 class Sequence(list):
101 """Sequence of Monomers.
103 This behaves like list of monomer objects. In addition to standard list
104 behaviour, Sequence has the following attributes:
106 * name -- str with the name of the sequence
107 * description -- str with description of the sequence
108 * source -- str denoting source of the sequence
110 Any of them may be empty (i.e. hold empty string)
111 """
113 types = base
114 """Mapping of related types. SHOULD be redefined in subclasses."""
116 name = ''
117 description = ''
118 source = ''
120 def __init__(self, *args):
121 self.markups = {}
122 list.__init__(self, *args)
124 @classmethod
125 def from_monomers(cls, monomers=[], name=None, description=None, source=None):
126 """Create sequence from a list of monomer objecst."""
127 result = cls(monomers)
128 if name:
129 result.name = name
130 if description:
131 result.description = description
132 if source:
133 result.source = source
134 return result
136 @classmethod
137 def from_string(cls, string, name='', description='', source=''):
138 """Create sequences from string of one-letter codes."""
139 monomer = cls.types.Monomer.from_code1
140 monomers = [monomer(letter) for letter in string]
141 return cls.from_monomers(monomers, name, description, source)
143 def __repr__(self):
144 if self.name:
145 return '<Sequence %s>' % str(self.name)
146 else:
147 return '<Sequence %s>' % str(self)
150 def __str__(self):
151 """Returns sequence of one-letter codes."""
152 return ''.join(monomer.code1 for monomer in self)
154 def __hash__(self):
155 """Hash sequence by identity."""
156 return id(self)
158 class Alignment(object):
159 """Alignment. It is a list of Columns."""
161 types = base
162 """Mapping of related types. SHOULD be redefined in subclasses."""
164 sequences = None
165 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""
167 def __init__(self):
168 """Initialize empty alignment."""
169 self.sequences = []
170 self.columns = []
171 self.markups = {}
173 # Alignment grow & IO methods
174 # ==============================
176 def append_sequence(self, sequence):
177 """Add sequence to alignment. Return self.
179 If sequence is too short, pad it with gaps on the right.
180 """
181 self.sequences.append(sequence)
182 self._pad_to_width(len(sequence))
183 for column, monomer in zip(self.columns, sequence):
184 column[sequence] = monomer
185 return self
187 def append_row_from_string(self, string,
188 name='', description='', source='', gaps=default_gaps):
189 """Add row from a string of one-letter codes and gaps. Return self."""
190 Sequence = self.types.Sequence
191 without_gaps = util.remove_each(string, gaps)
192 sequence = Sequence.from_string(without_gaps, name, description, source)
193 self._pad_to_width(len(string))
194 non_gap_columns = [column
195 for column, char in zip(self.columns, string)
196 if char not in gaps
198 for monomer, column in zip(sequence, non_gap_columns):
199 column[sequence] = monomer
200 self.sequences.append(sequence)
201 return self
203 def append_row_with_gaps(self, row, sequence):
204 """Add row from row_as_list representation and sequence. Return self."""
205 self.sequences.append(sequence)
206 self._pad_to_width(len(row))
207 for column, monomer in zip(self.columns, row):
208 if monomer:
209 column[sequence] = monomer
210 return self
212 def _pad_to_width(self, n):
213 """Pad alignment with empty columns on the right to width n."""
214 for i in range(len(self.columns), n):
215 self.columns.append(Column())
217 def append_file(self, file, format='fasta', gaps=default_gaps):
218 """Append sequences from file to alignment. Return self.
220 If sequences in file have gaps (detected as characters belonging to
221 `gaps` set), treat them accordingly.
222 """
223 sequences = []
224 io = fileio.File(file, format)
225 for name, description, body in io.read_strings():
226 self.append_row_from_string(body, name, description, file.name, gaps)
227 return self
229 def to_file(self, file, format='fasta', gap='-'):
230 """Write alignment in FASTA file as sequences with gaps."""
231 strings = [(s, s.sequence.name, s.sequence.description)
232 for s in self.rows_as_strings()]
233 fileio.File(file, format).write_strings(strings)
235 # Data access methods for alignment
236 # =================================
238 def rows(self):
239 """Return list of rows (temporary objects) in alignment.
241 Each row is a dictionary of { column : monomer }.
243 For gap positions there is no key for the column in row.
245 Each row has attribute `sequence` pointing to the sequence the row is
246 describing.
248 Modifications of row have no effect on the alignment.
249 """
250 # For now, the function returns a list rather than iterator.
251 # It is yet to see, whether memory performance here becomes critical,
252 # or is random access useful.
253 rows = []
254 for sequence in self.sequences:
255 row = util.UserDict()
256 row.sequence = sequence
257 for column in self.columns:
258 if sequence in column:
259 row[column] = column[sequence]
260 rows.append(row)
261 return rows
263 def rows_as_lists(self):
264 """Return list of rows (temporary objects) in alignment.
266 Each row here is a list of either monomer or None (for gaps).
268 Each row has attribute `sequence` pointing to the sequence of row.
270 Modifications of row have no effect on the alignment.
271 """
272 rows = []
273 for sequence in self.sequences:
274 row = util.UserList()
275 row.sequence = sequence
276 for column in self.columns:
277 row.append(column.get(sequence))
278 rows.append(row)
279 return rows
281 def rows_as_strings(self, gap='-'):
282 """Return list of string representation of rows in alignment.
284 Each row has attribute `sequence` pointing to the sequence of row.
286 `gap` is the symbol to use for gap.
287 """
288 rows = []
289 for sequence in self.sequences:
290 string = ""
291 for column in self.columns:
292 if sequence in column:
293 string += column[sequence].code1
294 else:
295 string += gap
296 string = util.UserString(string)
297 string.sequence = sequence
298 rows.append(string)
299 return rows
301 def row_as_list(self, sequence):
302 """Return representaion of row as list with `Monomers` and `None`s."""
303 return [column.get(sequence) for column in self.columns]
305 def row_as_string(self, sequence, gap='-'):
306 """Return string representaion of row in alignment.
308 String will have gaps represented by `gap` symbol (defaults to '-').
309 """
310 def char(monomer):
311 if monomer:
312 return monomer.code1
313 return gap
314 row = self.row_as_list(sequence)
315 return "".join(map(char, row))
317 def columns_as_lists(self):
318 """Return list of columns (temorary objects) in alignment.
320 Each column here is a list of either monomer or None (for gaps).
322 Items of column are sorted in the same way as alignment.sequences.
324 Modifications of column have no effect on the alignment.
325 """
326 columns = []
327 for column in self.columns:
328 col = util.UserList()
329 col.column = column
330 for sequence in self.sequences:
331 col.append(column.get(sequence))
332 columns.append(col)
333 return columns
335 # Alignment / Block editing methods
336 # =================================
338 def flush(self, whence='left'):
339 """Remove all gaps from alignment and flush results to one side.
341 `whence` must be one of 'left', 'right' or 'center'
342 """
343 if whence == 'left':
344 from processors import Left as Flush
345 elif whence == 'right':
346 from processors import Right as Flush
347 elif whence == 'center':
348 from processors import Center as Flush
349 else:
350 raise AssertionError, "Whence must be left, right or center"
351 self.realign(Flush())
353 def remove_gap_columns(self):
354 """Remove all empty columns."""
355 for n, column in reversed(list(enumerate(self.columns))):
356 if column == {}:
357 self.columns[n:n+1] = []
359 def _wipe_row(self, sequence):
360 """Turn all row positions into gaps (but keep sequences intact)."""
361 for column in self.columns:
362 if sequence in column:
363 del column[sequence]
365 def _merge(self, dst, new, merge):
366 """Replace contents of `dst` with those of `new`.
368 Replace contents of elements using function `merge(dst_el, new_le)`.
369 """
370 for el, new_el in zip(dst, new):
371 merge(el, new_el)
372 dst[len(dst):] = new[len(dst):]
373 del dst[len(new):]
375 def _replace_sequence_contents(self, new, copy_descriptions):
376 """Replace contents of sequences with those of `new` alignment."""
377 # XXX: we manually copy sequence contents here
378 # XXX: we only copy, overlapping parts and link to the rest
379 def merge_monomers(dst, new):
380 dst.__class__ = new.__class__
381 def merge_sequences(dst, new):
382 if copy_descriptions:
383 vars(dst).update(vars(new))
384 self._merge(dst, new, merge_monomers)
385 self._merge(self.sequences, new.sequences, merge_sequences)
387 def _replace_column_contents(self, new):
388 """Replace column contents with those of `new` alignment.
390 In other words: copy gap patterns from `new` to `self`.
392 `self.sequences` and `new.sequences` should have the same contents.
393 """
394 for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
395 sequence = row.sequence
396 monomers = filter(None, row)
397 assert len(monomers) == len(filter(None, new_row))
398 self._wipe_row(sequence)
399 non_gap_columns = [column
400 for column, monomer in zip(self.columns, new_row)
401 if monomer
403 for monomer, column in zip(monomers, non_gap_columns):
404 column[sequence] = monomer
406 def _replace_contents(self, new, copy_descriptions, copy_contents):
407 """Replace alignment contents with those of other alignment."""
408 if copy_contents:
409 self._replace_sequence_contents(new, copy_descriptions)
410 self._replace_column_contents(new)
412 def process(self, function, copy_descriptions=True, copy_contents=True):
413 """Apply function to the alignment (or block); inject results back.
415 - `function(block)` must return block with same line order.
416 - if `copy_descriptions` is False, ignore new sequence names.
417 - if `copy_contents` is False, don't copy sequence contents too.
419 `function` (object) may have attributes `copy_descriptions` and
420 `copy_contents`, which override the same named arguments.
421 """
422 new = function(self)
423 if hasattr(function, 'copy_descriptions'):
424 copy_descriptions = function.copy_descriptions
425 if hasattr(function, 'copy_contents'):
426 copy_contents = function.copy_contents
427 self._replace_contents(new, copy_descriptions, copy_contents)
429 def realign(self, function):
430 """Realign self.
432 I.e.: apply function to self to produce a new alignment, then update
433 self to have the same gap patterns as the new alignment.
435 This is the same as process(function, False, False)
436 """
437 new = function(self)
438 self._replace_column_contents(new)
440 class Column(dict):
441 """Column of alignment.
443 Column is a dict of { sequence : monomer }.
445 For sequences that have gaps in current row, given key is not present in
446 the column.
447 """
449 types = base
450 """Mapping of related types. SHOULD be redefined in subclasses."""
452 def __hash__(self):
453 """Return hash by identity."""
454 return id(self)
456 class Block(Alignment):
457 """Block of alignment.
459 Block is an intersection of several rows & columns. (The collections of
460 rows and columns are represented as ordered lists, to retain display order
461 of Alignment or add ability to tweak it). Most of blocks look like
462 rectangular part of alignment if you shuffle alignment rows the right way.
463 """
465 alignment = None
466 """Alignment the block belongs to."""
468 sequences = ()
469 """List of sequences in block."""
471 columns = ()
472 """List of columns in block."""
474 @classmethod
475 def from_alignment(cls, alignment, sequences=None, columns=None):
476 """Build new block from alignment.
478 If sequences are not given, the block uses all sequences in alignment.
480 If columns are not given, the block uses all columns in alignment.
482 In both cases we use exactly the list used in alignment, thus, if new
483 sequences or columns are added to alignment, the block tracks this too.
484 """
485 if sequences is None:
486 sequences = alignment.sequences
487 if columns is None:
488 columns = alignment.columns
489 block = cls()
490 block.alignment = alignment
491 block.sequences = sequences
492 block.columns = columns
493 return block
495 class SequenceMarkup(object):
497 name = None
498 """Name of markup in owner sequence"""
500 def __init__(self, sequence, name=None):
501 if name:
502 self.name = name
503 assert self.name is not None
504 assert self.name not in sequence.markups
505 self.sequence = sequence
506 sequence.markups[self.name] = self
507 self.refresh()
509 def refresh(self):
510 pass
512 def __contains__(self, monomer):
513 return monomer in self.sequence
515 def __getitem__(self, monomer):
516 return getattr(monomer, self.name)
518 def __setitem__(self, monomer, value):
519 return setattr(monomer, self.name, value)
521 class AlignmentMarkup(dict):
523 name = None
524 """Name of markup in owner alignment"""
526 def __init__(self, alignment, name=None):
527 if name:
528 self.name = name
529 assert self.name is not None
530 assert self.name not in alignment.markups
531 self.alignment = alignment
532 alignment.markups[self.name] = self
533 self.refresh()
535 def refresh(self):
536 pass
538 # vim: set ts=4 sts=4 sw=4 et: