Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/0377cd9ac4e6/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 21:55:34 2013
Кодировка:
allpy: 0377cd9ac4e6 allpy/base.py

allpy

view allpy/base.py @ 717:0377cd9ac4e6

Added NEWS entry for markup file format
author Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date Fri, 08 Jul 2011 12:09:39 +0400
parents 21cfc7897a8f
children fd531580b9af
line source
1 import sys
2 import re
4 import util
5 import fileio
6 import data.monomers
8 # import this very module as means of having all related classes in one place
9 import base
default_gaps = set((".", "-", "~"))
"""Set of characters to recognize as gaps when parsing alignment."""
class Monomer(object):
    """Monomer object.

    Concrete monomers are classes generated by `_subclass`; they carry the
    class attributes `name`, `code1`, `code3` and `is_modified`.
    """

    type = None
    """Either of 'dna', 'rna', 'protein'."""

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    by_code1 = {}
    """A mapping from 1-letter code to Monomer subclass."""

    by_code3 = {}
    """A mapping from 3-letter code to Monomer subclass."""

    by_name = {}
    """A mapping from full monomer name to Monomer subclass."""

    @classmethod
    def _subclass(cls, name='', code1='', code3='', is_modified=False):
        """Create new subclass of Monomer for given monomer type.

        `name` is stripped and capitalized; `code1` and `code3` are
        upper-cased before being stored on the new class and used as keys
        in the lookup tables.

        The new class is stored as an attribute of the module found at
        `vars(data.monomers)[cls.type]`, so `cls.type` must name an
        attribute of `data.monomers` (KeyError otherwise).
        """
        class TheMonomer(cls):
            pass
        name = name.strip().capitalize()
        code1 = code1.upper()
        code3 = code3.upper()
        module = vars(data.monomers)[cls.type]
        # Class names must be identifiers: replace every non-word character.
        TheMonomer.__name__ = re.sub(r"\W", "_", name)
        TheMonomer.__module__ = module.__name__
        TheMonomer.name = name
        TheMonomer.code1 = code1
        TheMonomer.code3 = code3
        TheMonomer.is_modified = is_modified
        # Save the class in data.monomers so that it can be pickled
        # Some names are not unique, we append underscores to them
        # in order to fix it.
        while TheMonomer.__name__ in vars(module):
            TheMonomer.__name__ += "_"
        vars(module)[TheMonomer.__name__] = TheMonomer
        # NOTE(review): indentation was lost in the extracted source; it is
        # assumed that only the 1-letter registration is guarded by
        # `is_modified` -- confirm against the repository.
        if not is_modified:
            cls.by_code1[code1] = TheMonomer
        cls.by_code3[code3] = TheMonomer
        cls.by_name[name] = TheMonomer
        # We duplicate distinguished long names into Monomer itself, so that we
        # can use Monomer.from_code3 to create the relevant type of monomer.
        Monomer.by_code3[code3] = TheMonomer
        Monomer.by_name[name] = TheMonomer

    @classmethod
    def _initialize(cls, codes=None):
        """Create all relevant subclasses of Monomer.

        `codes` is an iterable of `(code1, is_modified, code3, name)`
        tuples. `None` (the default) is treated as an empty iterable, so
        calling `_initialize()` without arguments is a no-op rather than a
        TypeError.
        """
        for code1, is_modified, code3, name in codes or ():
            cls._subclass(name, code1, code3, is_modified)

    @classmethod
    def from_code1(cls, code1):
        """Create new monomer from 1-letter code.

        Lookup is case-insensitive; the code exactly as given is kept on
        the instance as `input_code1`. Raises KeyError for unknown codes.
        """
        monomer = cls.by_code1[code1.upper()]()
        monomer.input_code1 = code1
        return monomer

    @classmethod
    def from_code3(cls, code3):
        """Create new monomer from 3-letter code (case-insensitive).

        Raises KeyError for unknown codes.
        """
        return cls.by_code3[code3.upper()]()

    @classmethod
    def from_name(cls, name):
        """Create new monomer from full name.

        The name is stripped and capitalized before lookup. Raises KeyError
        for unknown names.
        """
        return cls.by_name[name.strip().capitalize()]()

    def __repr__(self):
        return "<Monomer %s>" % str(self.code1)

    def __str__(self):
        """Returns one-letter code"""
        return self.code1

    def __eq__(self, other):
        """Monomers within same monomer type are compared by code1.

        Any false value (e.g. `None`) compares unequal. Comparing monomers
        of different `type` trips the assertion below.
        """
        if not other:
            return False
        assert self.type == other.type
        return self.code1 == other.code1

    def __ne__(self, other):
        return not (self == other)
class Sequence(list):
    """Sequence of Monomers.

    This behaves like list of monomer objects. In addition to standard list
    behaviour, Sequence has the following attributes:

    * name -- str with the name of the sequence
    * description -- str with description of the sequence
    * source -- str denoting source of the sequence

    Any of them may be empty (i.e. hold empty string)

    Every instance also has a `markups` dict, created empty in `__init__`.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    name = ''
    description = ''
    source = ''

    def __init__(self, *args):
        """Initialize as a list from `args` and create empty `markups`."""
        self.markups = {}
        list.__init__(self, *args)

    @classmethod
    def from_monomers(cls, monomers=None, name=None, description=None,
                      source=None):
        """Create sequence from a list of monomer objects.

        `monomers` may be any iterable; `None` (the default) gives an empty
        sequence. `name`, `description` and `source` are only assigned
        when they are true values, otherwise the class-level empty strings
        remain in effect.
        """
        # A None sentinel replaces the former shared mutable default `[]`.
        if monomers is None:
            monomers = []
        result = cls(monomers)
        if name:
            result.name = name
        if description:
            result.description = description
        if source:
            result.source = source
        return result

    @classmethod
    def from_string(cls, string, name='', description='', source=''):
        """Create sequences from string of one-letter codes.

        Each character is turned into a monomer with
        `cls.types.Monomer.from_code1`.
        """
        monomer = cls.types.Monomer.from_code1
        monomers = [monomer(letter) for letter in string]
        return cls.from_monomers(monomers, name, description, source)

    def __repr__(self):
        """Show the name if there is one, else the one-letter codes."""
        if self.name:
            return '<Sequence %s>' % str(self.name)
        else:
            return '<Sequence %s>' % str(self)

    def __str__(self):
        """Returns sequence of one-letter codes."""
        return ''.join(monomer.code1 for monomer in self)

    def __hash__(self):
        """Hash sequence by identity."""
        return id(self)
class Alignment(object):
    """Alignment. It is a list of Columns.

    Instances hold three attributes: `sequences` (list of sequences),
    `columns` (list of columns, each a dict of { sequence : monomer }) and
    `markups` (dict).
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    sequences = None
    """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

    def __init__(self):
        """Initialize empty alignment."""
        self.sequences = []
        self.columns = []
        self.markups = {}

    # Alignment grow & IO methods
    # ==============================

    def append_sequence(self, sequence):
        """Add sequence to alignment. Return self.

        If sequence is too short, pad it with gaps on the right.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(sequence))
        for column, monomer in zip(self.columns, sequence):
            column[sequence] = monomer
        return self

    def append_row_from_string(self, string,
            name='', description='', source='', gaps=default_gaps):
        """Add row from a string of one-letter codes and gaps. Return self.

        Characters found in `gaps` become gaps; the remaining characters
        are parsed into a new `self.types.Sequence`.
        """
        Sequence = self.types.Sequence
        without_gaps = util.remove_each(string, gaps)
        sequence = Sequence.from_string(without_gaps, name, description, source)
        self._pad_to_width(len(string))
        non_gap_columns = [column
            for column, char in zip(self.columns, string)
            if char not in gaps
        ]
        for monomer, column in zip(sequence, non_gap_columns):
            column[sequence] = monomer
        self.sequences.append(sequence)
        return self

    def append_row_with_gaps(self, row, sequence):
        """Add row from row_as_list representation and sequence. Return self.

        False items of `row` (e.g. `None`) are treated as gaps.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(row))
        for column, monomer in zip(self.columns, row):
            if monomer:
                column[sequence] = monomer
        return self

    def _pad_to_width(self, n):
        """Pad alignment with empty columns on the right to width n.

        Does nothing if the alignment already has at least `n` columns.
        """
        for i in range(len(self.columns), n):
            self.columns.append(Column())

    def append_file(self, file, format='fasta', gaps=default_gaps):
        """Append sequences from file to alignment. Return self.

        If sequences in file have gaps (detected as characters belonging to
        `gaps` set), treat them accordingly.
        """
        fileio.File(file, format, gaps=gaps).read_alignment(self)
        return self

    def to_file(self, file, format='fasta', gap='-'):
        """Write alignment to file in `format` as sequences with gaps.

        `gap` is handed to `fileio.File` as its `gaps` argument.
        Return self.
        """
        fileio.File(file, format, gaps=gap).write_alignment(self)
        return self

    # Data access methods for alignment
    # =================================

    def rows(self):
        """Return list of rows (temporary objects) in alignment.

        Each row is a dictionary of { column : monomer }.

        For gap positions there is no key for the column in row.

        Each row has attribute `sequence` pointing to the sequence the row is
        describing.

        Modifications of row have no effect on the alignment.
        """
        # For now, the function returns a list rather than iterator.
        # It is yet to see, whether memory performance here becomes critical,
        # or is random access useful.
        rows = []
        for sequence in self.sequences:
            row = util.UserDict()
            row.sequence = sequence
            for column in self.columns:
                if sequence in column:
                    row[column] = column[sequence]
            rows.append(row)
        return rows

    def rows_as_lists(self):
        """Return list of rows (temporary objects) in alignment.

        Each row here is a list of either monomer or None (for gaps).

        Each row has attribute `sequence` pointing to the sequence of row.

        Modifications of row have no effect on the alignment.
        """
        rows = []
        for sequence in self.sequences:
            row = util.UserList()
            row.sequence = sequence
            for column in self.columns:
                row.append(column.get(sequence))
            rows.append(row)
        return rows

    def rows_as_strings(self, gap='-'):
        """Return list of string representation of rows in alignment.

        Each row has attribute `sequence` pointing to the sequence of row.

        `gap` is the symbol to use for gap.
        """
        rows = []
        for sequence in self.sequences:
            # Join once instead of growing the string one column at a time.
            string = "".join(
                column[sequence].code1 if sequence in column else gap
                for column in self.columns
            )
            string = util.UserString(string)
            string.sequence = sequence
            rows.append(string)
        return rows

    def row_as_list(self, sequence):
        """Return representation of row as list with `Monomers` and `None`s."""
        return [column.get(sequence) for column in self.columns]

    def row_as_string(self, sequence, gap='-'):
        """Return string representation of row in alignment.

        String will have gaps represented by `gap` symbol (defaults to '-').
        """
        def char(monomer):
            if monomer:
                return monomer.code1
            return gap
        row = self.row_as_list(sequence)
        return "".join(map(char, row))

    def columns_as_lists(self):
        """Return list of columns (temporary objects) in alignment.

        Each column here is a list of either monomer or None (for gaps).

        Items of column are sorted in the same way as alignment.sequences.

        Each returned list has attribute `column` pointing to the column it
        was built from.

        Modifications of column have no effect on the alignment.
        """
        columns = []
        for column in self.columns:
            col = util.UserList()
            col.column = column
            for sequence in self.sequences:
                col.append(column.get(sequence))
            columns.append(col)
        return columns

    # Alignment / Block editing methods
    # =================================

    def flush(self, whence='left'):
        """Remove all gaps from alignment and flush results to one side.

        `whence` must be one of 'left', 'right' or 'center'; any other
        value raises AssertionError.
        """
        if whence == 'left':
            from processors import Left as Flush
        elif whence == 'right':
            from processors import Right as Flush
        elif whence == 'center':
            from processors import Center as Flush
        else:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise AssertionError("Whence must be left, right or center")
        self.realign(Flush())

    def remove_gap_columns(self):
        """Remove all empty columns.

        The `columns` list is edited in place.
        """
        # Walk from the right so that deletions do not shift the indices
        # still to be visited.
        for n, column in reversed(list(enumerate(self.columns))):
            if column == {}:
                self.columns[n:n+1] = []

    def _wipe_row(self, sequence):
        """Turn all row positions into gaps (but keep sequences intact)."""
        for column in self.columns:
            if sequence in column:
                del column[sequence]

    def _merge(self, dst, new, merge):
        """Replace contents of `dst` with those of `new`.

        Replace contents of elements using function `merge(dst_el, new_el)`.

        Overlapping elements are merged in place; if `new` is longer its
        tail is appended to `dst`, if it is shorter `dst` is truncated.
        """
        for el, new_el in zip(dst, new):
            merge(el, new_el)
        dst[len(dst):] = new[len(dst):]
        del dst[len(new):]

    def _replace_sequence_contents(self, new, copy_descriptions):
        """Replace contents of sequences with those of `new` alignment.

        If `copy_descriptions` is true, instance attributes of each new
        sequence are copied onto the corresponding existing sequence.
        """
        # XXX: we manually copy sequence contents here
        # XXX: we only copy, overlapping parts and link to the rest
        def merge_monomers(dst, new):
            dst.__class__ = new.__class__
        def merge_sequences(dst, new):
            if copy_descriptions:
                vars(dst).update(vars(new))
            self._merge(dst, new, merge_monomers)
        self._merge(self.sequences, new.sequences, merge_sequences)

    def _replace_column_contents(self, new):
        """Replace column contents with those of `new` alignment.

        In other words: copy gap patterns from `new` to `self`.

        `self.sequences` and `new.sequences` should have the same contents.
        """
        for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
            sequence = row.sequence
            # List comprehensions instead of len(filter(...)), which fails
            # where filter() returns an iterator.
            monomers = [monomer for monomer in row if monomer]
            new_monomers = [monomer for monomer in new_row if monomer]
            assert len(monomers) == len(new_monomers)
            self._wipe_row(sequence)
            # If `new` is wider than `self`, grow first; otherwise zip()
            # below would silently drop the monomers that do not fit.
            self._pad_to_width(len(new_row))
            non_gap_columns = [column
                for column, monomer in zip(self.columns, new_row)
                if monomer
            ]
            for monomer, column in zip(monomers, non_gap_columns):
                column[sequence] = monomer

    def _replace_contents(self, new, copy_descriptions, copy_contents):
        """Replace alignment contents with those of other alignment.

        Gap patterns are always copied; sequence contents only when
        `copy_contents` is true.
        """
        if copy_contents:
            self._replace_sequence_contents(new, copy_descriptions)
        self._replace_column_contents(new)

    def process(self, function, copy_descriptions=True, copy_contents=True):
        """Apply function to the alignment (or block); inject results back.

        - `function(block)` must return block with same line order.
        - if `copy_descriptions` is False, ignore new sequence names.
        - if `copy_contents` is False, don't copy sequence contents too.

        `function` (object) may have attributes `copy_descriptions` and
        `copy_contents`, which override the same named arguments.
        """
        new = function(self)
        if hasattr(function, 'copy_descriptions'):
            copy_descriptions = function.copy_descriptions
        if hasattr(function, 'copy_contents'):
            copy_contents = function.copy_contents
        self._replace_contents(new, copy_descriptions, copy_contents)

    def realign(self, function):
        """Realign self.

        I.e.: apply function to self to produce a new alignment, then update
        self to have the same gap patterns as the new alignment.

        This is the same as process(function, False, False)
        """
        new = function(self)
        self._replace_column_contents(new)
class Column(dict):
    """One column of an alignment.

    A column maps { sequence : monomer }. A sequence that has a gap at this
    position simply has no key in the column.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    def __hash__(self):
        """Hash the column by identity, so it can serve as a dict key."""
        return id(self)
class Block(Alignment):
    """A block within an alignment.

    A block is the intersection of some rows and some columns. Both
    collections are kept as ordered lists, so that the display order of the
    alignment is retained (or can be tweaked). With a suitable reordering
    of alignment rows, most blocks look like a rectangular part of the
    alignment.
    """

    alignment = None
    """Alignment the block belongs to."""

    sequences = ()
    """List of sequences in block."""

    columns = ()
    """List of columns in block."""

    @classmethod
    def from_alignment(cls, alignment, sequences=None, columns=None):
        """Build a new block on top of `alignment`.

        When `sequences` is None the block takes all sequences of the
        alignment; when `columns` is None it takes all columns.

        In either case the very list object of the alignment is reused, so
        sequences or columns added to the alignment later show up in the
        block as well.
        """
        chosen_sequences = (
            alignment.sequences if sequences is None else sequences)
        chosen_columns = alignment.columns if columns is None else columns
        block = cls()
        block.alignment = alignment
        block.sequences = chosen_sequences
        block.columns = chosen_columns
        return block
class Markup(object):
    """Base class for sequence and alignment markups.

    We shall call either sequence or alignment a container. And we shall call
    either monomers or columns elements respectively.

    Markup behaves like a dictionary of [element] -> value.

    Every container has a dictionary of [name] -> markup. It is Markup's
    responsibility to add itself to this dictionary and to avoid collisions
    while doing it.
    """

    name = None
    """Name of markup elements"""

    def _register(self, container, name):
        """Register self within container.

        Assure the name is not taken before. If name is not given, look in the
        class. Make sure we have some name at all.

        Raises AssertionError if no name is available or if the name is
        already present in `container.markups`.
        """
        if name:
            self.name = name
        # Explicit raises instead of `assert` statements: the checks must
        # also hold under `python -O`, where asserts are stripped and a
        # collision would silently overwrite an existing markup.
        if self.name is None:
            raise AssertionError("markup has no name")
        if self.name in container.markups:
            raise AssertionError(
                "markup name %r is already taken" % (self.name,))
        container.markups[self.name] = self

    def refresh(self):
        """Recalculate markup values (if they are generated automatically)."""
        pass

    @classmethod
    def from_record(cls, container, record, name=None):
        """Restore markup from `record`. (Used for loading from file).

        `record` is a dict of all metadata and data related to one markup. All
        keys and values in `record` are strings, markup must parse them itself.

        Markup values should be stored in `record['markup']`, which is a list
        of items separated with either `record['separator']` or a comma.

        This base implementation does not read `record`; it only constructs
        `cls(container, name)`.
        """
        return cls(container, name)

    def to_record(self):
        """Save markup to `record`, for saving to file.

        For description of `record` see docstring for `from_record` method.

        This base implementation returns an empty dict.
        """
        return {}

    def sorted_keys(self):
        """Return list of elements in the container in proper order."""
        raise NotImplementedError()

    def sorted_values(self):
        """Return list of markup values in container."""
        raise NotImplementedError()
class SequenceMarkup(Markup):
    """Markup attached to a sequence.

    Acts as a mapping of [monomer] -> value, where the value may be anything
    or something specific to the subclass.

    The values live on the monomers themselves, as an attribute named after
    the markup.
    """

    def __init__(self, sequence, name=None):
        """Bind to `sequence`, register there under `name`, then refresh."""
        self.sequence = sequence
        self._register(sequence, name)
        self.refresh()

    def sorted_keys(self):
        """Return the monomers, i.e. the sequence itself."""
        return self.sequence

    def sorted_values(self):
        """Return a generator of markup values, one per monomer.

        Only usable to the end if every monomer is marked up.
        """
        return (self[element] for element in self.sequence)

    def get(self, key, value=None):
        """Part of Mapping collection interface."""
        return self[key] if key in self else value

    def __contains__(self, monomer):
        """Part of Mapping collection interface."""
        attribute = self.name
        return hasattr(monomer, attribute)

    def __getitem__(self, monomer):
        """Part of Mapping collection interface."""
        attribute = self.name
        return getattr(monomer, attribute)

    def __setitem__(self, monomer, value):
        """Part of Mapping collection interface."""
        attribute = self.name
        return setattr(monomer, attribute, value)
class AlignmentMarkup(dict, Markup):
    """Markup attached to an alignment.

    This is a dict of [column] -> value, where the value may be anything or
    something specific to the subclass.
    """

    def __init__(self, alignment, name=None):
        """Bind to `alignment`, register there under `name`, then refresh."""
        self.alignment = alignment
        self._register(alignment, name)
        self.refresh()

    def sorted_keys(self):
        """Return the column list of the alignment."""
        container = self.alignment
        return container.columns

    def sorted_values(self):
        """Return a generator of markup values, one per column.

        Only usable to the end if every column is marked up.
        """
        return (self[element] for element in self.alignment.columns)
611 # vim: set ts=4 sts=4 sw=4 et: