Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/2b3cad50c2b1/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 20:41:16 2013
Кодировка:
allpy: 2b3cad50c2b1 allpy/base.py

allpy

view allpy/base.py @ 1106:2b3cad50c2b1

Partially reversed [afed1f] (see #49). As explained in the ticket, in real-life use cases having a monomer belong to several sequences is sometimes extremely useful. ANY approach to attribution of a monomer to only one sequence will be either confusing or hindering. * Removed `monomer.sequence` attribute * Removed unnecessary special-casing in pickle * Removed unused tests * Restored APIs to backward-compatible * Added deprecation messages to the restored APIs
author Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date Sun, 10 Jun 2012 16:08:47 +0400
parents 41a167bbf150
children 79978caa35ee
line source
import re
import sys
import warnings

import util
import fileio
import data.monomers

# import this very module as means of having all related classes in one place
import base
default_gaps = ("-", ".", "~")
"""Set of characters to recognize as gaps when parsing alignment."""
class Monomer(object):
    """Monomer object.

    Concrete monomer classes are generated by `_subclass` and registered in
    the `by_code1`, `by_code3` and `by_name` lookup tables.
    """

    type = None
    """Either of 'dna', 'rna', 'protein'."""

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    by_code1 = {}
    """A mapping from 1-letter code to Monomer subclass."""

    by_code3 = {}
    """A mapping from 3-letter code to Monomer subclass."""

    by_name = {}
    """A mapping from full monomer name to Monomer subclass."""

    @classmethod
    def _subclass(cls, name='', code1='', code3='', is_modified=False):
        """Create new subclass of Monomer for given monomer type.

        The new class is stored in the `data.monomers` submodule named by
        `cls.type` and registered in the lookup tables of `cls`; its 3-letter
        code and name are also registered in `Monomer` itself.

        - `name` is the full monomer name (stripped and capitalized here)
        - `code1` and `code3` are 1- and 3-letter codes (upper-cased here)
        - `is_modified` marks modified monomers: they are not registered by
          `code1` and never replace an existing `code3` entry
        """
        class TheMonomer(cls):
            pass
        name = name.strip().capitalize()
        code1 = code1.upper()
        code3 = code3.upper()
        module = vars(data.monomers)[cls.type]
        TheMonomer.__name__ = re.sub(r"\W", "_", name)
        TheMonomer.__module__ = module.__name__
        TheMonomer.name = name
        TheMonomer.code1 = code1
        TheMonomer.code3 = code3
        TheMonomer.is_modified = is_modified
        # Save the class in data.monomers so that it can be pickled
        # Some names are not unique, we append underscores to them
        # in order to fix it.
        while TheMonomer.__name__ in vars(module):
            TheMonomer.__name__ += "_"
        vars(module)[TheMonomer.__name__] = TheMonomer
        if not is_modified:
            cls.by_code1[code1] = TheMonomer
        if code3 not in cls.by_code3 or not is_modified:
            cls.by_code3[code3] = TheMonomer
        cls.by_name[name] = TheMonomer
        # We duplicate distinguished long names into Monomer itself, so that we
        # can use Monomer.from_code3 to create the relevant type of monomer.
        if code3 not in Monomer.by_code3 or not is_modified:
            Monomer.by_code3[code3] = TheMonomer
        Monomer.by_name[name] = TheMonomer

    @classmethod
    def _initialize(cls, codes=None):
        """Create all relevant subclasses of Monomer.

        `codes` is an iterable of (code1, is_modified, code3, name) tuples.
        """
        for code1, is_modified, code3, name in codes:
            cls._subclass(name, code1, code3, is_modified)

    @classmethod
    def from_code1(cls, code1):
        """Create new monomer from 1-letter code.

        Deprecated. The code as given (before upper-casing) is kept in the
        `input_code1` attribute of the returned monomer.
        Raises KeyError for an unknown code.
        """
        # The module has no `deprecated` helper in scope, so the warning is
        # issued through the standard library; stacklevel=2 blames the caller.
        warnings.warn(
            "Monomer.from_code1(...) is deprecated in favor of Sequence.append_monomer(code1=...)",
            DeprecationWarning,
            stacklevel=2
        )
        monomer = cls.by_code1[code1.upper()]()
        monomer.input_code1 = code1
        return monomer

    @classmethod
    def from_code3(cls, code3):
        """Create new monomer from 3-letter code.

        Deprecated. Raises KeyError for an unknown code.
        """
        warnings.warn(
            "Monomer.from_code3(...) is deprecated in favor of Sequence.append_monomer(code3=...)",
            DeprecationWarning,
            stacklevel=2
        )
        return cls.by_code3[code3.upper()]()

    @classmethod
    def from_name(cls, name):
        """Create new monomer from full name.

        Deprecated. Raises KeyError for an unknown name.
        """
        warnings.warn(
            "Monomer.from_name(...) is deprecated in favor of Sequence.append_monomer(name=...)",
            DeprecationWarning,
            stacklevel=2
        )
        return cls.by_name[name.strip().capitalize()]()

    def __repr__(self):
        return "<Monomer %s>" % str(self.code1)

    def __str__(self):
        """Returns one-letter code"""
        return self.code1

    def __eq__(self, other):
        """Monomers within same monomer type are compared by code1.

        Any false value (e.g. None, used for gaps) compares unequal.
        """
        if not other:
            return False
        assert self.type == other.type
        return self.code1 == other.code1

    def __ne__(self, other):
        return not (self == other)
class MarkupContainerMixin(object):
    """Markup bookkeeping shared by sequences and alignments."""

    def _init(self):
        """Set up the empty markup registry.

        Meant to be called from `__init__` of the class using the mixin.
        """
        self.markups = dict()

    def add_markup(self, name, markup_class=None, use_existing=False, **kws):
        """Create a markup, register it under `name` and return it.

        - `name` is the key of the markup in `self.markups`
        - `markup_class`, if given, is the class to instantiate; otherwise
          the class is looked up in `markups.by_name` by container kind
          and `name`
        - with a true `use_existing`, an already registered markup of the
          same name is returned untouched instead of being an error
        - remaining keyword arguments go to the markup constructor

        User-defined markups need both `name` and `markup_class`; standard
        automatic markups only need `name`.
        """
        # Imported lazily on purpose: `markups` itself builds on `base`,
        # so a module-level import would create an import loop.
        import markups
        if markup_class is None:
            lookup_key = (self.kind + "_" + "markup", name)
            markup_class = markups.by_name[lookup_key]
        if use_existing and name in self.markups:
            existing = self.markups[name]
            assert existing.__class__ is markup_class
            return existing
        assert name not in self.markups
        created = markup_class(self, name, caller='container', **kws)
        self.markups[name] = created
        return created

    def remove_markup(self, name):
        """Let the markup clean up after itself, then unregister it."""
        doomed = self.markups[name]
        doomed.remove()
        del self.markups[name]
class Sequence(list, MarkupContainerMixin):
    """Sequence of Monomers.

    This behaves like list of monomer objects. In addition to standard list
    behaviour, Sequence has the following attributes:

    * name -- str with the name of the sequence
    * description -- str with description of the sequence
    * source -- str denoting source of the sequence

    Any of them may be empty (i.e. hold empty string)
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    kind = 'sequence'
    """Description of object kind."""

    name = ''
    """Sequence identifier."""

    description = ''
    """Detailed sequence description."""

    source = ''
    """Sequence source."""

    def __init__(self, sequence=(), name='', description='', source=''):
        """Create a sequence holding the monomers of iterable `sequence`."""
        list.__init__(self, sequence)
        MarkupContainerMixin._init(self)

        self.name = name
        self.description = description
        self.source = source

    def append_monomer(self, code1=None, code3=None, name=None):
        """Append a new monomer to the sequence. Return the new monomer.

        Exactly one of `code1`, `code3`, `name` must be given; the monomer
        class is looked up in the matching table of `self.types.Monomer`.
        The `input_code1` attribute of the monomer is set to `code1` as
        given (None when the monomer was requested by `code3` or `name`).

        Raises KeyError for an unknown code or name.
        """
        assert bool(code1) + bool(code3) + bool(name) == 1, \
            "Please specify exactly one of: code1, code3, name"
        if code1:
            cls = self.types.Monomer.by_code1[code1.upper()]
        elif code3:
            cls = self.types.Monomer.by_code3[code3.upper()]
        elif name:
            cls = self.types.Monomer.by_name[name.strip().capitalize()]
        monomer = cls()
        monomer.input_code1 = code1
        self.append(monomer)
        return monomer

    @classmethod
    def from_monomers(cls, monomers=(), name='', description='', source=''):
        """Create sequence from a list of monomer objects.

        Deprecated in favor of calling the class directly.
        """
        # The module has no `deprecated` helper in scope, so the warning is
        # issued through the standard library; stacklevel=2 blames the caller.
        warnings.warn(
            "Sequence.from_monomers(...) is deprecated in favor of Sequence(...)",
            DeprecationWarning,
            stacklevel=2
        )
        return cls(monomers, name, description, source)

    @classmethod
    def from_string(cls, string, name='', description='', source=''):
        """Create sequence from string of one-letter codes."""
        self = cls([], name=name, description=description, source=source)
        for letter in string:
            self.append_monomer(code1=letter)
        return self

    def __repr__(self):
        if self.name:
            return '<Sequence %s>' % str(self.name)
        else:
            return '<Sequence %s>' % str(self)

    def __str__(self):
        """Returns sequence of one-letter codes."""
        return ''.join(monomer.code1 for monomer in self)

    def __hash__(self):
        """Hash sequence by identity."""
        return id(self)
class Alignment(MarkupContainerMixin):
    """Alignment. It is a list of Columns."""

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    sequences = None
    """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

    kind = 'alignment'
    """Description of object kind."""

    def __init__(self):
        """Initialize empty alignment."""
        self.sequences = []
        self.columns = []
        MarkupContainerMixin._init(self)

    # Alignment grow & IO methods
    # ==============================

    def append_sequence(self, sequence):
        """Add sequence to alignment. Return self.

        If sequence is too short, pad it with gaps on the right.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(sequence))
        for column, monomer in zip(self.columns, sequence):
            column[sequence] = monomer
        return self

    def append_row_from_string(self, string,
            name='', description='', source='', gaps=default_gaps):
        """Add row from a string of one-letter codes and gaps. Return self.

        Characters found in `gaps` become gaps; all the others are turned
        into monomers of a new sequence of type `self.types.Sequence`.
        """
        Sequence = self.types.Sequence
        without_gaps = util.remove_each(string, gaps)
        sequence = Sequence.from_string(without_gaps, name, description, source)
        self._pad_to_width(len(string))
        non_gap_columns = [column
            for column, char in zip(self.columns, string)
            if char not in gaps
        ]
        for monomer, column in zip(sequence, non_gap_columns):
            column[sequence] = monomer
        self.sequences.append(sequence)
        return self

    def append_row_with_gaps(self, row, sequence):
        """Add row from row_as_list representation and sequence. Return self.

        False items of `row` (e.g. None) are treated as gaps.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(row))
        for column, monomer in zip(self.columns, row):
            if monomer:
                column[sequence] = monomer
        return self

    def _append_columns(self, n, columns):
        """Insert list of `columns` after position `n`."""
        self.columns[n+1:n+1] = columns

    def _pad_to_width(self, n):
        """Pad alignment with empty columns on the right to width n."""
        columns = [self.types.Column() for _ in range(len(self.columns), n)]
        self._append_columns(len(self.columns)-1, columns)

    def append_file(self, file, format='fasta', gaps=default_gaps):
        """Append sequences from file to alignment. Return self.

        If sequences in file have gaps (detected as characters belonging to
        `gaps` set), treat them accordingly.
        """
        fileio.File(file, format, gaps=gaps).read_alignment(self)
        return self

    def to_file(self, file, format='fasta', gap='-'):
        """Write alignment to file in the given format (FASTA by default).

        `gap` is handed to `fileio.File` as its `gaps` argument.
        Return self.
        """
        fileio.File(file, format, gaps=gap).write_alignment(self)
        return self

    # Data access methods for alignment
    # =================================

    def rows(self):
        """Return list of rows (temporary objects) in alignment.

        Each row is a dictionary of { column : monomer }.

        For gap positions there is no key for the column in row.

        Each row has attribute `sequence` pointing to the sequence the row is
        describing.

        Modifications of row have no effect on the alignment.
        """
        # For now, the function returns a list rather than iterator.
        # It is yet to see, whether memory performance here becomes critical,
        # or is random access useful.
        rows = []
        for sequence in self.sequences:
            row = util.UserDict()
            row.sequence = sequence
            for column in self.columns:
                if sequence in column:
                    row[column] = column[sequence]
            rows.append(row)
        return rows

    def rows_as_lists(self):
        """Return list of rows (temporary objects) in alignment.

        Each row here is a list of either monomer or None (for gaps).

        Each row has attribute `sequence` pointing to the sequence of row.

        Modifications of row have no effect on the alignment.
        """
        rows = []
        for sequence in self.sequences:
            row = util.UserList()
            row.sequence = sequence
            for column in self.columns:
                row.append(column.get(sequence))
            rows.append(row)
        return rows

    def rows_as_strings(self, gap='-'):
        """Return list of string representation of rows in alignment.

        Each row has attribute `sequence` pointing to the sequence of row.

        `gap` is the symbol to use for gap.
        """
        rows = []
        for sequence in self.sequences:
            # Collect the pieces and join once instead of growing a string.
            chars = []
            for column in self.columns:
                if sequence in column:
                    chars.append(column[sequence].code1)
                else:
                    chars.append(gap)
            string = util.UserString("".join(chars))
            string.sequence = sequence
            rows.append(string)
        return rows

    def row_as_list(self, sequence):
        """Return representation of row as list with `Monomers` and `None`s."""
        return [column.get(sequence) for column in self.columns]

    def row_as_string(self, sequence, gap='-'):
        """Return string representation of row in alignment.

        String will have gaps represented by `gap` symbol (defaults to '-').
        """
        def char(monomer):
            if monomer:
                return monomer.code1
            return gap
        row = self.row_as_list(sequence)
        return "".join(map(char, row))

    def columns_as_lists(self):
        """Return list of columns (temporary objects) in alignment.

        Each column here is a list of either monomer or None (for gaps).

        Items of column are sorted in the same way as alignment.sequences.

        Each returned list has attribute `column` pointing to the column
        it was built from.

        Modifications of column have no effect on the alignment.
        """
        columns = []
        for column in self.columns:
            col = util.UserList()
            col.column = column
            for sequence in self.sequences:
                col.append(column.get(sequence))
            columns.append(col)
        return columns

    # Alignment / Block editing methods
    # =================================

    def flush(self, whence='left'):
        """Remove all gaps from alignment and flush results to one side.

        `whence` must be one of 'left', 'right' or 'center'; any other
        value raises AssertionError.

        Deprecated in favor of `realign` with a processor object.
        """
        # The module has no `deprecated` helper in scope, so the warning is
        # issued through the standard library; stacklevel=2 blames the caller.
        warnings.warn(
            "aln.flush('left') is deprecated in favor of aln.realign(Left())",
            DeprecationWarning,
            stacklevel=2
        )
        if whence == 'left':
            from processors import Left as Flush
        elif whence == 'right':
            from processors import Right as Flush
        elif whence == 'center':
            from processors import Center as Flush
        else:
            # Call form of raise: valid in both Python 2 and Python 3.
            raise AssertionError("Whence must be left, right or center")
        self.realign(Flush())

    def remove_gap_columns(self):
        """Remove all empty columns."""
        # Walk from the right so that deletions do not shift pending indices.
        for n, column in reversed(list(enumerate(self.columns))):
            if not any(seq in column for seq in self.sequences):
                del self.columns[n]

    def _wipe_row(self, sequence):
        """Turn all row positions into gaps (but keep sequences intact)."""
        for column in self.columns:
            if sequence in column:
                del column[sequence]

    def _replace_column_contents(self, new):
        """Replace column contents with those of `new` alignment.

        In other words: copy gap patterns from `new` to `self`.

        `self.sequences` and `new.sequences` should have the same contents.
        """
        for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
            sequence = row.sequence
            # Explicit lists: len() is taken below, and filter() is lazy
            # on Python 3.
            monomers = [monomer for monomer in row if monomer]
            new_monomers = [monomer for monomer in new_row if monomer]
            assert len(monomers) == len(new_monomers)
            self._wipe_row(sequence)
            self._pad_to_width(len(new_row))
            non_gap_columns = [column
                for column, monomer in zip(self.columns, new_row)
                if monomer
            ]
            assert len(monomers) == len(non_gap_columns)
            for monomer, column in zip(monomers, non_gap_columns):
                column[sequence] = monomer

    def realign(self, function):
        """Realign self.

        * apply function to self to produce a new alignment,
        * update self to have the same gap patterns as the new alignment.
        """
        new = function(self)
        self._replace_column_contents(new)
class Column(dict):
    """One column of an alignment.

    A column maps { sequence : monomer }.

    A sequence that has a gap at this position simply has no key in the
    column.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    def __hash__(self):
        """Hash the column by object identity."""
        identity = id(self)
        return identity
class Block(Alignment):
    """Block of alignment.

    Block is an intersection of several rows & columns. (The collections of
    rows and columns are represented as ordered lists, to retain display order
    of Alignment or add ability to tweak it). Most of blocks look like
    rectangular part of alignment if you shuffle alignment rows the right way.
    """

    alignment = None
    """Alignment the block belongs to."""

    sequences = ()
    """List of sequences in block."""

    columns = ()
    """List of columns in block."""

    @classmethod
    def from_alignment(cls, alignment, sequences=None, columns=None):
        """Build new block from alignment.

        If sequences are not given, the block uses all sequences in alignment.

        If columns are not given, the block uses all columns in alignment.

        In both cases we use exactly the list used in alignment, thus, if new
        sequences or columns are added to alignment, the block tracks this too.
        """
        if sequences is None:
            sequences = alignment.sequences
        if columns is None:
            columns = alignment.columns
        block = cls()
        block.alignment = alignment
        block.sequences = sequences
        block.columns = columns
        return block

    def _append_columns(self, n, columns):
        """Insert list of `columns` after position `n`.

        When the block keeps a column list of its own, the columns are also
        inserted into the parent alignment, right after the column that sits
        at block position `n`.

        Raises ValueError if that column is not found in the parent
        alignment.
        """
        assert len(self.columns) != 0, "Can't append columns to an empty Block"
        target = self.columns[n]
        if self.columns is not self.alignment.columns:
            me = None
            for k, column in enumerate(self.alignment.columns):
                # Identity test on purpose: Column is a dict, so `==` (and
                # therefore list.index) would compare contents instead.
                if column is target:
                    me = k
            if me is None:
                # Previously this case surfaced as an unbound local name.
                raise ValueError(
                    "Block column is not present in the parent alignment"
                )
            self.alignment._append_columns(me, columns)
        self.columns[n+1:n+1] = columns
class Markup(object):
    """Common base of sequence markups and alignment markups.

    A sequence or an alignment is called a container here; its monomers or
    columns, respectively, are called elements.

    A markup acts as a dictionary of [element] -> value.

    Each container keeps a dictionary of [name] -> markup. It is Markup's
    responsibility to add itself to this dictionary and to avoid collisions
    while doing it.
    """

    name = None
    """Name of markup elements."""

    save = True
    """If set to false, fileio should not save this markup."""

    def __init__(self, container, name, **kwargs):
        """Store `name`, verify the call came from a container, refresh.

        Users should never create markups themselves; Sequence or Alignment
        does it for them, passing `caller='container'`.
        """
        self.name = name
        caller = kwargs.get('caller')
        assert caller == 'container', "Improper call"
        self.refresh()

    def refresh(self):
        """Recompute automatically generated values; a no-op here."""
        pass

    def remove(self):
        """Erase the traces of the markup. Do not call this yourself!"""
        pass

    @classmethod
    def from_record(cls, container, record, name=None):
        """Restore a markup from `record` (used when loading from file).

        `record` is a dict with all metadata and data of one markup. All its
        keys and values are strings, the markup must parse them itself.

        Markup values should live in `record['markup']`, a list of items
        separated with either `record['separator']` or a comma.

        This base implementation only adds an empty markup of this class
        to `container`.
        """
        return container.add_markup(name, markup_class=cls)

    def to_record(self, keys=None):
        """Dump the markup into a `record` dict for saving to file.

        See `from_record` for the description of `record`.

        If `keys` is given, output should be restricted to these keys.
        This base implementation returns an empty dict.
        """
        return dict()

    def sorted_keys(self):
        """Return the container elements in proper order (abstract)."""
        raise NotImplementedError()

    def sorted_values(self, **kw):
        """Yield markup values in the order of `sorted_keys()`.

        Possible arguments:

        - `map` -- a function, applied to each existing value
        - `default` -- a value to yield for non-existing values

        Without `default` the lookup fails on the first element that has
        no value.
        """
        has_default = 'default' in kw
        fallback = kw.get('default')
        transform = kw.get('map', lambda x: x)
        for element in self.sorted_keys():
            if element not in self and has_default:
                yield fallback
                continue
            yield transform(self[element])
class SequenceMarkup(Markup):
    """Markup for sequence.

    Behaves like a dictionary of [monomer] -> value. Value may be anything
    or something specific, depending on subclass.

    Actual values are stored in monomers themselves as attributes named
    after the markup.
    """

    kind = 'sequence_markup'

    def __init__(self, sequence, name, **kwargs):
        """Remember the owning `sequence`, then do the common Markup setup."""
        self.sequence = sequence
        Markup.__init__(self, sequence, name, **kwargs)

    def remove(self):
        """Remove the traces of markup object. Do not call this yourself!

        Monomers that carry no value for this markup are skipped.
        """
        for monomer in self.sequence:
            # A markup may be only partially filled, and the same monomer
            # object may occur more than once; an unconditional delete
            # would raise AttributeError in both cases.
            if monomer in self:
                del self[monomer]

    def sorted_keys(self):
        """Return list of monomers."""
        return self.sequence

    def get(self, key, value=None):
        """Part of Mapping collection interface."""
        if key not in self:
            return value
        return self[key]

    def __contains__(self, monomer):
        """Part of Mapping collection interface."""
        return hasattr(monomer, self.name)

    def __getitem__(self, monomer):
        """Part of Mapping collection interface."""
        return getattr(monomer, self.name)

    def __setitem__(self, monomer, value):
        """Part of Mapping collection interface."""
        return setattr(monomer, self.name, value)

    def __delitem__(self, monomer):
        """Part of Mapping collection interface."""
        return delattr(monomer, self.name)
class AlignmentMarkup(dict, Markup):
    """Markup for alignment.

    A dictionary of [column] -> value. What the value is depends on the
    subclass: it may be anything or something specific.
    """

    kind = 'alignment_markup'

    def __init__(self, alignment, name, **kwargs):
        """Remember the owning `alignment`, then do the common Markup setup."""
        self.alignment = alignment
        Markup.__init__(self, alignment, name, **kwargs)

    def sorted_keys(self):
        """Return the columns of the alignment, in alignment order."""
        owner = self.alignment
        return owner.columns
690 # vim: set ts=4 sts=4 sw=4 et: