Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/afed1fd8920c/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 20:41:19 2013
Кодировка:
allpy: afed1fd8920c allpy/base.py

allpy

view allpy/base.py @ 1091:afed1fd8920c

Added backreferences to `Sequence`s from `Monomer`s (closes #49) WARNING! Please note that `Sequence` API almost changed entirely! WARNING! This commit immediately obsoletes classmethods `Monomer.from_code*`, `Monomer.from_name` and `Sequence.from_monomers`. Turns out, python cannot pickle sets/dicts which have keys, which indirectly reference the set/dict itself: http://bugs.python.org/issue9269 -- which is exactly what we have in abundance after this change. To allow pickling added `__getstate__` to `Monomer` to return all attributes, except `sequence` and `__setstate__` to `Sequence`, which runs through all monomers and returns the `sequence` attribute back to where it belongs. WARNING! This MAY result in unexpected behaviour in some cases. (Which should be rare enough).
author Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date Sat, 02 Jun 2012 19:33:42 +0400
parents 4b1a6a7bbeea
children 41a167bbf150
line source
1 import sys
2 import re
4 import util
5 import fileio
6 import data.monomers
8 # import this very module as means of having all related classes in one place
9 import base
11 default_gaps = ("-", ".", "~")
12 """Set of characters to recoginze as gaps when parsing alignment."""
class Monomer(object):
    """A single monomer (residue) of a sequence.

    Concrete monomer kinds are classes generated by `_subclass`; they are
    registered in the `by_code1`, `by_code3` and `by_name` lookup tables.
    """

    type = None
    """Either of 'dna', 'rna', 'protein'."""

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    by_code1 = {}
    """A mapping from 1-letter code to Monomer subclass."""

    by_code3 = {}
    """A mapping from 3-letter code to Monomer subclass."""

    by_name = {}
    """A mapping from full monomer name to Monomer subclass."""

    sequence = None
    """The sequence the monomer belongs to."""

    @classmethod
    def _subclass(cls, name='', code1='', code3='', is_modified=False):
        """Create and register a new subclass of `cls` for one monomer kind.

        - `name` is the full monomer name (stripped and capitalized here)
        - `code1` and `code3` are the one- and three-letter codes
          (upper-cased here)
        - `is_modified` marks modified monomers: they are never registered
          in `by_code1` and never replace an existing `by_code3` entry
        """
        class TheMonomer(cls):
            pass
        name = name.strip().capitalize()
        code1 = code1.upper()
        code3 = code3.upper()
        module = vars(data.monomers)[cls.type]
        TheMonomer.__name__ = re.sub(r"\W", "_", name)
        TheMonomer.__module__ = module.__name__
        TheMonomer.name = name
        TheMonomer.code1 = code1
        TheMonomer.code3 = code3
        TheMonomer.is_modified = is_modified
        # Save the class in data.monomers so that it can be pickled.
        # Some names are not unique, we append underscores to them
        # in order to fix it.
        while TheMonomer.__name__ in vars(module):
            TheMonomer.__name__ += "_"
        vars(module)[TheMonomer.__name__] = TheMonomer
        if not is_modified:
            cls.by_code1[code1] = TheMonomer
        if code3 not in cls.by_code3 or not is_modified:
            cls.by_code3[code3] = TheMonomer
        cls.by_name[name] = TheMonomer
        # We duplicate distinguished long names into Monomer itself, so that
        # Monomer.by_code3 / Monomer.by_name can be used to find the relevant
        # type of monomer without knowing the monomer type in advance.
        if code3 not in Monomer.by_code3 or not is_modified:
            Monomer.by_code3[code3] = TheMonomer
        Monomer.by_name[name] = TheMonomer

    @classmethod
    def _initialize(cls, codes=None):
        """Create all relevant subclasses of Monomer.

        `codes` is an iterable of `(code1, is_modified, code3, name)` tuples.
        When it is omitted (or None) nothing is created.
        """
        # `codes or ()` makes the declared default usable: iterating over
        # None would raise TypeError.
        for code1, is_modified, code3, name in codes or ():
            cls._subclass(name, code1, code3, is_modified)

    def __repr__(self):
        return "<Monomer %s>" % str(self.code1)

    def __str__(self):
        """Return the one-letter code."""
        return self.code1

    def __eq__(self, other):
        """Monomers within same monomer type are compared by code1.

        A false `other` (e.g. None, which stands for a gap) is never equal
        to a monomer. Comparing monomers of different types is an error.
        """
        # NOTE(review): on Python 3 defining __eq__ without __hash__ makes
        # instances unhashable; on Python 2 they stay hashable by identity.
        if not other:
            return False
        assert self.type == other.type
        return self.code1 == other.code1

    def __ne__(self, other):
        return not (self == other)

    def __getstate__(self):
        """Overcome difficulties with pickle.

        Pickle is unable to store `set`s/`dict`s that have objects referencing
        back the `set`/`dict` itself, which `sequence` in monomer does.
        ( http://bugs.python.org/issue9269 )

        To sidestep the bug we store the monomer WITHOUT `sequence` attribute.

        See also `Sequence.__setstate__`.
        """
        state = dict(vars(self))
        state.pop('sequence', None)
        return state

    def _obsolete_method(cls, *args, **kws):
        """OBSOLETE"""
        raise AttributeError("Call to obsolete method.")

    # The old constructors are kept only to fail loudly when called.
    from_code1 = classmethod(_obsolete_method)
    from_code3 = classmethod(_obsolete_method)
    from_name = classmethod(_obsolete_method)
class MarkupContainerMixin(object):
    """Markup bookkeeping shared by sequences and alignments.

    The host class must provide a `kind` attribute and call `_init` from its
    own `__init__`.
    """

    def _init(self):
        """Create the empty `markups` registry; call from the host __init__."""
        self.markups = {}

    def add_markup(self, name, markup_class=None, use_existing=False, **kws):
        """Create a markup, register it under `name` and return it.

        - `name` is the key of the markup in `self.markups`
        - `markup_class`, when given, is the class to instantiate; when
          omitted the class is looked up in `markups.by_name` by the
          container kind and `name`
        - with a true `use_existing` an already registered markup of that
          name is returned unchanged instead of being an error
        - any further keyword arguments go to the markup constructor

        User markups need both `name` and `markup_class`; for the standard
        automatic markups `name` alone is enough.
        """
        # `markups` subclasses things from `base`, so importing it at module
        # level would create an import loop; import it lazily instead.
        import markups as markups_module
        if markup_class is None:
            lookup_key = (self.kind + "_" + "markup", name)
            markup_class = markups_module.by_name[lookup_key]
        if use_existing and name in self.markups:
            existing = self.markups[name]
            assert existing.__class__ is markup_class
            return existing
        assert name not in self.markups
        created = markup_class(self, name, caller='container', **kws)
        self.markups[name] = created
        return created

    def remove_markup(self, name):
        """Let the markup registered as `name` clean up, then forget it."""
        doomed = self.markups[name]
        doomed.remove()
        del self.markups[name]
class Sequence(list, MarkupContainerMixin):
    """A list of monomers with some descriptive metadata.

    Besides the regular list behaviour a Sequence carries:

    * name -- str with the name of the sequence
    * description -- str with description of the sequence
    * source -- str denoting source of the sequence

    Each of them may be an empty string.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    kind = 'sequence'
    """Description of object kind."""

    name = ''
    """Sequence identifier."""

    description = ''
    """Detailed sequence description."""

    source = ''
    """Sequence source."""

    def __init__(self, sequence=(), name='', description='', source=''):
        list.__init__(self, sequence)
        MarkupContainerMixin._init(self)

        self.name = name
        self.description = description
        self.source = source

    def append_monomer(self, code1=None, code3=None, name=None):
        """Create a monomer, add it to the end of the sequence and return it.

        Exactly one of `code1`, `code3` and `name` selects the monomer class.
        """
        given = bool(code1) + bool(code3) + bool(name)
        assert given == 1, \
            "Please specify exactly one of: code1, code3, name"
        registry = self.types.Monomer
        if code1:
            monomer_class = registry.by_code1[code1.upper()]
        elif code3:
            monomer_class = registry.by_code3[code3.upper()]
        elif name:
            monomer_class = registry.by_name[name.strip().capitalize()]
        created = monomer_class()
        created.sequence = self
        # Remember the one-letter code exactly as it was passed in.
        created.input_code1 = code1
        self.append(created)
        return created

    @classmethod
    def from_string(cls, string, name='', description='', source=''):
        """Build a sequence from a string of one-letter codes."""
        result = cls([], name=name, description=description, source=source)
        for code in string:
            result.append_monomer(code1=code)
        return result

    def __repr__(self):
        # Prefer the name; fall back to the one-letter string of monomers.
        label = self.name if self.name else self
        return '<Sequence %s>' % str(label)

    def __str__(self):
        """Return the sequence as a string of one-letter codes."""
        return ''.join([monomer.code1 for monomer in self])

    def __hash__(self):
        """Hash sequence by identity."""
        return id(self)

    def __setstate__(self, state):
        """Restore attributes after unpickling and re-link the monomers.

        Pickle is unable to store `set`s/`dict`s that have objects referencing
        back the `set`/`dict` itself, which `sequence` in monomer does.
        ( http://bugs.python.org/issue9269 )

        Monomers are therefore pickled WITHOUT their `sequence` attribute,
        and it is put back here.

        See also `Monomer.__getstate__`.
        """
        self.__dict__.update(state)
        for member in self:
            member.sequence = self

    @classmethod
    def from_monomers(cls, *args, **kws):
        """OBSOLETE."""
        raise AttributeError("Sequence.from_monomers is obsolete")
class Alignment(MarkupContainerMixin):
    """Alignment. It is a list of Columns."""

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    sequences = None
    """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

    kind = 'alignment'
    """Description of object kind."""

    def __init__(self):
        """Initialize empty alignment."""
        self.sequences = []
        self.columns = []
        MarkupContainerMixin._init(self)

    # Alignment grow & IO methods
    # ==============================

    def append_sequence(self, sequence):
        """Add sequence to alignment. Return self.

        The sequence is placed without gaps starting from the first column;
        the alignment is widened with empty columns if it is too narrow.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(sequence))
        for column, monomer in zip(self.columns, sequence):
            column[sequence] = monomer
        return self

    def append_row_from_string(self, string,
            name='', description='', source='', gaps=default_gaps):
        """Add row from a string of one-letter codes and gaps. Return self.

        Characters found in `gaps` become gaps, every other character
        becomes a monomer of a newly created sequence.
        """
        Sequence = self.types.Sequence
        without_gaps = util.remove_each(string, gaps)
        sequence = Sequence.from_string(without_gaps, name, description, source)
        self._pad_to_width(len(string))
        non_gap_columns = [column
            for column, char in zip(self.columns, string)
            if char not in gaps
        ]
        for monomer, column in zip(sequence, non_gap_columns):
            column[sequence] = monomer
        self.sequences.append(sequence)
        return self

    def append_row_with_gaps(self, row, sequence):
        """Add row from row_as_list representation and sequence. Return self.

        `row` holds a monomer or a false value (gap) for each column.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(row))
        for column, monomer in zip(self.columns, row):
            if monomer:
                column[sequence] = monomer
        return self

    def _append_columns(self, n, columns):
        """Insert list of `columns` after position `n`."""
        self.columns[n+1:n+1] = columns

    def _pad_to_width(self, n):
        """Pad alignment with empty columns on the right to width n."""
        columns = [self.types.Column() for _ in range(len(self.columns), n)]
        self._append_columns(len(self.columns)-1, columns)

    def append_file(self, file, format='fasta', gaps=default_gaps):
        """Append sequences from file to alignment. Return self.

        If sequences in file have gaps (detected as characters belonging to
        `gaps` set), treat them accordingly.
        """
        fileio.File(file, format, gaps=gaps).read_alignment(self)
        return self

    def to_file(self, file, format='fasta', gap='-'):
        """Write alignment to file in the given `format`. Return self.

        `gap` is handed to the file writer as its `gaps` argument.
        """
        fileio.File(file, format, gaps=gap).write_alignment(self)
        return self

    # Data access methods for alignment
    # =================================

    def rows(self):
        """Return list of rows (temporary objects) in alignment.

        Each row is a dictionary of { column : monomer }.

        For gap positions there is no key for the column in row.

        Each row has attribute `sequence` pointing to the sequence the row is
        describing.

        Modifications of row have no effect on the alignment.
        """
        # For now, the function returns a list rather than iterator.
        # It is yet to see, whether memory performance here becomes critical,
        # or is random access useful.
        rows = []
        for sequence in self.sequences:
            row = util.UserDict()
            row.sequence = sequence
            for column in self.columns:
                if sequence in column:
                    row[column] = column[sequence]
            rows.append(row)
        return rows

    def rows_as_lists(self):
        """Return list of rows (temporary objects) in alignment.

        Each row here is a list of either monomer or None (for gaps).

        Each row has attribute `sequence` pointing to the sequence of row.

        Modifications of row have no effect on the alignment.
        """
        rows = []
        for sequence in self.sequences:
            row = util.UserList()
            row.sequence = sequence
            for column in self.columns:
                row.append(column.get(sequence))
            rows.append(row)
        return rows

    def rows_as_strings(self, gap='-'):
        """Return list of string representation of rows in alignment.

        Each row has attribute `sequence` pointing to the sequence of row.

        `gap` is the symbol to use for gap.
        """
        rows = []
        for sequence in self.sequences:
            # Collect the pieces and join once instead of growing a string.
            codes = [
                column[sequence].code1 if sequence in column else gap
                for column in self.columns
            ]
            string = util.UserString("".join(codes))
            string.sequence = sequence
            rows.append(string)
        return rows

    def row_as_list(self, sequence):
        """Return representation of row as list with `Monomers` and `None`s."""
        return [column.get(sequence) for column in self.columns]

    def row_as_string(self, sequence, gap='-'):
        """Return string representation of row in alignment.

        String will have gaps represented by `gap` symbol (defaults to '-').
        """
        def char(monomer):
            if monomer:
                return monomer.code1
            return gap
        row = self.row_as_list(sequence)
        return "".join(map(char, row))

    def columns_as_lists(self):
        """Return list of columns (temporary objects) in alignment.

        Each column here is a list of either monomer or None (for gaps).

        Items of column are sorted in the same way as alignment.sequences.

        Each returned list has attribute `column` pointing to the original
        column object.

        Modifications of column have no effect on the alignment.
        """
        columns = []
        for column in self.columns:
            col = util.UserList()
            col.column = column
            for sequence in self.sequences:
                col.append(column.get(sequence))
            columns.append(col)
        return columns

    # Alignment / Block editing methods
    # =================================

    def flush(self, whence='left'):
        """Remove all gaps from alignment and flush results to one side.

        `whence` must be one of 'left', 'right' or 'center'; anything else
        raises AssertionError.

        Deprecated: emits a DeprecationWarning, use `realign` directly.
        """
        # The module neither defines nor imports a `deprecated` helper, so
        # the warning is issued through the standard library.
        import warnings
        warnings.warn(
            "aln.flush('left') is deprecated in favor of aln.realign(Left())",
            DeprecationWarning,
            stacklevel=2
        )
        if whence == 'left':
            from processors import Left as Flush
        elif whence == 'right':
            from processors import Right as Flush
        elif whence == 'center':
            from processors import Center as Flush
        else:
            raise AssertionError("Whence must be left, right or center")
        self.realign(Flush())

    def remove_gap_columns(self):
        """Remove all empty columns."""
        # Slice assignment keeps the very same list object, which may be
        # shared with other objects (see `Block.from_alignment`).
        self.columns[:] = [
            column for column in self.columns
            if any(seq in column for seq in self.sequences)
        ]

    def _wipe_row(self, sequence):
        """Turn all row positions into gaps (but keep sequences intact)."""
        for column in self.columns:
            if sequence in column:
                del column[sequence]

    def _replace_column_contents(self, new):
        """Replace column contents with those of `new` alignment.

        In other words: copy gap patterns from `new` to `self`.

        `self.sequences` and `new.sequences` should have the same contents.
        """
        for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
            sequence = row.sequence
            # List comprehensions instead of filter(): len() of a filter
            # result only works where filter returns a list.
            monomers = [monomer for monomer in row if monomer]
            new_monomers = [monomer for monomer in new_row if monomer]
            assert len(monomers) == len(new_monomers)
            self._wipe_row(sequence)
            self._pad_to_width(len(new_row))
            non_gap_columns = [column
                for column, monomer in zip(self.columns, new_row)
                if monomer
            ]
            assert len(monomers) == len(non_gap_columns)
            for monomer, column in zip(monomers, non_gap_columns):
                column[sequence] = monomer

    def realign(self, function):
        """Realign self.

        * apply function to self to produce a new alignment,
        * update self to have the same gap patterns as the new alignment.
        """
        new = function(self)
        self._replace_column_contents(new)
class Column(dict):
    """One column of an alignment.

    A column is a dict of { sequence : monomer }.

    A sequence that has a gap in this column simply has no key in the dict.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    def __hash__(self):
        """Hash the column by identity, so it can serve as a dict key."""
        identity = id(self)
        return identity
class Block(Alignment):
    """Block of alignment.

    Block is an intersection of several rows & columns. (The collections of
    rows and columns are represented as ordered lists, to retain display order
    of Alignment or add ability to tweak it). Most of blocks look like
    rectangular part of alignment if you shuffle alignment rows the right way.
    """

    alignment = None
    """Alignment the block belongs to."""

    sequences = ()
    """List of sequences in block."""

    columns = ()
    """List of columns in block."""

    @classmethod
    def from_alignment(cls, alignment, sequences=None, columns=None):
        """Build new block from alignment.

        If sequences are not given, the block uses all sequences in alignment.

        If columns are not given, the block uses all columns in alignment.

        In both cases we use exactly the list used in alignment, thus, if new
        sequences or columns are added to alignment, the block tracks this too.
        """
        if sequences is None:
            sequences = alignment.sequences
        if columns is None:
            columns = alignment.columns
        block = cls()
        block.alignment = alignment
        block.sequences = sequences
        block.columns = columns
        return block

    def _append_columns(self, n, columns):
        """Insert list of `columns` after position `n`.

        The columns are inserted both into the block and, unless the block
        shares its column list with the parent alignment, into the parent
        alignment next to the corresponding block column. `n == -1` means
        "before the first column of the block".

        Raises ValueError if the block column used as the insertion anchor
        is not a column of the parent alignment.
        """
        assert len(self.columns) != 0, "Can't append columns to an empty Block"
        # Indexing up front keeps the IndexError for an out-of-range `n`.
        anchor = self.columns[n]
        offset = 0
        if n == -1:
            # Negative indexing would pick the LAST block column here, so the
            # parent would get the columns at the wrong place; anchor on the
            # first block column and insert just before it instead.
            anchor = self.columns[0]
            offset = -1
        if self.columns is not self.alignment.columns:
            position = None
            for k, column in enumerate(self.alignment.columns):
                if column is anchor:
                    # No break: the last identical column wins.
                    position = k
            if position is None:
                raise ValueError(
                    "Block column does not belong to the parent alignment")
            self.alignment._append_columns(position + offset, columns)
        self.columns[n+1:n+1] = columns
class Markup(object):
    """Common base of sequence markups and alignment markups.

    A container is either a sequence or an alignment; its elements are
    monomers or columns respectively.

    A markup acts as a mapping of [element] -> value.

    Each container keeps a dictionary of [name] -> markup. It is Markup's
    responsibility to add itself to this dictionary and to avoid collisions
    while doing it.
    """

    name = None
    """Name of markup elements."""

    save = True
    """If set to false, fileio should not save this markup."""

    def __init__(self, container, name, **kwargs):
        """Store `name`, check the call came from a container, then refresh.

        Markups should never be created by the user. They are created by
        Sequence or Alignment, which pass `caller='container'`.
        """
        self.name = name
        caller = kwargs.get('caller')
        assert caller == 'container', "Improper call"
        self.refresh()

    def refresh(self):
        """Recompute automatically generated values; a no-op by default."""
        pass

    def remove(self):
        """Erase what the markup left behind. Do not call this yourself!"""
        pass

    @classmethod
    def from_record(cls, container, record, name=None):
        """Restore markup from `record`. (Used for loading from file).

        `record` is a dict of all metadata and data related to one markup. All
        keys and values in `record` are strings, markup must parse them itself.

        Markup values should be stored in `record['markup']`, which is a list
        of items separated with either `record['separator']` or a comma.

        This base implementation only creates an empty markup of this class
        in `container`.
        """
        restored = container.add_markup(name, markup_class=cls)
        return restored

    def to_record(self):
        """Return the `record` dict to be written to file.

        See `from_record` for what a `record` is.
        """
        record = {}
        return record

    def sorted_keys(self):
        """Return the container elements in order; subclasses implement it."""
        raise NotImplementedError()

    def sorted_values(self, **kw):
        """Yield markup values in the order of `sorted_keys`.

        Possible arguments:

        - `map` -- a function, applied to each existing value
        - `default` -- a value to yield for non-existing values

        Without `default` the lookup of a missing value fails.
        """
        has_default = 'default' in kw
        fallback = kw.get('default')
        transform = kw.get('map', lambda x: x)
        for element in self.sorted_keys():
            if element in self or not has_default:
                yield transform(self[element])
            else:
                yield fallback
class SequenceMarkup(Markup):
    """Markup for sequence.

    Behaves like a dictionary of [monomer] -> value. Value may be anything
    or something specific, depending on subclass.

    Actual values are stored in monomers themselves as attributes named
    after the markup.
    """

    kind = 'sequence_markup'

    def __init__(self, sequence, name, **kwargs):
        # `sequence` must be set first: Markup.__init__ calls refresh().
        self.sequence = sequence
        Markup.__init__(self, sequence, name, **kwargs)

    def remove(self):
        """Remove the traces of markup object. Do not call this yourself!"""
        for monomer in self.sequence:
            # A markup may have values only on some monomers; a monomer
            # without one has nothing to delete and must not abort the
            # clean-up half-way.
            try:
                del self[monomer]
            except AttributeError:
                pass

    def sorted_keys(self):
        """Return list of monomers."""
        return self.sequence

    def get(self, key, value=None):
        """Part of Mapping collection interface."""
        if key not in self:
            return value
        return self[key]

    def __contains__(self, monomer):
        """Part of Mapping collection interface."""
        return hasattr(monomer, self.name)

    def __getitem__(self, monomer):
        """Part of Mapping collection interface."""
        return getattr(monomer, self.name)

    def __setitem__(self, monomer, value):
        """Part of Mapping collection interface."""
        return setattr(monomer, self.name, value)

    def __delitem__(self, monomer):
        """Part of Mapping collection interface."""
        return delattr(monomer, self.name)
class AlignmentMarkup(dict, Markup):
    """Markup for alignment.

    A dictionary of [column] -> value. What a value is depends on the
    subclass.
    """

    kind = 'alignment_markup'

    def __init__(self, alignment, name, **kwargs):
        # Keep the alignment before Markup.__init__ runs refresh().
        self.alignment = alignment
        Markup.__init__(self, alignment, name, **kwargs)

    def sorted_keys(self):
        """Return the alignment's list of columns."""
        owner = self.alignment
        return owner.columns
702 # vim: set ts=4 sts=4 sw=4 et: