allpy: allpy/base.py annotate

allpy

annotate allpy/base.py @ 378:dd94230c6f08

fixed bugs; usecase1.py works (see #23) (see #1)

author	boris <bnagaev@gmail.com>
date	Tue, 01 Feb 2011 17:11:33 +0300
parents	166806efc570
children	5639138f619a

rev	line source
me@261	1 import sys
bnagaev@357	2 import re
me@261	3
me@315	4 import util
me@284	5 import fasta
me@260	6
default_gaps = set([".", "-", "~"])
"""Set of characters to recognize as gaps when parsing alignment."""
me@306	9
class Monomer(object):
    """Monomer object.

    Concrete monomer kinds are subclasses created by `_subclass` and
    registered in the `by_code1` / `by_code3` / `by_name` lookup tables.
    """

    type = None
    """Either of 'dna', 'rna', 'protein'."""

    by_code1 = {}
    """A mapping from 1-letter code to Monomer subclass."""

    by_code3 = {}
    """A mapping from 3-letter code to Monomer subclass."""

    by_name = {}
    """A mapping from full monomer name to Monomer subclass."""

    @classmethod
    def _subclass(cls, name='', code1='', code3='', is_modified=False):
        """Create new subclass of `cls` for given monomer and register it.

        `name` is stripped and capitalized, `code1` and `code3` are
        upper-cased before being stored on the new class and used as keys.

        Modified monomers are not registered in `by_code1`, so the 1-letter
        code keeps pointing at the unmodified monomer.

        Return the newly created subclass.
        """
        class TheMonomer(cls):
            pass
        name = name.strip().capitalize()
        code1 = code1.upper()
        code3 = code3.upper()
        # Class name must be an identifier-like string: replace anything
        # that is not a word character with an underscore.
        TheMonomer.__name__ = re.sub(r"[^\w]", "_", name)
        TheMonomer.name = name
        TheMonomer.code1 = code1
        TheMonomer.code3 = code3
        TheMonomer.is_modified = is_modified
        if not is_modified:
            cls.by_code1[code1] = TheMonomer
        cls.by_code3[code3] = TheMonomer
        cls.by_name[name] = TheMonomer
        # We duplicate distinguished long names into Monomer itself, so that we
        # can use Monomer.from_code3 to create the relevant type of monomer.
        Monomer.by_code3[code3] = TheMonomer
        Monomer.by_name[name] = TheMonomer
        return TheMonomer

    @classmethod
    def _initialize(cls, codes=None):
        """Create all relevant subclasses of Monomer.

        `codes` is an iterable of `(code1, is_modified, code3, name)` tuples.
        If it is None (the default), nothing is created.
        """
        for code1, is_modified, code3, name in codes or ():
            cls._subclass(name, code1, code3, is_modified)

    @classmethod
    def from_code1(cls, code1):
        """Create new monomer from 1-letter code (case-insensitive).

        Raise KeyError if the code is not registered.
        """
        return cls.by_code1[code1.upper()]()

    @classmethod
    def from_code3(cls, code3):
        """Create new monomer from 3-letter code (case-insensitive).

        Raise KeyError if the code is not registered.
        """
        return cls.by_code3[code3.upper()]()

    @classmethod
    def from_name(cls, name):
        """Create new monomer from full name.

        The name is stripped and capitalized before lookup.
        Raise KeyError if the name is not registered.
        """
        return cls.by_name[name.strip().capitalize()]()

    def __repr__(self):
        return '<Monomer %s>' % self.code3

    def __str__(self):
        """Returns one-letter code"""
        return self.code1

    def __eq__(self, other):
        """Monomers within same monomer type are compared by code1.

        Comparison with an object that is not a Monomer is left to the
        other operand (NotImplemented), so `monomer == None` is False
        rather than an AttributeError.
        """
        if not isinstance(other, Monomer):
            return NotImplemented
        assert self.type == other.type
        return self.code1 == other.code1

    def __ne__(self, other):
        """Inverse of `__eq__`.

        Python 2 does not derive `!=` from `__eq__`; without this method
        `a != b` would compare monomers by identity.
        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
bnagaev@239	81
class Sequence(list):
    """Sequence of Monomers.

    This behaves like list of monomer objects. In addition to standard list
    behaviour, Sequence has the following attributes:

    * name -- str with the name of the sequence
    * description -- str with description of the sequence
    * source -- str denoting source of the sequence

    Any of them may be empty (i.e. hold empty string)

    Class attributes:

    * monomer_type -- type of monomers in sequence, must be redefined when
      subclassing
    """

    monomer_type = Monomer

    name = ''
    description = ''
    source = ''

    @classmethod
    def from_monomers(cls, monomers=(), name=None, description=None,
                      source=None):
        """Create sequence from an iterable of monomer objects.

        `name`, `description` and `source` are set on the new sequence only
        when they are non-empty; otherwise the class-level defaults (empty
        strings) stay in effect.
        """
        result = cls(monomers)
        if name:
            result.name = name
        if description:
            result.description = description
        if source:
            result.source = source
        return result

    @classmethod
    def from_string(cls, string, name='', description='', source=''):
        """Create sequence from string of one-letter codes.

        Every character is converted with `cls.monomer_type.from_code1`.
        """
        monomer = cls.monomer_type.from_code1
        monomers = [monomer(letter) for letter in string]
        return cls.from_monomers(monomers, name, description, source)

    def __repr__(self):
        return '<Sequence %s>' % str(self)

    def __str__(self):
        """Returns sequence of one-letter codes."""
        return ''.join(monomer.code1 for monomer in self)

    def __hash__(self):
        """Hash sequence by identity."""
        return id(self)
me@316	135
class Alignment(object):
    """Alignment. It is a list of Columns.

    Each column maps a sequence to the monomer that sequence has in the
    column; a gap is represented by the sequence being absent from the column.
    """

    sequence_type = Sequence
    """Type of sequences in alignment. SHOULD be redefined when subclassing."""

    sequences = None
    """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

    def __init__(self):
        """Initialize empty alignment."""
        self.sequences = []
        self.columns = []

    # Alignment grow & IO methods
    # ==============================

    def append_sequence(self, sequence):
        """Add sequence to alignment. Return self.

        If sequence is too short, pad it with gaps on the right.
        """
        self.sequences.append(sequence)
        # Gaps are absent keys, so the right-hand padding needs no action:
        # columns past the end of the sequence simply get no entry for it.
        for i, monomer in enumerate(sequence):
            self._column_at(i)[sequence] = monomer
        return self

    def append_row_from_string(self, string,
            name='', description='', source='', gaps=default_gaps):
        """Add row from a string of one-letter codes and gaps. Return self.

        Characters belonging to `gaps` become gaps; the remaining characters
        are turned into a sequence of `self.sequence_type`.
        """
        without_gaps = util.remove_each(string, gaps)
        sequence = self.sequence_type.from_string(
            without_gaps, name, description, source)
        # Positions in the alignment (j) of characters that are not gaps.
        # The i-th such position holds the i-th monomer of the sequence.
        positions = [j for j, char in enumerate(string) if char not in gaps]
        for i, j in enumerate(positions):
            self._column_at(j)[sequence] = sequence[i]
        self.sequences.append(sequence)
        return self

    def _column_at(self, n):
        """Return column by index. Create new columns if required."""
        for _ in range(len(self.columns), n + 1):
            self.columns.append(Column())
        return self.columns[n]

    def append_file(self, file, format='fasta', gaps=default_gaps):
        """Append sequences from file to alignment. Return self.

        If sequences in file have gaps (detected as characters belonging to
        `gaps` set), treat them accordingly.

        The `name` attribute of `file`, if any, is recorded as the source of
        every sequence read.
        """
        assert format == 'fasta', "We don't support other formats yet"
        # File-like objects (e.g. StringIO) have no `name`; use empty source.
        source = getattr(file, 'name', '')
        for (name, description, body) in fasta.parse_file(file):
            self.append_row_from_string(body, name, description, source, gaps)
        return self

    def to_file(self, file, format='fasta'):
        """Write alignment in FASTA file as sequences with gaps."""
        assert format == "fasta", "We don't support other formats yet"
        def char(monomer):
            if monomer:
                return monomer.code1
            return "-"
        for row in self.rows_as_lists():
            seq = row.sequence
            line = "".join(char(monomer) for monomer in row)
            fasta.save_file(file, line, seq.name, seq.description)

    # Data access methods for alignment
    # =================================

    def rows(self):
        """Return list of rows (temporary objects) in alignment.

        Each row is a dictionary of { column : monomer }.

        For gap positions there is no key for the column in row.

        Each row has attribute `sequence` pointing to the sequence the row is
        describing.

        Modifications of row have no effect on the alignment.
        """
        # For now, the function returns a list rather than iterator.
        # It is yet to see, whether memory performance here becomes critical,
        # or is random access useful.
        rows = []
        for sequence in self.sequences:
            row = util.UserDict()
            row.sequence = sequence
            for column in self.columns:
                if sequence in column:
                    row[column] = column[sequence]
            rows.append(row)
        return rows

    def rows_as_lists(self):
        """Return list of rows (temporary objects) in alignment.

        Each row here is a list of either monomer or None (for gaps).

        Each row has attribute `sequence` pointing to the sequence of row.

        Modifications of row have no effect on the alignment.
        """
        rows = []
        for sequence in self.sequences:
            row = util.UserList()
            row.sequence = sequence
            for column in self.columns:
                row.append(column.get(sequence))
            rows.append(row)
        return rows

    def columns_as_lists(self):
        """Return list of columns (temporary objects) in alignment.

        Each column here is a list of either monomer or None (for gaps).

        Items of column are sorted in the same way as alignment.sequences.

        Modifications of column have no effect on the alignment.
        """
        return [
            [column.get(sequence) for sequence in self.sequences]
            for column in self.columns
        ]

    # Alignment / Block editing methods
    # =================================

    def _flush_row(self, row, whence='left'):
        """Helper for `flush`: flush to one side all monomers in one row.

        Return a list as long as `self.columns` holding the monomers of `row`
        in their original order, with None for gaps.

        Raise ValueError if `whence` is not 'left', 'right' or 'center'.
        """
        monomers = [monomer for monomer in row if monomer]
        gap_count = len(self.columns) - len(monomers)
        if whence == 'left':
            return monomers + [None] * gap_count
        if whence == 'right':
            return [None] * gap_count + monomers
        if whence == 'center':
            # When the number of gaps is odd, the extra gap goes to the left.
            right = gap_count // 2
            left = gap_count - right
            return [None] * left + monomers + [None] * right
        raise ValueError("whence must be either 'left' or 'right' or 'center'")

    def flush(self, whence='left'):
        """Remove all gaps from alignment and flush results to one side.

        `whence` must be one of 'left', 'right' or 'center'
        """
        for row in self.rows_as_lists():
            sequence = row.sequence
            row = self._flush_row(row, whence)
            for monomer, column in zip(row, self.columns):
                if monomer:
                    column[sequence] = monomer
                elif sequence in column:
                    del column[sequence]

    def remove_gap_columns(self):
        """Remove all empty columns."""
        # Slice assignment keeps the same list object, so anything sharing
        # `self.columns` sees the change too.
        self.columns[:] = [column for column in self.columns if column]

    def _wipe(self):
        """Make all positions gaps (but keep sequences intact)."""
        for column in self.columns:
            column.clear()

    def _merge(self, dst, new, merge):
        """Replace contents of `dst` with those of `new`.

        Replace contents of elements using function `merge(dst_el, new_el)`.
        Elements of `new` beyond the length of `dst` are appended as they
        are; elements of `dst` beyond the length of `new` are removed.
        """
        for el, new_el in zip(dst, new):
            merge(el, new_el)
        dst[len(dst):] = new[len(dst):]
        del dst[len(new):]

    def _replace_sequence_contents(self, new, copy_descriptions):
        """Replace contents of sequences with those of `new` alignment."""
        # XXX: we manually copy sequence contents here
        # XXX: we only copy, overlapping parts and link to the rest
        def merge_monomers(dst, new):
            dst.__class__ = new.__class__
        def merge_sequences(dst, new):
            if copy_descriptions:
                vars(dst).update(vars(new))
            self._merge(dst, new, merge_monomers)
        self._merge(self.sequences, new.sequences, merge_sequences)

    def _replace_column_contents(self, new):
        """Replace column contents with those of `new` alignment.

        Synonym: copy gap patterns from `new` to `self`.

        `self.sequences` and `new.sequences` should have the same contents.
        """
        # Read rows of `new` before wiping: `new` may share columns with
        # `self` (or be `self`), in which case wiping first would lose them.
        new_rows = new.rows_as_lists()
        self._wipe()
        for sequence, new_row in zip(self.sequences, new_rows):
            assert len(sequence) == len(new_row.sequence)
            # Column indices where `new` has a monomer rather than a gap.
            occupied = [i for i, monomer in enumerate(new_row)
                        if monomer is not None]
            for monomer, i in zip(sequence, occupied):
                self._column_at(i)[sequence] = monomer

    def _replace_contents(self, new, copy_descriptions, copy_contents):
        """Replace alignment contents with those of other alignment."""
        if copy_contents:
            self._replace_sequence_contents(new, copy_descriptions)
        self._replace_column_contents(new)

    def process(self, function, copy_descriptions=True, copy_contents=True):
        """Apply function to the alignment (or block); inject results back.

        - `function(block)` must return block with same line order.
        - if `copy_descriptions` is False, ignore new sequence names.
        - if `copy_contents` is False, don't copy sequence contents too.
        """
        new = function(self)
        self._replace_contents(new, copy_descriptions, copy_contents)
me@371	371
class Column(dict):
    """One column of an alignment.

    A column is a dict of { sequence : monomer }.

    A sequence that has a gap in this column is simply not present among the
    keys of the column.
    """

    def __hash__(self):
        """Hash the column by identity, so it can be used as a dict key."""
        identity = id(self)
        return identity
me@300	384
class Block(Alignment):
    """Block of alignment.

    A block is the intersection of a set of columns with a set of rows. Most
    blocks look like a rectangular part of the alignment once the alignment
    rows are shuffled the right way.
    """

    alignment = None
    """Alignment the block belongs to."""

    sequences = ()
    """List of sequences in block."""

    columns = ()
    """List of columns in block."""

    @classmethod
    def from_alignment(cls, alignment, sequences=None, columns=None):
        """Build new block from alignment.

        When `sequences` is None, the block uses all sequences of the
        alignment; when `columns` is None, it uses all columns.

        In either case the very list object held by the alignment is reused,
        so sequences or columns added to the alignment later show up in the
        block as well.
        """
        block = cls()
        block.alignment = alignment
        block.sequences = alignment.sequences if sequences is None else sequences
        block.columns = alignment.columns if columns is None else columns
        return block
me@270	422
me@260	423 # vim: set ts=4 sts=4 sw=4 et: