Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/annotate/d66d470a9c0a/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Mar 2 07:04:45 2014
Кодировка:
allpy: allpy/base.py annotate

allpy

annotate allpy/base.py @ 381:d66d470a9c0a

Code cleanup in base.Alignment._merge helper for base.Alignment.process
author Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date Tue, 01 Feb 2011 18:46:23 +0300
parents 7b6907d6d927
children df571a5233fb
rev   line source
me@261 1 import sys
bnagaev@357 2 import re
me@261 3
me@315 4 import util
me@284 5 import fasta
me@260 6
me@306 7 default_gaps = set((".", "-", "~"))
me@306 8 """Set of characters to recoginze as gaps when parsing alignment."""
me@306 9
me@328 10 class Monomer(object):
me@328 11 """Monomer object."""
me@260 12
me@328 13 type = None
me@328 14 """Either of 'dna', 'rna', 'protein'."""
me@260 15
me@260 16 by_code1 = {}
me@328 17 """A mapping from 1-letter code to Monomer subclass."""
me@328 18
me@260 19 by_code3 = {}
me@328 20 """A mapping from 3-letter code to Monomer subclass."""
me@328 21
me@260 22 by_name = {}
me@328 23 """A mapping from full monomer name to Monomer subclass."""
me@260 24
me@260 25 @classmethod
me@328 26 def _subclass(cls, name='', code1='', code3='', is_modified=False):
me@328 27 """Create new subclass of Monomer for given monomer type."""
me@328 28 class TheMonomer(cls):
me@328 29 pass
me@328 30 name = name.strip().capitalize()
me@328 31 code1 = code1.upper()
me@328 32 code3 = code3.upper()
bnagaev@357 33 TheMonomer.__name__ = re.sub(r"[^\w]", "_", name)
me@328 34 TheMonomer.name = name
me@328 35 TheMonomer.code1 = code1
me@328 36 TheMonomer.code3 = code3
me@328 37 TheMonomer.is_modified = is_modified
me@328 38 if not is_modified:
me@328 39 cls.by_code1[code1] = TheMonomer
me@328 40 cls.by_code3[code3] = TheMonomer
me@328 41 cls.by_name[name] = TheMonomer
me@328 42 # We duplicate distinguished long names into Monomer itself, so that we
me@328 43 # can use Monomer.from_code3 to create the relevant type of monomer.
me@328 44 Monomer.by_code3[code3] = TheMonomer
me@328 45 Monomer.by_name[name] = TheMonomer
me@260 46
me@328 47 @classmethod
me@353 48 def _initialize(cls, codes=None):
me@328 49 """Create all relevant subclasses of Monomer."""
me@328 50 # NB. The table uses letters d, r, p for types,
me@328 51 # while we use full words; hence, we compare by first letter
bnagaev@378 52 for code1, is_modified, code3, name in codes:
bnagaev@378 53 cls._subclass(name, code1, code3, is_modified)
me@260 54
me@260 55 @classmethod
me@260 56 def from_code1(cls, code1):
me@328 57 """Create new monomer from 1-letter code."""
me@328 58 return cls.by_code1[code1.upper()]()
me@260 59
me@260 60 @classmethod
me@260 61 def from_code3(cls, code3):
me@328 62 """Create new monomer from 3-letter code."""
me@328 63 return cls.by_code3[code3.upper()]()
me@260 64
me@260 65 @classmethod
me@260 66 def from_name(cls, name):
me@328 67 """Create new monomer from full name."""
me@328 68 return cls.by_name[name.strip().capitalize()]()
me@260 69
me@329 70 def __repr__(self):
me@329 71 return '<Monomer %s>' % self.code3
me@329 72
me@329 73 def __str__(self):
me@329 74 """Returns one-letter code"""
me@329 75 return self.code1
me@329 76
me@260 77 def __eq__(self, other):
me@328 78 """Monomers within same monomer type are compared by code1."""
me@328 79 assert self.type == other.type
me@328 80 return self.code1 == other.code1
bnagaev@239 81
bnagaev@239 82 class Sequence(list):
me@274 83 """Sequence of Monomers.
bnagaev@243 84
me@274 85 This behaves like list of monomer objects. In addition to standard list
me@274 86 behaviour, Sequence has the following attributes:
me@270 87
me@274 88 * name -- str with the name of the sequence
me@274 89 * description -- str with description of the sequence
me@274 90 * source -- str denoting source of the sequence
me@266 91
me@274 92 Any of them may be empty (i.e. hold empty string)
me@275 93
me@275 94 Class attributes:
me@282 95
me@275 96 * monomer_type -- type of monomers in sequence, must be redefined when
me@275 97 subclassing
me@274 98 """
me@270 99
me@275 100 monomer_type = Monomer
me@270 101
me@275 102 name = ''
me@275 103 description = ''
me@275 104 source = ''
me@275 105
me@347 106 @classmethod
me@347 107 def from_monomers(cls, monomers=[], name=None, description=None, source=None):
me@347 108 """Create sequence from a list of monomer objecst."""
bnagaev@378 109 result = cls(monomers)
me@275 110 if name:
me@347 111 result.name = name
me@275 112 if description:
me@347 113 result.description = description
me@275 114 if source:
me@347 115 result.source = source
me@347 116 return result
me@347 117
me@347 118 @classmethod
me@347 119 def from_string(cls, string, name='', description='', source=''):
me@347 120 """Create sequences from string of one-letter codes."""
me@347 121 monomer = cls.monomer_type.from_code1
me@347 122 monomers = [monomer(letter) for letter in string]
me@347 123 return cls.from_monomers(monomers, name, description, source)
me@270 124
me@329 125 def __repr__(self):
me@329 126 return '<Sequence %s>' % str(self)
me@329 127
me@262 128 def __str__(self):
me@329 129 """Returns sequence of one-letter codes."""
me@275 130 return ''.join(monomer.code1 for monomer in self)
me@270 131
me@316 132 def __hash__(self):
me@316 133 """Hash sequence by identity."""
me@316 134 return id(self)
me@316 135
me@295 136 class Alignment(object):
me@295 137 """Alignment. It is a list of Columns."""
bnagaev@249 138
dendik@379 139 alignment_type = None
dendik@379 140 """Related Alignment class. SHOULD be redefined when subclassing."""
dendik@379 141 # Actually given value at EOF, since name Alignment is not yet defined
dendik@379 142
dendik@379 143 block_type = None
dendik@379 144 """Related Block class. SHOULD be redefined when subclassing."""
dendik@379 145 # Actually given value at EOF, since name Block is not yet defined
dendik@379 146
me@287 147 sequence_type = Sequence
me@289 148 """Type of sequences in alignment. SHOULD be redefined when subclassing."""
me@288 149
me@289 150 sequences = None
me@289 151 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""
bnagaev@249 152
me@287 153 def __init__(self):
me@287 154 """Initialize empty alignment."""
me@287 155 self.sequences = []
me@295 156 self.columns = []
me@282 157
me@362 158 # Alignment grow & IO methods
me@299 159 # ==============================
me@299 160
me@294 161 def append_sequence(self, sequence):
me@365 162 """Add sequence to alignment. Return self.
me@294 163
me@294 164 If sequence is too short, pad it with gaps on the right.
me@294 165 """
me@294 166 self.sequences.append(sequence)
me@294 167 for i, monomer in enumerate(sequence):
me@366 168 self._column_at(i)[sequence] = monomer
me@365 169 return self
me@294 170
me@364 171 def append_row_from_string(self, string,
me@364 172 name='', description='', source='', gaps=default_gaps):
me@364 173 """Add row from a string of one-letter codes and gaps. Return self."""
me@313 174 Sequence = self.sequence_type
me@306 175 not_gap = lambda (i, char): char not in gaps
me@349 176 without_gaps = util.remove_each(string, gaps)
me@321 177 sequence = Sequence.from_string(without_gaps, name, description, source)
me@303 178 # The following line has some simple magic:
me@303 179 # 1. attach natural numbers to monomers
me@303 180 # 2. delete gaps
me@303 181 # 3. attach numbers again
me@303 182 # This way we have a pair of numbers attached to monomer:
me@303 183 # - it's position in alignment (the first attached number, j)
me@303 184 # - it's position in sequence (the second attached number, i)
me@349 185 for i, (j, char) in enumerate(filter(not_gap, enumerate(string))):
me@366 186 self._column_at(j)[sequence] = sequence[i]
me@287 187 self.sequences.append(sequence)
me@364 188 return self
me@287 189
me@366 190 def _column_at(self, n):
me@366 191 """Return column by index. Create new columns if required."""
me@302 192 for i in range(len(self.columns), n + 1):
me@302 193 self.columns.append(Column())
me@302 194 return self.columns[n]
me@302 195
me@362 196 def append_file(self, file, format='fasta', gaps=default_gaps):
me@365 197 """Append sequences from file to alignment. Return self.
me@299 198
me@362 199 If sequences in file have gaps (detected as characters belonging to
me@362 200 `gaps` set), treat them accordingly.
me@362 201 """
me@367 202 assert format == 'fasta', "We don't support other formats yet"
me@313 203 for (name, description, body) in fasta.parse_file(file):
bnagaev@378 204 self.append_row_from_string(body, name, description, file.name, gaps)
me@287 205 return self
bnagaev@249 206
me@367 207 def to_file(self, file, format='fasta'):
me@292 208 """Write alignment in FASTA file as sequences with gaps."""
me@367 209 assert format == "fasta", "We don't support other formats yet"
me@292 210 def char(monomer):
me@292 211 if monomer:
me@292 212 return monomer.code1
me@292 213 return "-"
me@292 214 for row in self.rows_as_lists():
me@292 215 seq = row.sequence
me@292 216 line = "".join(map(char, row))
me@292 217 fasta.save_file(file, line, seq.name, seq.description)
me@292 218
me@299 219 # Data access methods for alignment
me@299 220 # =================================
me@299 221
me@299 222 def rows(self):
me@299 223 """Return list of rows (temporary objects) in alignment.
me@299 224
me@299 225 Each row is a dictionary of { column : monomer }.
me@363 226
me@299 227 For gap positions there is no key for the column in row.
me@299 228
me@299 229 Each row has attribute `sequence` pointing to the sequence the row is
me@299 230 describing.
me@299 231
me@299 232 Modifications of row have no effect on the alignment.
me@299 233 """
me@299 234 # For now, the function returns a list rather than iterator.
me@299 235 # It is yet to see, whether memory performance here becomes critical,
me@299 236 # or is random access useful.
me@299 237 rows = []
me@299 238 for sequence in self.sequences:
me@299 239 row = util.UserDict()
me@299 240 row.sequence = sequence
me@299 241 for column in self.columns:
me@299 242 if sequence in column:
me@299 243 row[column] = column[sequence]
me@299 244 rows.append(row)
me@299 245 return rows
me@299 246
me@299 247 def rows_as_lists(self):
me@299 248 """Return list of rows (temporary objects) in alignment.
me@299 249
me@299 250 Each row here is a list of either monomer or None (for gaps).
me@299 251
me@299 252 Each row has attribute `sequence` pointing to the sequence of row.
me@299 253
me@299 254 Modifications of row have no effect on the alignment.
me@299 255 """
me@299 256 rows = []
me@299 257 for sequence in self.sequences:
me@299 258 row = util.UserList()
me@299 259 row.sequence = sequence
me@299 260 for column in self.columns:
me@299 261 row.append(column.get(sequence))
me@299 262 rows.append(row)
me@299 263 return rows
me@299 264
me@299 265 def columns_as_lists(self):
me@299 266 """Return list of columns (temorary objects) in alignment.
me@299 267
me@299 268 Each column here is a list of either monomer or None (for gaps).
me@299 269
me@299 270 Items of column are sorted in the same way as alignment.sequences.
me@299 271
me@299 272 Modifications of column have no effect on the alignment.
me@299 273 """
me@299 274 columns = []
me@299 275 for column in self.columns:
me@299 276 col = []
me@299 277 for sequence in self.sequences:
me@299 278 col.append(column.get(sequence))
me@299 279 columns.append(col)
me@299 280 return columns
me@299 281
me@368 282 # Alignment / Block editing methods
me@368 283 # =================================
me@368 284
me@368 285 def _flush_row(self, row, whence='left'):
me@368 286 """Helper for `flush`: flush to one side all monomers in one row."""
me@368 287 row = filter(None, row)
me@368 288 padding = [None] * len(self.columns)
me@368 289 if whence == 'left':
me@368 290 return row + padding
me@368 291 if whence == 'right':
me@368 292 return padding + row
me@368 293 if whence == 'center':
me@368 294 pad_len = (len(self.columns) - len(row)) // 2
me@368 295 # vvv fix padding for case when length is odd: better have more
me@368 296 pad_len += len(self.columns) - 2 * pad_len
me@368 297 padding = [None] * pad_len
me@368 298 return padding + row + padding
me@368 299 assert True, "whence must be either 'left' or 'right' or 'center'"
me@368 300
me@368 301 def flush(self, whence='left'):
me@368 302 """Remove all gaps from alignment and flush results to one side.
me@368 303
me@368 304 `whence` must be one of 'left', 'right' or 'center'
me@368 305 """
me@368 306 for row in self.rows_as_lists():
me@368 307 sequence = row.sequence
me@368 308 row = self._flush_row(row, whence)
me@368 309 for monomer, column in zip(row, self.columns):
me@368 310 if monomer:
me@368 311 column[sequence] = monomer
me@368 312 elif sequence in column:
me@368 313 del column[sequence]
me@368 314
me@369 315 def remove_gap_columns(self):
me@369 316 """Remove all empty columns."""
me@369 317 for n, column in reversed(enumerate(self.columns)):
me@369 318 if column == {}:
me@369 319 self.columns[n:n+1] = []
me@369 320
me@371 321 def _wipe(self):
me@371 322 """Make all positions gaps (but keep sequences intact)."""
me@371 323 for column in self.columns:
bnagaev@378 324 for sequence in list(column.keys()):
me@371 325 del column[sequence]
me@371 326
me@372 327 def _merge(self, dst, new, merge):
me@373 328 """Replace contents of `dst` with those of `new`.
me@372 329
me@372 330 Replace contents of elements using function `merge(dst_el, new_le)`.
me@372 331 """
dendik@381 332 map(merge, zip(dst, new))
me@372 333 dst[len(dst):] = new[len(dst):]
me@372 334 del dst[len(new):]
me@371 335
me@373 336 def _replace_sequence_contents(self, new, copy_descriptions):
me@373 337 """Replace contents of sequences with those of `new` alignment."""
me@371 338 # XXX: we manually copy sequence contents here
me@372 339 # XXX: we only copy, overlapping parts and link to the rest
me@372 340 def merge_monomers(dst, new):
me@372 341 dst.__class__ = new.__class__
me@372 342 def merge_sequences(dst, new):
me@373 343 if copy_descriptions:
me@373 344 vars(dst).update(vars(new))
me@372 345 self._merge(dst, new, merge_monomers)
me@372 346 self._merge(self.sequences, new.sequences, merge_sequences)
me@371 347
me@371 348 def _replace_column_contents(self, new):
me@373 349 """Replace column contents with those of `new` alignment.
me@371 350
me@373 351 Synonym: copy gap patterns from `new` to `self`.
me@372 352
me@373 353 `self.sequences` and `new.sequences` should have the same contents.
me@371 354 """
me@371 355 self._wipe()
me@371 356 not_gap = lambda (a,b): a != None
me@371 357 for sequence, new_row in zip(self.sequences, new.rows_as_lists()):
me@371 358 assert len(sequence) == len(new_row.sequence)
me@371 359 zipped = zip(sequence, filter(not_gap, enumerate(new_row)))
me@371 360 for monomer, (i, _) in zipped:
me@371 361 self._column_at(i)[sequence] = monomer
me@371 362
me@373 363 def _replace_contents(self, new, copy_descriptions, copy_contents):
me@371 364 """Replace alignment contents with those of other alignment."""
me@373 365 if copy_contents:
me@373 366 self._replace_sequence_contents(new, copy_descriptions)
bnagaev@378 367 self._replace_column_contents(new)
me@371 368
me@373 369 def process(self, function, copy_descriptions=True, copy_contents=True):
me@371 370 """Apply function to the alignment (or block); inject results back.
me@371 371
me@373 372 - `function(block)` must return block with same line order.
me@373 373 - if `copy_descriptions` is False, ignore new sequence names.
me@373 374 - if `copy_contents` is False, don't copy sequence contents too.
dendik@380 375
dendik@380 376 `function` (object) may have attributes `copy_descriptions` and
dendik@380 377 `copy_contents`, which override the same named arguments.
me@371 378 """
me@371 379 new = function(self)
dendik@380 380 if hasattr(function, 'copy_descriptions'):
dendik@380 381 copy_descriptions = function.copy_descriptions
dendik@380 382 if hasattr(function, 'copy_contents'):
dendik@380 383 copy_contents = function.copy_contents
me@373 384 self._replace_contents(new, copy_descriptions, copy_contents)
me@371 385
me@300 386 class Column(dict):
me@300 387 """Column of alignment.
me@300 388
me@300 389 Column is a dict of { sequence : monomer }.
me@300 390
me@300 391 For sequences that have gaps in current row, given key is not present in
me@300 392 the column.
me@300 393 """
me@325 394
me@325 395 def __hash__(self):
me@325 396 """Return hash by identity."""
me@325 397 return id(self)
me@300 398
me@317 399 class Block(Alignment):
me@307 400 """Block of alignment.
me@301 401
me@307 402 Block is intersection of a set of columns & a set of rows. Most of blocks
me@307 403 look like rectangular part of alignment if you shuffle alignment rows the
me@307 404 right way.
me@261 405 """
me@270 406
me@307 407 alignment = None
me@307 408 """Alignment the block belongs to."""
me@270 409
me@307 410 sequences = ()
me@307 411 """List of sequences in block."""
me@307 412
me@307 413 columns = ()
me@307 414 """List of columns in block."""
me@307 415
me@317 416 @classmethod
me@317 417 def from_alignment(cls, alignment, sequences=None, columns=None):
me@307 418 """Build new block from alignment.
me@307 419
me@307 420 If sequences are not given, the block uses all sequences in alignment.
me@307 421
me@307 422 If columns are not given, the block uses all columns in alignment.
me@307 423
me@307 424 In both cases we use exactly the list used in alignment, thus, if new
me@307 425 sequences or columns are added to alignment, the block tracks this too.
me@261 426 """
me@307 427 if sequences is None:
me@307 428 sequences = alignment.sequences
me@318 429 if columns is None:
me@307 430 columns = alignment.columns
me@320 431 block = cls()
me@320 432 block.alignment = alignment
me@320 433 block.sequences = sequences
me@320 434 block.columns = columns
me@320 435 return block
me@270 436
dendik@379 437 Alignment.alignment_type = Alignment
dendik@379 438 Alignment.block_type = Block
dendik@379 439
me@260 440 # vim: set ts=4 sts=4 sw=4 et: