allpy: d87129162eb4 allpy/base.py

Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/d87129162eb4/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 20:41:53 2013
Кодировка:

allpy: d87129162eb4 allpy/base.py

allpy

view allpy/base.py @ 822:d87129162eb4

Implemented & tested new markup API. See #95 1) Sequences, Alignment and Blocks now have two new methods: - add_markup(name, markup_class=optional, **kwargs=optional) - remove_markup(name) name refers to the same name as in aln.markups[name] or sequence[i].name It is now explicitly denied to create markups any other way. 2) Markups now have `remove()` method that means 'release all memory that would not be released otherwise, if we just remove markup from the dictionary'. For sequences markups it removes markup attribute from each monomer. 3) Added necessary del sequence_markup[monomer] method. 4) Many base classes have attribute `kind`; for Alignments and Blocks it is 'alignment', for Sequences it is 'sequence' for AlignmentMarkups it is 'alignment_markup' for SequenceMarkups it is 'sequence_markup'. This attribute is crucial for new alignment construction API. 5) Common stuff for MarkupContainers (Alignments and Sequences) is in MarkupContainerMixin.

author	Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date	Fri, 15 Jul 2011 16:43:03 +0400
parents	91e73fb1ac79
children	0192c5c09ce8

line source

1 import sys

2 import re

4 import util

5 import fileio

6 import data.monomers

8 # import this very module as means of having all related classes in one place

9 import base

default_gaps = set(".-~")
"""Set of characters to recognize as gaps when parsing alignment."""

class Monomer(object):
    """Monomer object.

    Concrete monomers are instances of subclasses generated by `_subclass`;
    each generated subclass carries the class attributes `name`, `code1`,
    `code3` and `is_modified`.
    """

    type = None
    """Either of 'dna', 'rna', 'protein'."""

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    by_code1 = {}
    """A mapping from 1-letter code to Monomer subclass."""

    by_code3 = {}
    """A mapping from 3-letter code to Monomer subclass."""

    by_name = {}
    """A mapping from full monomer name to Monomer subclass."""

    @classmethod
    def _subclass(cls, name='', code1='', code3='', is_modified=False):
        """Create new subclass of Monomer for given monomer type.

        - `name` is the full monomer name (stripped and capitalized here)
        - `code1`, `code3` are 1- and 3-letter codes (upper-cased here)
        - `is_modified` is stored on the subclass and controls registration

        The new class is stored in the module `data.monomers.<cls.type>` and
        registered in the lookup tables of `cls` and of `Monomer` itself.
        """
        class TheMonomer(cls):
            pass
        name = name.strip().capitalize()
        code1 = code1.upper()
        code3 = code3.upper()
        module = vars(data.monomers)[cls.type]
        TheMonomer.__name__ = re.sub(r"\W", "_", name)
        TheMonomer.__module__ = module.__name__
        TheMonomer.name = name
        TheMonomer.code1 = code1
        TheMonomer.code3 = code3
        TheMonomer.is_modified = is_modified
        # Save the class in data.monomers so that it can be pickled
        # Some names are not unique, we append underscores to them
        # in order to fix it.
        while TheMonomer.__name__ in vars(module):
            TheMonomer.__name__ += "_"
        vars(module)[TheMonomer.__name__] = TheMonomer
        # Only unmodified monomers claim the 1-letter code.
        if not is_modified:
            cls.by_code1[code1] = TheMonomer
        # A modified monomer takes a 3-letter code only if it is still free;
        # an unmodified one always (re)claims it.
        if code3 not in cls.by_code3 or not is_modified:
            cls.by_code3[code3] = TheMonomer
        cls.by_name[name] = TheMonomer
        # We duplicate distinguished long names into Monomer itself, so that we
        # can use Monomer.from_code3 to create the relevant type of monomer.
        if code3 not in Monomer.by_code3 or not is_modified:
            Monomer.by_code3[code3] = TheMonomer
        Monomer.by_name[name] = TheMonomer

    @classmethod
    def _initialize(cls, codes=None):
        """Create all relevant subclasses of Monomer.

        `codes` is an iterable of `(code1, is_modified, code3, name)` tuples.
        When it is omitted (None), nothing is created.
        """
        if codes is None:
            # The default used to be iterated directly, raising TypeError.
            return
        for code1, is_modified, code3, name in codes:
            cls._subclass(name, code1, code3, is_modified)

    @classmethod
    def from_code1(cls, code1):
        """Create new monomer from 1-letter code.

        Lookup is case-insensitive; the code exactly as given is kept in the
        `input_code1` attribute of the result. Raises KeyError for an
        unknown code.
        """
        monomer = cls.by_code1[code1.upper()]()
        monomer.input_code1 = code1
        return monomer

    @classmethod
    def from_code3(cls, code3):
        """Create new monomer from 3-letter code (case-insensitive).

        Raises KeyError for an unknown code.
        """
        return cls.by_code3[code3.upper()]()

    @classmethod
    def from_name(cls, name):
        """Create new monomer from full name.

        The name is stripped and capitalized before lookup. Raises KeyError
        for an unknown name.
        """
        return cls.by_name[name.strip().capitalize()]()

    def __repr__(self):
        return "<Monomer %s>" % str(self.code1)

    def __str__(self):
        """Returns one-letter code"""
        return self.code1

    def __eq__(self, other):
        """Monomers within same monomer type are compared by code1.

        Any false value (e.g. None) compares unequal. Comparing monomers of
        different `type` trips an assertion.
        """
        if not other:
            return False
        assert self.type == other.type
        return self.code1 == other.code1

    def __ne__(self, other):
        return not (self == other)

class MarkupContainerMixin(object):
    """Markup bookkeeping shared by alignments and sequences.

    The host class is expected to call `_init` from its own `__init__` and
    to provide a `kind` attribute (used to look up standard markups).
    """

    def _init(self):
        """Set up the empty `markups` dictionary; call from host __init__."""
        self.markups = {}

    def add_markup(self, name, markup_class=None, **markup_kwargs):
        """Build a markup, register it under `name` and return it.

        - `name` is the key of the markup in `self.markups`
        - `markup_class`, if given, is the class to instantiate
        - remaining keyword arguments go to the markup constructor

        A user-defined markup needs both `name` and `markup_class`; for a
        standard automatic markup `name` alone suffices, and the class is
        looked up in `markups.by_name` by `(self.kind + "_markup", name)`.

        Adding a name that is already present trips an assertion.
        """
        # Deliberately imported here rather than at module level: `markups`
        # itself builds on `base`, so a top-level import would be circular.
        import markups
        if markup_class is None:
            lookup_key = (self.kind + "_markup", name)
            markup_class = markups.by_name[lookup_key]
        assert name not in self.markups
        created = markup_class(self, name, caller='container', **markup_kwargs)
        self.markups[name] = created
        return created

    def remove_markup(self, name):
        """Let the markup clean up after itself, then forget it."""
        doomed = self.markups[name]
        doomed.remove()
        del self.markups[name]

class Sequence(list, MarkupContainerMixin):
    """Sequence of Monomers.

    This behaves like list of monomer objects. In addition to standard list
    behaviour, Sequence has the following attributes:

    * name -- str with the name of the sequence
    * description -- str with description of the sequence
    * source -- str denoting source of the sequence

    Any of them may be empty (i.e. hold empty string)
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    kind = 'sequence'
    """Description of object kind."""

    name = ''
    description = ''
    source = ''

    def __init__(self, *args):
        """Initialize like a list and set up the markups dictionary."""
        list.__init__(self, *args)
        MarkupContainerMixin._init(self)

    @classmethod
    def from_monomers(cls, monomers=(), name=None, description=None, source=None):
        """Create sequence from a list of monomer objects.

        `monomers` may be any iterable; its items are copied into the new
        sequence. `name`, `description` and `source` are set on the result
        only when they are non-empty; otherwise the class defaults (empty
        strings) stay in effect.
        """
        # The default is an immutable empty tuple rather than a shared list.
        result = cls(monomers)
        if name:
            result.name = name
        if description:
            result.description = description
        if source:
            result.source = source
        return result

    @classmethod
    def from_string(cls, string, name='', description='', source=''):
        """Create sequence from string of one-letter codes.

        Every character is turned into a monomer with
        `cls.types.Monomer.from_code1`.
        """
        monomer = cls.types.Monomer.from_code1
        monomers = [monomer(letter) for letter in string]
        return cls.from_monomers(monomers, name, description, source)

    def __repr__(self):
        # Prefer the name; fall back to the one-letter-code string.
        if self.name:
            return '<Sequence %s>' % str(self.name)
        else:
            return '<Sequence %s>' % str(self)

    def __str__(self):
        """Returns sequence of one-letter codes."""
        return ''.join(monomer.code1 for monomer in self)

    def __hash__(self):
        """Hash sequence by identity."""
        return id(self)

class Alignment(MarkupContainerMixin):
    """Alignment. It is a list of Columns.

    Each column is a mapping { sequence : monomer }; a sequence that has a
    gap in a column is simply absent from that column.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    sequences = None
    """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

    kind = 'alignment'
    """Description of object kind."""

    def __init__(self):
        """Initialize empty alignment."""
        self.sequences = []
        self.columns = []
        MarkupContainerMixin._init(self)

    # Alignment grow & IO methods
    # ==============================

    def append_sequence(self, sequence):
        """Add sequence to alignment. Return self.

        Monomers are placed into columns left to right without gaps; the
        alignment is widened with empty columns if the sequence is longer
        than the alignment, and a shorter sequence simply has gaps on the
        right.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(sequence))
        for column, monomer in zip(self.columns, sequence):
            column[sequence] = monomer
        return self

    def append_row_from_string(self, string,
            name='', description='', source='', gaps=default_gaps):
        """Add row from a string of one-letter codes and gaps. Return self.

        Characters found in `gaps` become gaps; all other characters are
        converted into monomers of a new `self.types.Sequence`.
        """
        Sequence = self.types.Sequence
        without_gaps = util.remove_each(string, gaps)
        sequence = Sequence.from_string(without_gaps, name, description, source)
        self._pad_to_width(len(string))
        # Columns that correspond to non-gap characters, in order.
        non_gap_columns = [column
            for column, char in zip(self.columns, string)
            if char not in gaps
        ]
        for monomer, column in zip(sequence, non_gap_columns):
            column[sequence] = monomer
        self.sequences.append(sequence)
        return self

    def append_row_with_gaps(self, row, sequence):
        """Add row from row_as_list representation and sequence. Return self.

        `row` holds one item per column; false items (e.g. None) are gaps.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(row))
        for column, monomer in zip(self.columns, row):
            if monomer:
                column[sequence] = monomer
        return self

    def _pad_to_width(self, n):
        """Pad alignment with empty columns on the right to width n."""
        for i in range(len(self.columns), n):
            self.columns.append(Column())

    def append_file(self, file, format='fasta', gaps=default_gaps):
        """Append sequences from file to alignment. Return self.

        If sequences in file have gaps (detected as characters belonging to
        `gaps` set), treat them accordingly.
        """
        fileio.File(file, format, gaps=gaps).read_alignment(self)
        return self

    def to_file(self, file, format='fasta', gap='-'):
        """Write alignment to file in the given format. Return self.

        `format` defaults to 'fasta'; `gap` is handed to `fileio.File` as
        its `gaps` argument.
        """
        fileio.File(file, format, gaps=gap).write_alignment(self)
        return self

    # Data access methods for alignment
    # =================================

    def rows(self):
        """Return list of rows (temporary objects) in alignment.

        Each row is a dictionary of { column : monomer }.

        For gap positions there is no key for the column in row.

        Each row has attribute `sequence` pointing to the sequence the row is
        describing.

        Modifications of row have no effect on the alignment.
        """
        # For now, the function returns a list rather than iterator.
        # It is yet to see, whether memory performance here becomes critical,
        # or is random access useful.
        rows = []
        for sequence in self.sequences:
            row = util.UserDict()
            row.sequence = sequence
            for column in self.columns:
                if sequence in column:
                    row[column] = column[sequence]
            rows.append(row)
        return rows

    def rows_as_lists(self):
        """Return list of rows (temporary objects) in alignment.

        Each row here is a list of either monomer or None (for gaps).

        Each row has attribute `sequence` pointing to the sequence of row.

        Modifications of row have no effect on the alignment.
        """
        rows = []
        for sequence in self.sequences:
            row = util.UserList()
            row.sequence = sequence
            for column in self.columns:
                row.append(column.get(sequence))
            rows.append(row)
        return rows

    def rows_as_strings(self, gap='-'):
        """Return list of string representation of rows in alignment.

        Each row has attribute `sequence` pointing to the sequence of row.

        `gap` is the symbol to use for gap.
        """
        rows = []
        for sequence in self.sequences:
            # Collect the characters and join once instead of growing a
            # string with repeated `+=`.
            chars = []
            for column in self.columns:
                if sequence in column:
                    chars.append(column[sequence].code1)
                else:
                    chars.append(gap)
            string = util.UserString("".join(chars))
            string.sequence = sequence
            rows.append(string)
        return rows

    def row_as_list(self, sequence):
        """Return representation of row as list with `Monomers` and `None`s."""
        return [column.get(sequence) for column in self.columns]

    def row_as_string(self, sequence, gap='-'):
        """Return string representation of row in alignment.

        String will have gaps represented by `gap` symbol (defaults to '-').
        """
        def char(monomer):
            if monomer:
                return monomer.code1
            return gap
        row = self.row_as_list(sequence)
        return "".join(map(char, row))

    def columns_as_lists(self):
        """Return list of columns (temporary objects) in alignment.

        Each column here is a list of either monomer or None (for gaps).

        Items of column are sorted in the same way as alignment.sequences.

        Each returned list has attribute `column` pointing to the original
        column object.

        Modifications of column have no effect on the alignment.
        """
        columns = []
        for column in self.columns:
            col = util.UserList()
            col.column = column
            for sequence in self.sequences:
                col.append(column.get(sequence))
            columns.append(col)
        return columns

    # Alignment / Block editing methods
    # =================================

    def flush(self, whence='left'):
        """Remove all gaps from alignment and flush results to one side.

        `whence` must be one of 'left', 'right' or 'center'; any other value
        raises AssertionError.
        """
        if whence == 'left':
            from processors import Left as Flush
        elif whence == 'right':
            from processors import Right as Flush
        elif whence == 'center':
            from processors import Center as Flush
        else:
            # Call form of raise: the `raise E, "msg"` statement is a syntax
            # error outside Python 2.
            raise AssertionError("Whence must be left, right or center")
        self.realign(Flush())

    def remove_gap_columns(self):
        """Remove all empty columns."""
        # Walk from the right so that deletions do not shift the indices
        # still to be visited; the list is edited in place.
        for n in reversed(range(len(self.columns))):
            if self.columns[n] == {}:
                del self.columns[n]

    def _wipe_row(self, sequence):
        """Turn all row positions into gaps (but keep sequences intact)."""
        for column in self.columns:
            if sequence in column:
                del column[sequence]

    def _merge(self, dst, new, merge):
        """Replace contents of `dst` with those of `new`.

        Replace contents of elements using function `merge(dst_el, new_el)`.

        Overlapping elements are merged pairwise; extra elements of `new`
        are appended to `dst` as is, extra elements of `dst` are dropped.
        """
        for el, new_el in zip(dst, new):
            merge(el, new_el)
        dst[len(dst):] = new[len(dst):]
        del dst[len(new):]

    def _replace_sequence_contents(self, new, copy_descriptions):
        """Replace contents of sequences with those of `new` alignment.

        If `copy_descriptions` is true, instance attributes of the new
        sequences are copied over the old ones as well.
        """
        # XXX: we manually copy sequence contents here
        # XXX: we only copy, overlapping parts and link to the rest
        def merge_monomers(dst, new):
            # Retype the existing monomer object instead of replacing it.
            dst.__class__ = new.__class__
        def merge_sequences(dst, new):
            if copy_descriptions:
                vars(dst).update(vars(new))
            self._merge(dst, new, merge_monomers)
        self._merge(self.sequences, new.sequences, merge_sequences)

    def _replace_column_contents(self, new):
        """Replace column contents with those of `new` alignment.

        In other words: copy gap patterns from `new` to `self`.

        `self.sequences` and `new.sequences` should have the same contents.
        """
        for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
            sequence = row.sequence
            # List comprehensions rather than filter(): len() below needs a
            # real list whatever filter() returns.
            monomers = [monomer for monomer in row if monomer]
            new_monomers = [monomer for monomer in new_row if monomer]
            assert len(monomers) == len(new_monomers)
            self._wipe_row(sequence)
            non_gap_columns = [column
                for column, monomer in zip(self.columns, new_row)
                if monomer
            ]
            for monomer, column in zip(monomers, non_gap_columns):
                column[sequence] = monomer

    def _replace_contents(self, new, copy_descriptions, copy_contents):
        """Replace alignment contents with those of other alignment."""
        if copy_contents:
            self._replace_sequence_contents(new, copy_descriptions)
        self._replace_column_contents(new)

    def process(self, function, copy_descriptions=True, copy_contents=True):
        """Apply function to the alignment (or block); inject results back.

        - `function(block)` must return block with same line order.
        - if `copy_descriptions` is False, ignore new sequence names.
        - if `copy_contents` is False, don't copy sequence contents too.

        `function` (object) may have attributes `copy_descriptions` and
        `copy_contents`, which override the same named arguments.
        """
        new = function(self)
        if hasattr(function, 'copy_descriptions'):
            copy_descriptions = function.copy_descriptions
        if hasattr(function, 'copy_contents'):
            copy_contents = function.copy_contents
        self._replace_contents(new, copy_descriptions, copy_contents)

    def realign(self, function):
        """Realign self.

        I.e.: apply function to self to produce a new alignment, then update
        self to have the same gap patterns as the new alignment.

        This is the same as process(function, False, False)
        """
        new = function(self)
        self._replace_column_contents(new)

class Column(dict):
    """One column of an alignment, stored as { sequence : monomer }.

    A sequence that has a gap at this column has no key here at all.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    def __hash__(self):
        """Hash by object identity, so that a column can be a dict key."""
        identity = id(self)
        return identity

class Block(Alignment):
    """A part of an alignment: some of its rows crossed with some columns.

    Rows and columns are kept as ordered lists, which retains the display
    order of the Alignment (and lets it be tweaked). Shuffle the alignment
    rows the right way and most blocks look like a rectangular part of it.
    """

    alignment = None
    """Alignment the block belongs to."""

    sequences = ()
    """List of sequences in block."""

    columns = ()
    """List of columns in block."""

    @classmethod
    def from_alignment(cls, alignment, sequences=None, columns=None):
        """Make a block over `alignment`.

        Omitted `sequences` means every sequence of the alignment; omitted
        `columns` means every column of the alignment.

        In those cases the very list object of the alignment is reused (not
        a copy), so sequences or columns added to the alignment later show
        up in the block as well.
        """
        chosen_sequences = alignment.sequences if sequences is None else sequences
        chosen_columns = alignment.columns if columns is None else columns
        result = cls()
        result.alignment = alignment
        result.sequences = chosen_sequences
        result.columns = chosen_columns
        return result

class Markup(object):
    """Common base of sequence markups and alignment markups.

    Terminology: a *container* is either a sequence or an alignment, and
    its *elements* are monomers or columns respectively.

    A markup acts as a mapping [element] -> value.

    Markups are not meant to be instantiated directly: the constructor
    insists on `caller='container'`, and `from_record` goes through the
    container's `add_markup`.
    """

    name = None
    """Name of markup elements"""

    def __init__(self, container, name, **kwargs):
        """Store `name`, check the caller, then `refresh()`.

        `container` and `name` are mandatory; keyword arguments are
        optional, except that `caller='container'` must be present --
        markups are to be created by Sequence or Alignment, never by the
        user directly.
        """
        self.name = name
        caller = kwargs.get('caller')
        assert caller == 'container', "Improper call"
        self.refresh()

    def refresh(self):
        """Recompute automatically generated values; no-op in the base."""
        return None

    def remove(self):
        """Erase traces of the markup; no-op in the base. Not for users!"""
        return None

    @classmethod
    def from_record(cls, container, record, name=None):
        """Recreate a markup from `record` (used when loading from file).

        `record` is a dict holding all data and metadata of one markup; its
        keys and values are all strings, and parsing them is up to the
        markup. The values themselves live in `record['markup']`, a list
        of items separated by `record['separator']` or by a comma.

        This base implementation does not read `record`; it only adds an
        empty markup of class `cls` named `name` to `container`.
        """
        restored = container.add_markup(name, markup_class=cls)
        return restored

    def to_record(self):
        """Dump the markup into a `record` dict for saving to file.

        See `from_record` for the layout of `record`. The base class has
        nothing to save and returns an empty dict.
        """
        return dict()

    def sorted_keys(self):
        """Return container elements in proper order (abstract here)."""
        raise NotImplementedError()

    def sorted_values(self):
        """Return markup values in container order (abstract here)."""
        raise NotImplementedError()

class SequenceMarkup(Markup):
    """Markup for sequence.

    Behaves like a dictionary of [monomer] -> value. Value may be anything
    or something specific, depending on subclass.

    Actual values are stored in monomers themselves as attributes named
    after the markup (`self.name`).
    """

    kind = 'sequence_markup'

    def __init__(self, sequence, name, **kwargs):
        """Remember the sequence, then run the common Markup setup."""
        self.sequence = sequence
        Markup.__init__(self, sequence, name, **kwargs)

    def remove(self):
        """Remove the traces of markup object. Do not call this yourself!

        Deletes the markup attribute from every monomer of the sequence
        that carries it.
        """
        # Iterate over `self.sequence`: that is the only attribute the
        # constructor sets (there is no `self.monomers`). Monomers that were
        # never marked up are skipped instead of failing in delattr.
        for monomer in self.sequence:
            if monomer in self:
                del self[monomer]

    def sorted_keys(self):
        """Return list of monomers (the sequence itself)."""
        return self.sequence

    def sorted_values(self):
        """Return generator of markup values, if every monomer is marked up.

        Raises AttributeError during iteration on a monomer without value.
        """
        return (self[monomer] for monomer in self.sequence)

    def get(self, key, value=None):
        """Part of Mapping collection interface."""
        if key not in self:
            return value
        return self[key]

    def __contains__(self, monomer):
        """Part of Mapping collection interface."""
        return hasattr(monomer, self.name)

    def __getitem__(self, monomer):
        """Part of Mapping collection interface."""
        return getattr(monomer, self.name)

    def __setitem__(self, monomer, value):
        """Part of Mapping collection interface."""
        return setattr(monomer, self.name, value)

    def __delitem__(self, monomer):
        """Part of Mapping collection interface."""
        return delattr(monomer, self.name)

class AlignmentMarkup(dict, Markup):
    """Markup for alignment.

    A plain dictionary of [column] -> value; what the values are is up to
    the subclass (anything at all, or something specific).
    """

    kind = 'alignment_markup'

    def __init__(self, alignment, name, **kwargs):
        """Remember the alignment, then run the common Markup setup."""
        self.alignment = alignment
        Markup.__init__(self, alignment, name, **kwargs)

    def sorted_keys(self):
        """Return the alignment's list of columns."""
        columns = self.alignment.columns
        return columns

    def sorted_values(self):
        """Return generator of markup values in column order.

        Works only if every column is marked up.
        """
        columns = self.alignment.columns
        return (self[col] for col in columns)

668 # vim: set ts=4 sts=4 sw=4 et: