allpy: 2b3cad50c2b1 allpy/base.py

Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/2b3cad50c2b1/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 20:41:16 2013
Кодировка:

allpy: 2b3cad50c2b1 allpy/base.py

allpy

view allpy/base.py @ 1106:2b3cad50c2b1

Partially reversed [afed1f] (see #49). As explained in the ticket, in real-life use cases having a monomer belong to several sequences is sometimes extremely useful. ANY approach to attributing a monomer to only one sequence will be either confusing or hindering. * Removed the `monomer.sequence` attribute * Removed unnecessary special-casing in pickle * Removed unused tests * Restored the APIs to be backward-compatible * Added deprecation messages to the restored APIs

author	Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date	Sun, 10 Jun 2012 16:08:47 +0400
parents	41a167bbf150
children	79978caa35ee

line source

import re
import sys
import warnings

import util
import fileio
import data.monomers

# import this very module as means of having all related classes in one place
import base

default_gaps = ('-', '.', '~')
"""Set of characters to recognize as gaps when parsing alignment."""

class Monomer(object):
    """Monomer object.

    Concrete monomer classes are generated at runtime by `_subclass` and are
    registered in the `by_code1`, `by_code3` and `by_name` lookup tables.
    """

    type = None
    """Either of 'dna', 'rna', 'protein'."""

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    by_code1 = {}
    """A mapping from 1-letter code to Monomer subclass."""

    by_code3 = {}
    """A mapping from 3-letter code to Monomer subclass."""

    by_name = {}
    """A mapping from full monomer name to Monomer subclass."""

    @classmethod
    def _subclass(cls, name='', code1='', code3='', is_modified=False):
        """Create new subclass of Monomer for given monomer type.

        - `name` is the full monomer name (stripped and capitalized here)
        - `code1` and `code3` are the 1- and 3-letter codes (upper-cased here)
        - `is_modified` marks a modified monomer: such a monomer is never
          registered by 1-letter code and never replaces an already
          registered 3-letter code

        The new class is stored in the object found in `data.monomers` under
        the name `cls.type` (presumably a submodule) and in the lookup tables
        of both `cls` and `Monomer`.
        """
        class TheMonomer(cls):
            pass
        name = name.strip().capitalize()
        code1 = code1.upper()
        code3 = code3.upper()
        module = vars(data.monomers)[cls.type]
        TheMonomer.__name__ = re.sub(r"\W", "_", name)
        TheMonomer.__module__ = module.__name__
        TheMonomer.name = name
        TheMonomer.code1 = code1
        TheMonomer.code3 = code3
        TheMonomer.is_modified = is_modified
        # Save the class in data.monomers so that it can be pickled
        # Some names are not unique, we append underscores to them
        # in order to fix it.
        while TheMonomer.__name__ in vars(module):
            TheMonomer.__name__ += "_"
        vars(module)[TheMonomer.__name__] = TheMonomer
        if not is_modified:
            cls.by_code1[code1] = TheMonomer
        if code3 not in cls.by_code3 or not is_modified:
            cls.by_code3[code3] = TheMonomer
        # NOTE(review): name registration is taken to be unconditional;
        # confirm against the upstream file.
        cls.by_name[name] = TheMonomer
        # We duplicate distinguished long names into Monomer itself, so that we
        # can use Monomer.from_code3 to create the relevant type of monomer.
        if code3 not in Monomer.by_code3 or not is_modified:
            Monomer.by_code3[code3] = TheMonomer
        Monomer.by_name[name] = TheMonomer

    @classmethod
    def _initialize(cls, codes=None):
        """Create all relevant subclasses of Monomer.

        `codes` is an iterable of `(code1, is_modified, code3, name)` tuples.
        Omitting it (or passing None) creates nothing.
        """
        # `codes or ()` keeps the documented default usable: iterating over
        # None would raise TypeError.
        for code1, is_modified, code3, name in codes or ():
            cls._subclass(name, code1, code3, is_modified)

    @classmethod
    def from_code1(cls, code1):
        """Create new monomer from 1-letter code.

        Deprecated. The code is looked up case-insensitively; the code as
        given is kept in `input_code1`. Raises KeyError for unknown codes.
        """
        warnings.warn(
            "Monomer.from_code1(...) is deprecated in favor of Sequence.append_monomer(code1=...)",
            DeprecationWarning, stacklevel=2
        )
        monomer = cls.by_code1[code1.upper()]()
        monomer.input_code1 = code1
        return monomer

    @classmethod
    def from_code3(cls, code3):
        """Create new monomer from 3-letter code.

        Deprecated. The code is looked up case-insensitively. Raises KeyError
        for unknown codes.
        """
        warnings.warn(
            "Monomer.from_code3(...) is deprecated in favor of Sequence.append_monomer(code3=...)",
            DeprecationWarning, stacklevel=2
        )
        return cls.by_code3[code3.upper()]()

    @classmethod
    def from_name(cls, name):
        """Create new monomer from full name.

        Deprecated. The name is stripped and capitalized before lookup.
        Raises KeyError for unknown names.
        """
        warnings.warn(
            "Monomer.from_name(...) is deprecated in favor of Sequence.append_monomer(name=...)",
            DeprecationWarning, stacklevel=2
        )
        return cls.by_name[name.strip().capitalize()]()

    def __repr__(self):
        return "<Monomer %s>" % str(self.code1)

    def __str__(self):
        """Returns one-letter code"""
        return self.code1

    def __eq__(self, other):
        """Monomers within same monomer type are compared by code1.

        Any false value (e.g. None) compares unequal. Comparing monomers of
        different types is treated as a programming error and trips the
        assertion.
        """
        if not other:
            return False
        assert self.type == other.type
        return self.code1 == other.code1

    def __ne__(self, other):
        return not (self == other)

class MarkupContainerMixin(object):
    """Markup bookkeeping shared by alignments and sequences."""

    def _init(self):
        """Create the empty markup registry.

        Meant to be called from `__init__` of the actual class.
        """
        self.markups = {}

    def add_markup(self, name, markup_class=None, use_existing=False, **kws):
        """Create a markup object, register it in self and return it.

        - `name` is the key of the markup in the `self.markups` dictionary
        - optional `markup_class` is the class to instantiate
        - if optional `use_existing` is true, an already registered markup
          of the same name is returned untouched instead of being an error
        - any other keyword arguments go to the markup constructor

        User markups need both `name` and `markup_class`; for the standard
        automatic markups `name` alone is enough.
        """
        # `markups` is imported here rather than in the module header:
        # that module inherits from classes of `base`, so a top-level
        # import would create an import loop.
        import markups
        if markup_class is None:
            markup_kind = self.kind + "_" + "markup"
            markup_class = markups.by_name[(markup_kind, name)]
        if use_existing and name in self.markups:
            found = self.markups[name]
            assert found.__class__ is markup_class
            return found
        assert name not in self.markups
        created = markup_class(self, name, caller='container', **kws)
        self.markups[name] = created
        return created

    def remove_markup(self, name):
        """Let the named markup clean up after itself, then unregister it."""
        markup = self.markups[name]
        markup.remove()
        del self.markups[name]

class Sequence(list, MarkupContainerMixin):
    """Sequence of Monomers.

    This behaves like list of monomer objects. In addition to standard list
    behaviour, Sequence has the following attributes:

    * name -- str with the name of the sequence
    * description -- str with description of the sequence
    * source -- str denoting source of the sequence

    Any of them may be empty (i.e. hold empty string)
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    kind = 'sequence'
    """Description of object kind."""

    name = ''
    """Sequence identifier."""

    description = ''
    """Detailed sequence description."""

    source = ''
    """Sequence source."""

    def __init__(self, sequence=(), name='', description='', source=''):
        """Create sequence from an iterable of monomers and its metadata."""
        list.__init__(self, sequence)
        MarkupContainerMixin._init(self)

        self.name = name
        self.description = description
        self.source = source

    def append_monomer(self, code1=None, code3=None, name=None):
        """Append a new monomer to the sequence. Return the new monomer.

        Exactly one of `code1`, `code3`, `name` must be given; the monomer
        class is looked up in the tables of `self.types.Monomer`. The
        `input_code1` attribute of the new monomer receives `code1` as
        passed (None when the monomer was requested by `code3` or `name`).

        Raises KeyError if the code or name is unknown.
        """
        assert bool(code1) + bool(code3) + bool(name) == 1, \
            "Please specify exactly one of: code1, code3, name"
        if code1:
            cls = self.types.Monomer.by_code1[code1.upper()]
        elif code3:
            cls = self.types.Monomer.by_code3[code3.upper()]
        elif name:
            cls = self.types.Monomer.by_name[name.strip().capitalize()]
        monomer = cls()
        monomer.input_code1 = code1
        self.append(monomer)
        return monomer

    @classmethod
    def from_monomers(cls, monomers=(), name='', description='', source=''):
        """Create sequence from a list of monomer objects.

        Deprecated: call the class itself instead.
        """
        warnings.warn(
            "Sequence.from_monomers(...) is deprecated in favor of Sequence(...)",
            DeprecationWarning, stacklevel=2
        )
        return cls(monomers, name, description, source)

    @classmethod
    def from_string(cls, string, name='', description='', source=''):
        """Create sequence from a string of one-letter codes.

        Raises KeyError on a letter that is not a known one-letter code.
        """
        self = cls([], name=name, description=description, source=source)
        for letter in string:
            self.append_monomer(code1=letter)
        return self

    def __repr__(self):
        # Prefer the name; fall back to the one-letter string for anonymous
        # sequences.
        if self.name:
            return '<Sequence %s>' % str(self.name)
        else:
            return '<Sequence %s>' % str(self)

    def __str__(self):
        """Returns sequence of one-letter codes."""
        return ''.join(monomer.code1 for monomer in self)

    def __hash__(self):
        """Hash sequence by identity."""
        return id(self)

class Alignment(MarkupContainerMixin):
    """Alignment. It is a list of Columns."""

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    sequences = None
    """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

    kind = 'alignment'
    """Description of object kind."""

    def __init__(self):
        """Initialize empty alignment."""
        self.sequences = []
        self.columns = []
        MarkupContainerMixin._init(self)

    # Alignment grow & IO methods
    # ==============================

    def append_sequence(self, sequence):
        """Add sequence to alignment. Return self.

        If sequence is too short, pad it with gaps on the right.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(sequence))
        for column, monomer in zip(self.columns, sequence):
            column[sequence] = monomer
        return self

    def append_row_from_string(self, string,
            name='', description='', source='', gaps=default_gaps):
        """Add row from a string of one-letter codes and gaps. Return self.

        Characters found in `gaps` become gaps; all the other characters are
        turned into monomers of a new sequence.
        """
        Sequence = self.types.Sequence
        without_gaps = util.remove_each(string, gaps)
        sequence = Sequence.from_string(without_gaps, name, description, source)
        self._pad_to_width(len(string))
        non_gap_columns = [column
            for column, char in zip(self.columns, string)
            if char not in gaps
        ]
        for monomer, column in zip(sequence, non_gap_columns):
            column[sequence] = monomer
        self.sequences.append(sequence)
        return self

    def append_row_with_gaps(self, row, sequence):
        """Add row from row_as_list representation and sequence. Return self.

        `row` holds one item per column; false items (None) denote gaps.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(row))
        for column, monomer in zip(self.columns, row):
            if monomer:
                column[sequence] = monomer
        return self

    def _append_columns(self, n, columns):
        """Insert list of `columns` after position `n`."""
        self.columns[n+1:n+1] = columns

    def _pad_to_width(self, n):
        """Pad alignment with empty columns on the right to width n."""
        columns = [self.types.Column() for _ in range(len(self.columns), n)]
        if not columns:
            # Already wide enough: nothing to insert, so do not bother
            # `_append_columns` (which may be costly in subclasses).
            return
        self._append_columns(len(self.columns)-1, columns)

    def append_file(self, file, format='fasta', gaps=default_gaps):
        """Append sequences from file to alignment. Return self.

        If sequences in file have gaps (detected as characters belonging to
        `gaps` set), treat them accordingly.
        """
        fileio.File(file, format, gaps=gaps).read_alignment(self)
        return self

    def to_file(self, file, format='fasta', gap='-'):
        """Write alignment to file in the given format. Return self.

        By default this writes FASTA with '-' for gaps.
        """
        fileio.File(file, format, gaps=gap).write_alignment(self)
        return self

    # Data access methods for alignment
    # =================================

    def rows(self):
        """Return list of rows (temporary objects) in alignment.

        Each row is a dictionary of { column : monomer }.

        For gap positions there is no key for the column in row.

        Each row has attribute `sequence` pointing to the sequence the row is
        describing.

        Modifications of row have no effect on the alignment.
        """
        # For now, the function returns a list rather than iterator.
        # It is yet to see, whether memory performance here becomes critical,
        # or is random access useful.
        rows = []
        for sequence in self.sequences:
            row = util.UserDict()
            row.sequence = sequence
            for column in self.columns:
                if sequence in column:
                    row[column] = column[sequence]
            rows.append(row)
        return rows

    def rows_as_lists(self):
        """Return list of rows (temporary objects) in alignment.

        Each row here is a list of either monomer or None (for gaps).

        Each row has attribute `sequence` pointing to the sequence of row.

        Modifications of row have no effect on the alignment.
        """
        rows = []
        for sequence in self.sequences:
            row = util.UserList()
            row.sequence = sequence
            for column in self.columns:
                row.append(column.get(sequence))
            rows.append(row)
        return rows

    def rows_as_strings(self, gap='-'):
        """Return list of string representation of rows in alignment.

        Each row has attribute `sequence` pointing to the sequence of row.

        `gap` is the symbol to use for gap.
        """
        rows = []
        for sequence in self.sequences:
            # Collect the characters first and join once, instead of growing
            # the string column by column.
            chars = [
                column[sequence].code1 if sequence in column else gap
                for column in self.columns
            ]
            string = util.UserString("".join(chars))
            string.sequence = sequence
            rows.append(string)
        return rows

    def row_as_list(self, sequence):
        """Return representation of row as list with `Monomers` and `None`s."""
        return [column.get(sequence) for column in self.columns]

    def row_as_string(self, sequence, gap='-'):
        """Return string representation of row in alignment.

        String will have gaps represented by `gap` symbol (defaults to '-').
        """
        def char(monomer):
            if monomer:
                return monomer.code1
            return gap
        row = self.row_as_list(sequence)
        return "".join(map(char, row))

    def columns_as_lists(self):
        """Return list of columns (temporary objects) in alignment.

        Each column here is a list of either monomer or None (for gaps).

        Items of column are sorted in the same way as alignment.sequences.

        Each list has attribute `column` pointing to the original column.

        Modifications of column have no effect on the alignment.
        """
        columns = []
        for column in self.columns:
            col = util.UserList()
            col.column = column
            for sequence in self.sequences:
                col.append(column.get(sequence))
            columns.append(col)
        return columns

    # Alignment / Block editing methods
    # =================================

    def flush(self, whence='left'):
        """Remove all gaps from alignment and flush results to one side.

        `whence` must be one of 'left', 'right' or 'center'; anything else
        raises AssertionError.

        Deprecated in favor of `realign` with a processor object.
        """
        warnings.warn(
            "aln.flush('left') is deprecated in favor of aln.realign(Left())",
            DeprecationWarning, stacklevel=2
        )
        if whence == 'left':
            from processors import Left as Flush
        elif whence == 'right':
            from processors import Right as Flush
        elif whence == 'center':
            from processors import Center as Flush
        else:
            raise AssertionError("Whence must be left, right or center")
        self.realign(Flush())

    def remove_gap_columns(self):
        """Remove all empty columns."""
        # Walk from the right so that deletions do not shift the indices
        # that are still to be visited.
        for n, column in reversed(list(enumerate(self.columns))):
            if not any(seq in column for seq in self.sequences):
                self.columns[n:n+1] = []

    def _wipe_row(self, sequence):
        """Turn all row positions into gaps (but keep sequences intact)."""
        for column in self.columns:
            if sequence in column:
                del column[sequence]

    def _replace_column_contents(self, new):
        """Replace column contents with those of `new` alignment.

        In other words: copy gap patterns from `new` to `self`.

        `self.sequences` and `new.sequences` should have the same contents.
        """
        for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
            sequence = row.sequence
            # Explicit lists (rather than `filter`) so that `len` works no
            # matter whether `filter` returns a list or an iterator.
            monomers = [item for item in row if item]
            new_monomers = [item for item in new_row if item]
            assert len(monomers) == len(new_monomers)
            self._wipe_row(sequence)
            self._pad_to_width(len(new_row))
            non_gap_columns = [column
                for column, new_item in zip(self.columns, new_row)
                if new_item
            ]
            assert len(monomers) == len(non_gap_columns)
            for monomer, column in zip(monomers, non_gap_columns):
                column[sequence] = monomer

    def realign(self, function):
        """Realign self.

        * apply function to self to produce a new alignment,
        * update self to have the same gap patterns as the new alignment.
        """
        new = function(self)
        self._replace_column_contents(new)

class Column(dict):
    """One column of an alignment.

    A column is a dict of { sequence : monomer }.

    A sequence that has a gap in this column simply has no key in the dict.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    def __hash__(self):
        """Hash the column by identity, so that it can serve as a dict key."""
        return id(self)

class Block(Alignment):
    """Block of alignment.

    Block is an intersection of several rows & columns. (The collections of
    rows and columns are represented as ordered lists, to retain display order
    of Alignment or add ability to tweak it). Most of blocks look like
    rectangular part of alignment if you shuffle alignment rows the right way.
    """

    alignment = None
    """Alignment the block belongs to."""

    sequences = ()
    """List of sequences in block."""

    columns = ()
    """List of columns in block."""

    @classmethod
    def from_alignment(cls, alignment, sequences=None, columns=None):
        """Build new block from alignment.

        If sequences are not given, the block uses all sequences in alignment.

        If columns are not given, the block uses all columns in alignment.

        In both cases we use exactly the list used in alignment, thus, if new
        sequences or columns are added to alignment, the block tracks this too.
        """
        if sequences is None:
            sequences = alignment.sequences
        if columns is None:
            columns = alignment.columns
        block = cls()
        block.alignment = alignment
        block.sequences = sequences
        block.columns = columns
        return block

    def _append_columns(self, n, columns):
        """Insert list of `columns` after position `n`.

        The columns are inserted into the parent alignment as well, right
        after the column that sits at position `n` of the block.

        Raises AssertionError for an empty block and ValueError if the
        block column at position `n` is not a column of the parent alignment.
        """
        assert len(self.columns) != 0, "Can't append columns to an empty Block"
        target = self.columns[n]
        if self.columns is not self.alignment.columns:
            # The block keeps its own column list: find where the target
            # column sits in the parent and insert the new columns there too.
            # (When the list is shared, the insertion below covers both.)
            me = None
            for k, column in enumerate(self.alignment.columns):
                if column is target:
                    me = k
            if me is None:
                raise ValueError(
                    "Block column is not a column of the parent alignment"
                )
            self.alignment._append_columns(me, columns)
        self.columns[n+1:n+1] = columns

class Markup(object):
    """Common ancestor of sequence markups and alignment markups.

    A sequence or an alignment is called a container here; monomers or
    columns, respectively, are called its elements.

    A markup behaves like a dictionary of [element] -> value.

    Every container keeps a dictionary of [name] -> markup. It is Markup's
    responsibility to add itself to this dictionary and to avoid collisions
    while doing it.
    """

    name = None
    """Name of markup elements."""

    save = True
    """If set to false, fileio should not save this markup."""

    def __init__(self, container, name, **kwargs):
        """Store the name and compute the initial values.

        Takes a mandatory container and name plus optional kwargs.

        Users should never create markups directly; Sequence or Alignment
        does that, passing `caller='container'` as a proof.
        """
        self.name = name
        assert kwargs.get('caller') == 'container', "Improper call"
        self.refresh()

    def refresh(self):
        """Recompute the values of an automatically generated markup."""
        pass

    def remove(self):
        """Erase whatever the markup left behind. Do not call this yourself!"""
        pass

    @classmethod
    def from_record(cls, container, record, name=None):
        """Restore markup from `record`. (Used for loading from file).

        `record` is a dict with all the metadata and data of one markup. Its
        keys and values are all strings; parsing them is up to the markup.

        Markup values should be stored in `record['markup']`, which is a list
        of items separated with either `record['separator']` or a comma.
        """
        return container.add_markup(name, markup_class=cls)

    def to_record(self, keys=None):
        """Save markup to `record`, for saving to file.

        The layout of `record` is described in the docstring of
        `from_record`.

        If `keys` argument is given, restrict output to the given keys.
        """
        return {}

    def sorted_keys(self):
        """Return the elements of the container in their proper order."""
        raise NotImplementedError()

    def sorted_values(self, **kw):
        """Generate markup values in the order of `sorted_keys`.

        Possible arguments:

        - `map` -- a function, applied to each existing value
        - `default` -- a value to yield for elements without a value

        Without `default` the lookup fails on the first element that has
        no value.
        """
        has_fallback = 'default' in kw
        fallback = kw.get('default')
        transform = kw.get('map', lambda value: value)
        for element in self.sorted_keys():
            if element not in self and has_fallback:
                yield fallback
                continue
            yield transform(self[element])

class SequenceMarkup(Markup):
    """Markup for sequence.

    Behaves like a dictionary of [monomer] -> value. Value may be anything
    or something specific, depending on subclass.

    Actual values are stored in monomers themselves as attributes named
    after the markup.
    """

    kind = 'sequence_markup'

    def __init__(self, sequence, name, **kwargs):
        """Remember the sequence, then initialize as a regular Markup."""
        self.sequence = sequence
        Markup.__init__(self, sequence, name, **kwargs)

    def remove(self):
        """Remove the traces of markup object. Do not call this yourself!"""
        for monomer in self.sequence:
            # A monomer may carry no value: it was never set, or the same
            # monomer object occurs more than once and was already cleaned.
            if monomer in self:
                del self[monomer]

    def sorted_keys(self):
        """Return list of monomers."""
        return self.sequence

    def get(self, key, value=None):
        """Part of Mapping collection interface."""
        if key not in self:
            return value
        return self[key]

    def __contains__(self, monomer):
        """Part of Mapping collection interface."""
        return hasattr(monomer, self.name)

    def __getitem__(self, monomer):
        """Part of Mapping collection interface."""
        return getattr(monomer, self.name)

    def __setitem__(self, monomer, value):
        """Part of Mapping collection interface."""
        return setattr(monomer, self.name, value)

    def __delitem__(self, monomer):
        """Part of Mapping collection interface."""
        return delattr(monomer, self.name)

class AlignmentMarkup(dict, Markup):
    """Markup for alignment.

    This is a dictionary of [column] -> value. Depending on the subclass
    the value may be anything or something specific.
    """

    kind = 'alignment_markup'

    def __init__(self, alignment, name, **kwargs):
        """Remember the alignment, then initialize as a regular Markup."""
        self.alignment = alignment
        Markup.__init__(self, alignment, name, **kwargs)

    def sorted_keys(self):
        """Return the list of columns of the alignment."""
        return self.alignment.columns

690 # vim: set ts=4 sts=4 sw=4 et: