allpy: 612c618fb7b0 allpy/base.py

allpy

view allpy/base.py @ 646:612c618fb7b0

Changed __repr__ for sequence and monomer in allpy.base. Added row and row_as_string methods to alignment. New faster blocks_finder works, but glitches, though.

author	Boris Burkov <BurkovBA@gmail.com>
date	Tue, 07 Jun 2011 17:37:06 +0400
parents	a1307c0bb030
children	b35116e13f35

line source

1 import sys

2 import re

4 import util

5 import fileio

6 import data.monomers

8 # import this very module as means of having all related classes in one place

9 import base

default_gaps = set([".", "-", "~"])
"""Characters that are treated as gaps when an alignment row is parsed."""

class Monomer(object):
    """Base class for a single residue of a sequence.

    Concrete monomer kinds are subclasses generated by `_subclass` (normally
    through `_initialize`). Every generated subclass carries the class
    attributes `name`, `code1`, `code3` and `is_modified`, and is registered
    in the lookup tables below so that it can be instantiated with
    `from_code1`, `from_code3` or `from_name`.
    """

    type = None
    """Either of 'dna', 'rna', 'protein'."""

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    # NOTE: the three registries are class-level dicts. A subclass that does
    # not define its own will write into (and read from) these shared ones.
    by_code1 = {}
    """A mapping from 1-letter code to Monomer subclass."""

    by_code3 = {}
    """A mapping from 3-letter code to Monomer subclass."""

    by_name = {}
    """A mapping from full monomer name to Monomer subclass."""

    @classmethod
    def _subclass(cls, name='', code1='', code3='', is_modified=False):
        """Create and register a new subclass of `cls` for one monomer kind.

        `name` is stripped and capitalized, `code1` and `code3` are
        upper-cased before being stored on the new class and used as
        registry keys. Returns nothing; the class is reachable through the
        registries and through the `data.monomers` module namespace.
        """
        name = name.strip().capitalize()
        code1 = code1.upper()
        code3 = code3.upper()

        class TheMonomer(cls):
            pass

        TheMonomer.__name__ = re.sub(r"\W", "_", name)
        TheMonomer.__module__ = data.monomers.__name__
        TheMonomer.name = name
        TheMonomer.code1 = code1
        TheMonomer.code3 = code3
        TheMonomer.is_modified = is_modified

        # Save the class in data.monomers so that it can be pickled.
        # Some names are not unique, we append underscores to them
        # in order to fix it.
        # XXX: this WILL fail with dna 0AV != rna A2M, which both have
        # name "2'-O-METHYLADENOSINE 5'-(DIHYDROGEN PHOSPHATE)"
        namespace = vars(data.monomers)
        while TheMonomer.__name__ in namespace:
            TheMonomer.__name__ += "_"
        namespace[TheMonomer.__name__] = TheMonomer

        # Modified monomers are not registered under their one-letter code;
        # only unmodified ones are reachable through `from_code1`.
        if not is_modified:
            cls.by_code1[code1] = TheMonomer
        cls.by_code3[code3] = TheMonomer
        cls.by_name[name] = TheMonomer

        # We duplicate distinguished long names into Monomer itself, so that
        # we can use Monomer.from_code3 to create the relevant type of
        # monomer.
        Monomer.by_code3[code3] = TheMonomer
        Monomer.by_name[name] = TheMonomer

    @classmethod
    def _initialize(cls, codes=None):
        """Create all relevant subclasses of Monomer.

        `codes` is an iterable of `(code1, is_modified, code3, name)` tuples.
        When it is omitted (or None) nothing is registered.
        """
        # `codes or ()` keeps the advertised `None` default from raising
        # TypeError when the loop tries to iterate over it.
        for code1, is_modified, code3, name in codes or ():
            cls._subclass(name, code1, code3, is_modified)

    @classmethod
    def from_code1(cls, code1):
        """Create new monomer from 1-letter code (case-insensitive).

        Raises KeyError if the code is not registered in `cls.by_code1`.
        """
        return cls.by_code1[code1.upper()]()

    @classmethod
    def from_code3(cls, code3):
        """Create new monomer from 3-letter code (case-insensitive).

        Raises KeyError if the code is not registered in `cls.by_code3`.
        """
        return cls.by_code3[code3.upper()]()

    @classmethod
    def from_name(cls, name):
        """Create new monomer from full name.

        The name is normalized the same way as in `_subclass` (stripped and
        capitalized). Raises KeyError if the name is not registered.
        """
        return cls.by_name[name.strip().capitalize()]()

    def __repr__(self):
        return str(self.code1)

    def __str__(self):
        """Return the one-letter code."""
        return self.code1

    def __eq__(self, other):
        """Monomers within same monomer type are compared by code1.

        Any falsy `other` (e.g. None, used for gaps) compares unequal.
        Comparing monomers of different `type` trips the assertion.
        """
        if not other:
            return False
        assert self.type == other.type
        return self.code1 == other.code1

    def __ne__(self, other):
        return not (self == other)

100

class Sequence(list):
    """Sequence of Monomers.

    This behaves like list of monomer objects. In addition to standard list
    behaviour, Sequence has the following attributes:

    * name -- str with the name of the sequence
    * description -- str with description of the sequence
    * source -- str denoting source of the sequence

    Any of them may be empty (i.e. hold empty string).

    Equality is inherited from list (element-wise), but hashing is by
    identity, so a sequence can be used as a dictionary key (see Column).
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    name = ''
    description = ''
    source = ''

    @classmethod
    def from_monomers(cls, monomers=(), name=None, description=None,
                      source=None):
        """Create sequence from an iterable of monomer objects.

        `name`, `description` and `source` are stored on the result only
        when they are truthy; otherwise the class-level empty-string defaults
        remain visible.
        """
        # The default is an immutable tuple rather than a shared list; the
        # monomers are copied into the new sequence either way.
        result = cls(monomers)
        if name:
            result.name = name
        if description:
            result.description = description
        if source:
            result.source = source
        return result

    @classmethod
    def from_string(cls, string, name='', description='', source=''):
        """Create sequence from a string of one-letter codes.

        Each character is converted with `cls.types.Monomer.from_code1`, so
        an unregistered letter raises KeyError.
        """
        make_monomer = cls.types.Monomer.from_code1
        monomers = [make_monomer(letter) for letter in string]
        return cls.from_monomers(monomers, name, description, source)

    def __repr__(self):
        """Return the sequence name, or '<Sequence LETTERS>' if unnamed."""
        if self.name:
            return str(self.name)
        return '<Sequence %s>' % str(self)

    def __str__(self):
        """Return the sequence as a string of one-letter codes."""
        return ''.join(monomer.code1 for monomer in self)

    def __hash__(self):
        """Hash sequence by identity."""
        return id(self)

154

class Alignment(object):
    """Alignment. It is a list of Columns.

    An alignment keeps two parallel structures:

    * `sequences` -- ordered list of the aligned sequences (rows);
    * `columns` -- ordered list of Column dicts `{sequence: monomer}`;
      a sequence that has a gap in a column is simply absent from it.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    sequences = None
    """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

    def __init__(self):
        """Initialize empty alignment."""
        self.sequences = []
        self.columns = []

    # Alignment grow & IO methods
    # ==============================

    def append_sequence(self, sequence):
        """Add sequence to alignment. Return self.

        The monomers are placed into the leftmost columns without gaps; if
        the sequence is shorter than the alignment, the remaining columns
        simply have no entry for it (i.e. gaps on the right).
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(sequence))
        for column, monomer in zip(self.columns, sequence):
            column[sequence] = monomer
        return self

    def append_row_from_string(self, string,
            name='', description='', source='', gaps=default_gaps):
        """Add row from a string of one-letter codes and gaps. Return self.

        Characters contained in `gaps` become gaps; all other characters are
        converted to monomers through `self.types.Sequence.from_string`.
        """
        Sequence = self.types.Sequence
        without_gaps = util.remove_each(string, gaps)
        sequence = Sequence.from_string(
            without_gaps, name, description, source)
        self._pad_to_width(len(string))
        non_gap_columns = [
            column
            for column, char in zip(self.columns, string)
            if char not in gaps
        ]
        for monomer, column in zip(sequence, non_gap_columns):
            column[sequence] = monomer
        self.sequences.append(sequence)
        return self

    def append_row_with_gaps(self, row, sequence):
        """Add row from row_as_list representation and sequence. Return self.

        `row` is a list of monomers and falsy placeholders (None) for gaps.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(row))
        for column, monomer in zip(self.columns, row):
            if monomer:
                column[sequence] = monomer
        return self

    def _pad_to_width(self, n):
        """Pad alignment with empty columns on the right to width n.

        Does nothing if the alignment is already at least `n` columns wide.
        """
        missing = n - len(self.columns)
        # extend() mutates the list in place, so blocks sharing it see the
        # new columns too.
        self.columns.extend(Column() for _ in range(missing))

    def append_file(self, file, format='fasta', gaps=default_gaps):
        """Append sequences from file to alignment. Return self.

        If sequences in file have gaps (detected as characters belonging to
        `gaps` set), treat them accordingly.

        The `source` of every appended sequence is `file.name`, or an empty
        string for file-like objects that have no `name` attribute.
        """
        source = getattr(file, 'name', '')
        io = fileio.File(file, format)
        for name, description, body in io.read_strings():
            self.append_row_from_string(body, name, description, source, gaps)
        return self

    def to_file(self, file, format='fasta', gap='-'):
        """Write alignment to file as sequences with gaps.

        `format` is passed on to `fileio.File`; `gap` is the symbol written
        for gap positions.
        """
        # Pass `gap` through: it used to be accepted but silently ignored.
        strings = [(s, s.sequence.name, s.sequence.description)
                   for s in self.rows_as_strings(gap)]
        fileio.File(file, format).write_strings(strings)

    # Data access methods for alignment
    # =================================

    def row(self, sequence):
        """Create and return temporary list of monomers and Nones (gaps)."""
        return [column.get(sequence) for column in self.columns]

    def row_as_string(self, sequence, gap='-'):
        """Create string of one-letter monomer codes and gap symbols.

        `gap` is the symbol to use for gap positions ("-" by default).
        """
        letters = []
        for monomer in self.row(sequence):
            if monomer:
                letters.append(monomer.code1)
            else:
                letters.append(gap)
        return "".join(letters)

    def rows(self):
        """Return list of rows (temporary objects) in alignment.

        Each row is a dictionary of { column : monomer }.

        For gap positions there is no key for the column in row.

        Each row has attribute `sequence` pointing to the sequence the row is
        describing.

        Modifications of row have no effect on the alignment.
        """
        # For now, the function returns a list rather than iterator.
        # It is yet to see, whether memory performance here becomes critical,
        # or is random access useful.
        rows = []
        for sequence in self.sequences:
            row = util.UserDict()
            row.sequence = sequence
            for column in self.columns:
                if sequence in column:
                    row[column] = column[sequence]
            rows.append(row)
        return rows

    def rows_as_lists(self):
        """Return list of rows (temporary objects) in alignment.

        Each row here is a list of either monomer or None (for gaps).

        Each row has attribute `sequence` pointing to the sequence of row.

        Modifications of row have no effect on the alignment.
        """
        rows = []
        for sequence in self.sequences:
            row = util.UserList()
            row.sequence = sequence
            for column in self.columns:
                row.append(column.get(sequence))
            rows.append(row)
        return rows

    def rows_as_strings(self, gap='-'):
        """Return list of string representation of rows in alignment.

        Each row has attribute `sequence` pointing to the sequence of row.

        `gap` is the symbol to use for gap.
        """
        rows = []
        for sequence in self.sequences:
            letters = []
            for column in self.columns:
                if sequence in column:
                    letters.append(column[sequence].code1)
                else:
                    letters.append(gap)
            # Join once instead of growing the string character by character.
            string = util.UserString("".join(letters))
            string.sequence = sequence
            rows.append(string)
        return rows

    def columns_as_lists(self):
        """Return list of columns (temporary objects) in alignment.

        Each column here is a list of either monomer or None (for gaps).

        Items of column are sorted in the same way as alignment.sequences.

        Modifications of column have no effect on the alignment.
        """
        return [
            [column.get(sequence) for sequence in self.sequences]
            for column in self.columns
        ]

    # Alignment / Block editing methods
    # =================================

    def flush(self, whence='left'):
        """Remove all gaps from alignment and flush results to one side.

        `whence` must be one of 'left', 'right' or 'center'; any other value
        raises AssertionError.
        """
        if whence == 'left':
            from processors import Left as Flush
        elif whence == 'right':
            from processors import Right as Flush
        elif whence == 'center':
            from processors import Center as Flush
        else:
            # Call form of raise: valid in both Python 2 and Python 3.
            raise AssertionError("Whence must be left, right or center")
        self.realign(Flush())

    def remove_gap_columns(self):
        """Remove all empty columns."""
        # Walk from the right so that deletions do not shift the indices
        # that are still to be visited.
        for n in reversed(range(len(self.columns))):
            if not self.columns[n]:
                del self.columns[n]

    def _wipe_row(self, sequence):
        """Turn all row positions into gaps (but keep sequences intact)."""
        for column in self.columns:
            column.pop(sequence, None)

    def _merge(self, dst, new, merge):
        """Replace contents of `dst` with those of `new`.

        Elements present in both lists are updated in place with
        `merge(dst_el, new_el)`; extra elements of `new` are appended to
        `dst` as-is, and extra elements of `dst` are dropped.
        """
        for el, new_el in zip(dst, new):
            merge(el, new_el)
        dst[len(dst):] = new[len(dst):]
        del dst[len(new):]

    def _replace_sequence_contents(self, new, copy_descriptions):
        """Replace contents of sequences with those of `new` alignment.

        If `copy_descriptions` is true, instance attributes of the new
        sequences (name, description, ...) are copied as well.
        """
        # XXX: we manually copy sequence contents here
        # XXX: we only copy, overlapping parts and link to the rest
        def merge_monomers(dst, new):
            # Monomer kind is encoded in its class, so retyping the existing
            # object keeps its identity while changing what it is.
            dst.__class__ = new.__class__

        def merge_sequences(dst, new):
            if copy_descriptions:
                vars(dst).update(vars(new))
            self._merge(dst, new, merge_monomers)

        self._merge(self.sequences, new.sequences, merge_sequences)

    def _replace_column_contents(self, new):
        """Replace column contents with those of `new` alignment.

        In other words: copy gap patterns from `new` to `self`.

        `self.sequences` and `new.sequences` should have the same contents.
        """
        # NOTE(review): if `new` is wider than `self`, zip() below silently
        # truncates and trailing monomers are left out of the columns;
        # confirm whether self should be padded to the new width first.
        for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
            sequence = row.sequence
            # Explicit lists (rather than filter()) so that len() works
            # regardless of whether filter() returns a list or an iterator.
            monomers = [monomer for monomer in row if monomer]
            new_monomers = [monomer for monomer in new_row if monomer]
            assert len(monomers) == len(new_monomers)
            self._wipe_row(sequence)
            non_gap_columns = [
                column
                for column, monomer in zip(self.columns, new_row)
                if monomer
            ]
            for monomer, column in zip(monomers, non_gap_columns):
                column[sequence] = monomer

    def _replace_contents(self, new, copy_descriptions, copy_contents):
        """Replace alignment contents with those of other alignment.

        Sequence contents are replaced only if `copy_contents` is true; the
        gap pattern is always taken from `new`.
        """
        if copy_contents:
            self._replace_sequence_contents(new, copy_descriptions)
        self._replace_column_contents(new)

    def process(self, function, copy_descriptions=True, copy_contents=True):
        """Apply function to the alignment (or block); inject results back.

        - `function(block)` must return block with same line order.
        - if `copy_descriptions` is False, ignore new sequence names.
        - if `copy_contents` is False, don't copy sequence contents too.

        `function` (object) may have attributes `copy_descriptions` and
        `copy_contents`, which override the same named arguments.
        """
        new = function(self)
        copy_descriptions = getattr(
            function, 'copy_descriptions', copy_descriptions)
        copy_contents = getattr(function, 'copy_contents', copy_contents)
        self._replace_contents(new, copy_descriptions, copy_contents)

    def realign(self, function):
        """Realign self.

        I.e.: apply function to self to produce a new alignment, then update
        self to have the same gap patterns as the new alignment.

        This is the same as process(function, False, False), except that
        attributes of `function` are not consulted.
        """
        new = function(self)
        self._replace_column_contents(new)

440

class Column(dict):
    """One column of an alignment.

    A column maps { sequence : monomer }. A sequence that has a gap in this
    column has no key in the dict at all.

    Columns are hashed by identity (while still comparing as dicts), which
    lets them serve as dictionary keys, e.g. in the rows returned by
    `Alignment.rows`.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    def __hash__(self):
        """Hash the column by object identity."""
        return id(self)

456

class Block(Alignment):
    """Block of alignment.

    A block is the intersection of a set of rows with a set of columns of
    some alignment. Both sets are kept as ordered lists, so that the display
    order of the alignment is retained (or can be tweaked). With a suitable
    row order most blocks look like a rectangular part of the alignment.
    """

    alignment = None
    """Alignment the block belongs to."""

    sequences = ()
    """List of sequences in block."""

    columns = ()
    """List of columns in block."""

    @classmethod
    def from_alignment(cls, alignment, sequences=None, columns=None):
        """Build new block from alignment.

        When `sequences` is omitted, the block covers every sequence of the
        alignment; when `columns` is omitted, it covers every column.

        In those cases the block holds the very same list object as the
        alignment, so sequences or columns later added to the alignment show
        up in the block as well.
        """
        block = cls()
        block.alignment = alignment
        block.sequences = (
            alignment.sequences if sequences is None else sequences)
        block.columns = alignment.columns if columns is None else columns
        return block

495

496 # vim: set ts=4 sts=4 sw=4 et: