allpy: 0386dc31676e allpy/base.py

allpy

view allpy/base.py @ 644:0386dc31676e

Added a test for pickling

author	Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date	Fri, 03 Jun 2011 16:59:45 +0400
parents	61de9b5744c8
children	88c246f20918 612c618fb7b0

line source

1 import sys

2 import re

4 import util

5 import fileio

6 import data.monomers

8 # import this very module as means of having all related classes in one place

9 import base

11 default_gaps = set((".", "-", "~"))

12 """Set of characters to recoginze as gaps when parsing alignment."""

14 class Monomer(object):

15 """Monomer object."""

17 type = None

18 """Either of 'dna', 'rna', 'protein'."""

20 types = base

21 """Mapping of related types. SHOULD be redefined in subclasses."""

23 by_code1 = {}

24 """A mapping from 1-letter code to Monomer subclass."""

26 by_code3 = {}

27 """A mapping from 3-letter code to Monomer subclass."""

29 by_name = {}

30 """A mapping from full monomer name to Monomer subclass."""

32 @classmethod

33 def _subclass(cls, name='', code1='', code3='', is_modified=False):

34 """Create new subclass of Monomer for given monomer type."""

35 class TheMonomer(cls):

36 pass

37 name = name.strip().capitalize()

38 code1 = code1.upper()

39 code3 = code3.upper()

40 TheMonomer.__name__ = re.sub(r"\W", "_", name)

41 TheMonomer.__module__ = data.monomers.__name__

42 TheMonomer.name = name

43 TheMonomer.code1 = code1

44 TheMonomer.code3 = code3

45 TheMonomer.is_modified = is_modified

46 # Save the class in data.monomers so that it can be pickled

47 # Some names are not unique, we append underscores to them

48 # in order to fix it.

49 # XXX: this WILL fail with dna 0AV != rna A2M, which both have

50 # name "2'-O-METHYLADENOSINE 5'-(DIHYDROGEN PHOSPHATE)"

51 while TheMonomer.__name__ in vars(data.monomers):

52 TheMonomer.__name__ += "_"

53 vars(data.monomers)[TheMonomer.__name__] = TheMonomer

54 if not is_modified:

55 cls.by_code1[code1] = TheMonomer

56 cls.by_code3[code3] = TheMonomer

57 cls.by_name[name] = TheMonomer

58 # We duplicate distinguished long names into Monomer itself, so that we

59 # can use Monomer.from_code3 to create the relevant type of monomer.

60 Monomer.by_code3[code3] = TheMonomer

61 Monomer.by_name[name] = TheMonomer

63 @classmethod

64 def _initialize(cls, codes=None):

65 """Create all relevant subclasses of Monomer."""

66 for code1, is_modified, code3, name in codes:

67 cls._subclass(name, code1, code3, is_modified)

69 @classmethod

70 def from_code1(cls, code1):

71 """Create new monomer from 1-letter code."""

72 return cls.by_code1[code1.upper()]()

74 @classmethod

75 def from_code3(cls, code3):

76 """Create new monomer from 3-letter code."""

77 return cls.by_code3[code3.upper()]()

79 @classmethod

80 def from_name(cls, name):

81 """Create new monomer from full name."""

82 return cls.by_name[name.strip().capitalize()]()

84 def __repr__(self):

85 return '<Monomer %s>' % self.code3

87 def __str__(self):

88 """Returns one-letter code"""

89 return self.code1

91 def __eq__(self, other):

92 """Monomers within same monomer type are compared by code1."""

93 if not other:

94 return False

95 assert self.type == other.type

96 return self.code1 == other.code1

98 def __ne__(self, other):

99 return not (self == other)

100

101 class Sequence(list):

102 """Sequence of Monomers.

103

104 This behaves like list of monomer objects. In addition to standard list

105 behaviour, Sequence has the following attributes:

106

107 * name -- str with the name of the sequence

108 * description -- str with description of the sequence

109 * source -- str denoting source of the sequence

110

111 Any of them may be empty (i.e. hold empty string)

112 """

113

114 types = base

115 """Mapping of related types. SHOULD be redefined in subclasses."""

116

117 name = ''

118 description = ''

119 source = ''

120

121 @classmethod

122 def from_monomers(cls, monomers=[], name=None, description=None, source=None):

123 """Create sequence from a list of monomer objecst."""

124 result = cls(monomers)

125 if name:

126 result.name = name

127 if description:

128 result.description = description

129 if source:

130 result.source = source

131 return result

132

133 @classmethod

134 def from_string(cls, string, name='', description='', source=''):

135 """Create sequences from string of one-letter codes."""

136 monomer = cls.types.Monomer.from_code1

137 monomers = [monomer(letter) for letter in string]

138 return cls.from_monomers(monomers, name, description, source)

139

140 def __repr__(self):

141 return '<Sequence %s>' % str(self)

142

143 def __str__(self):

144 """Returns sequence of one-letter codes."""

145 return ''.join(monomer.code1 for monomer in self)

146

147 def __hash__(self):

148 """Hash sequence by identity."""

149 return id(self)

150

151 class Alignment(object):

152 """Alignment. It is a list of Columns."""

153

154 types = base

155 """Mapping of related types. SHOULD be redefined in subclasses."""

156

157 sequences = None

158 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

159

160 def __init__(self):

161 """Initialize empty alignment."""

162 self.sequences = []

163 self.columns = []

164

165 # Alignment grow & IO methods

166 # ==============================

167

168 def append_sequence(self, sequence):

169 """Add sequence to alignment. Return self.

170

171 If sequence is too short, pad it with gaps on the right.

172 """

173 self.sequences.append(sequence)

174 self._pad_to_width(len(sequence))

175 for column, monomer in zip(self.columns, sequence):

176 column[sequence] = monomer

177 return self

178

179 def append_row_from_string(self, string,

180 name='', description='', source='', gaps=default_gaps):

181 """Add row from a string of one-letter codes and gaps. Return self."""

182 Sequence = self.types.Sequence

183 without_gaps = util.remove_each(string, gaps)

184 sequence = Sequence.from_string(without_gaps, name, description, source)

185 self._pad_to_width(len(string))

186 non_gap_columns = [column

187 for column, char in zip(self.columns, string)

188 if char not in gaps

189 ]

190 for monomer, column in zip(sequence, non_gap_columns):

191 column[sequence] = monomer

192 self.sequences.append(sequence)

193 return self

194

195 def append_row_with_gaps(self, row, sequence):

196 """Add row from row_as_list representation and sequence. Return self."""

197 self.sequences.append(sequence)

198 self._pad_to_width(len(row))

199 for column, monomer in zip(self.columns, row):

200 if monomer:

201 column[sequence] = monomer

202 return self

203

204 def _pad_to_width(self, n):

205 """Pad alignment with empty columns on the right to width n."""

206 for i in range(len(self.columns), n):

207 self.columns.append(Column())

208

209 def append_file(self, file, format='fasta', gaps=default_gaps):

210 """Append sequences from file to alignment. Return self.

211

212 If sequences in file have gaps (detected as characters belonging to

213 `gaps` set), treat them accordingly.

214 """

215 sequences = []

216 io = fileio.File(file, format)

217 for name, description, body in io.read_strings():

218 self.append_row_from_string(body, name, description, file.name, gaps)

219 return self

220

221 def to_file(self, file, format='fasta', gap='-'):

222 """Write alignment in FASTA file as sequences with gaps."""

223 strings = [(s, s.sequence.name, s.sequence.description)

224 for s in self.rows_as_strings()]

225 fileio.File(file, format).write_strings(strings)

226

227 # Data access methods for alignment

228 # =================================

229

230 def rows(self):

231 """Return list of rows (temporary objects) in alignment.

232

233 Each row is a dictionary of { column : monomer }.

234

235 For gap positions there is no key for the column in row.

236

237 Each row has attribute `sequence` pointing to the sequence the row is

238 describing.

239

240 Modifications of row have no effect on the alignment.

241 """

242 # For now, the function returns a list rather than iterator.

243 # It is yet to see, whether memory performance here becomes critical,

244 # or is random access useful.

245 rows = []

246 for sequence in self.sequences:

247 row = util.UserDict()

248 row.sequence = sequence

249 for column in self.columns:

250 if sequence in column:

251 row[column] = column[sequence]

252 rows.append(row)

253 return rows

254

255 def rows_as_lists(self):

256 """Return list of rows (temporary objects) in alignment.

257

258 Each row here is a list of either monomer or None (for gaps).

259

260 Each row has attribute `sequence` pointing to the sequence of row.

261

262 Modifications of row have no effect on the alignment.

263 """

264 rows = []

265 for sequence in self.sequences:

266 row = util.UserList()

267 row.sequence = sequence

268 for column in self.columns:

269 row.append(column.get(sequence))

270 rows.append(row)

271 return rows

272

273 def rows_as_strings(self, gap='-'):

274 """Return list of string representation of rows in alignment.

275

276 Each row has attribute `sequence` pointing to the sequence of row.

277

278 `gap` is the symbol to use for gap.

279 """

280 rows = []

281 for sequence in self.sequences:

282 string = ""

283 for column in self.columns:

284 if sequence in column:

285 string += column[sequence].code1

286 else:

287 string += gap

288 string = util.UserString(string)

289 string.sequence = sequence

290 rows.append(string)

291 return rows

292

293 def columns_as_lists(self):

294 """Return list of columns (temorary objects) in alignment.

295

296 Each column here is a list of either monomer or None (for gaps).

297

298 Items of column are sorted in the same way as alignment.sequences.

299

300 Modifications of column have no effect on the alignment.

301 """

302 columns = []

303 for column in self.columns:

304 col = []

305 for sequence in self.sequences:

306 col.append(column.get(sequence))

307 columns.append(col)

308 return columns

309

310 # Alignment / Block editing methods

311 # =================================

312

313 def flush(self, whence='left'):

314 """Remove all gaps from alignment and flush results to one side.

315

316 `whence` must be one of 'left', 'right' or 'center'

317 """

318 if whence == 'left':

319 from processors import Left as Flush

320 elif whence == 'right':

321 from processors import Right as Flush

322 elif whence == 'center':

323 from processors import Center as Flush

324 else:

325 raise AssertionError, "Whence must be left, right or center"

326 self.realign(Flush())

327

328 def remove_gap_columns(self):

329 """Remove all empty columns."""

330 for n, column in reversed(list(enumerate(self.columns))):

331 if column == {}:

332 self.columns[n:n+1] = []

333

334 def _wipe_row(self, sequence):

335 """Turn all row positions into gaps (but keep sequences intact)."""

336 for column in self.columns:

337 if sequence in column:

338 del column[sequence]

339

340 def _merge(self, dst, new, merge):

341 """Replace contents of `dst` with those of `new`.

342

343 Replace contents of elements using function `merge(dst_el, new_le)`.

344 """

345 for el, new_el in zip(dst, new):

346 merge(el, new_el)

347 dst[len(dst):] = new[len(dst):]

348 del dst[len(new):]

349

350 def _replace_sequence_contents(self, new, copy_descriptions):

351 """Replace contents of sequences with those of `new` alignment."""

352 # XXX: we manually copy sequence contents here

353 # XXX: we only copy, overlapping parts and link to the rest

354 def merge_monomers(dst, new):

355 dst.__class__ = new.__class__

356 def merge_sequences(dst, new):

357 if copy_descriptions:

358 vars(dst).update(vars(new))

359 self._merge(dst, new, merge_monomers)

360 self._merge(self.sequences, new.sequences, merge_sequences)

361

362 def _replace_column_contents(self, new):

363 """Replace column contents with those of `new` alignment.

364

365 In other words: copy gap patterns from `new` to `self`.

366

367 `self.sequences` and `new.sequences` should have the same contents.

368 """

369 for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):

370 sequence = row.sequence

371 monomers = filter(None, row)

372 assert len(monomers) == len(filter(None, new_row))

373 self._wipe_row(sequence)

374 non_gap_columns = [column

375 for column, monomer in zip(self.columns, new_row)

376 if monomer

377 ]

378 for monomer, column in zip(monomers, non_gap_columns):

379 column[sequence] = monomer

380

381 def _replace_contents(self, new, copy_descriptions, copy_contents):

382 """Replace alignment contents with those of other alignment."""

383 if copy_contents:

384 self._replace_sequence_contents(new, copy_descriptions)

385 self._replace_column_contents(new)

386

387 def process(self, function, copy_descriptions=True, copy_contents=True):

388 """Apply function to the alignment (or block); inject results back.

389

390 - `function(block)` must return block with same line order.

391 - if `copy_descriptions` is False, ignore new sequence names.

392 - if `copy_contents` is False, don't copy sequence contents too.

393

394 `function` (object) may have attributes `copy_descriptions` and

395 `copy_contents`, which override the same named arguments.

396 """

397 new = function(self)

398 if hasattr(function, 'copy_descriptions'):

399 copy_descriptions = function.copy_descriptions

400 if hasattr(function, 'copy_contents'):

401 copy_contents = function.copy_contents

402 self._replace_contents(new, copy_descriptions, copy_contents)

403

404 def realign(self, function):

405 """Realign self.

406

407 I.e.: apply function to self to produce a new alignment, then update

408 self to have the same gap patterns as the new alignment.

409

410 This is the same as process(function, False, False)

411 """

412 new = function(self)

413 self._replace_column_contents(new)

414

415 class Column(dict):

416 """Column of alignment.

417

418 Column is a dict of { sequence : monomer }.

419

420 For sequences that have gaps in current row, given key is not present in

421 the column.

422 """

423

424 types = base

425 """Mapping of related types. SHOULD be redefined in subclasses."""

426

427 def __hash__(self):

428 """Return hash by identity."""

429 return id(self)

430

431 class Block(Alignment):

432 """Block of alignment.

433

434 Block is an intersection of several rows & columns. (The collections of

435 rows and columns are represented as ordered lists, to retain display order

436 of Alignment or add ability to tweak it). Most of blocks look like

437 rectangular part of alignment if you shuffle alignment rows the right way.

438 """

439

440 alignment = None

441 """Alignment the block belongs to."""

442

443 sequences = ()

444 """List of sequences in block."""

445

446 columns = ()

447 """List of columns in block."""

448

449 @classmethod

450 def from_alignment(cls, alignment, sequences=None, columns=None):

451 """Build new block from alignment.

452

453 If sequences are not given, the block uses all sequences in alignment.

454

455 If columns are not given, the block uses all columns in alignment.

456

457 In both cases we use exactly the list used in alignment, thus, if new

458 sequences or columns are added to alignment, the block tracks this too.

459 """

460 if sequences is None:

461 sequences = alignment.sequences

462 if columns is None:

463 columns = alignment.columns

464 block = cls()

465 block.alignment = alignment

466 block.sequences = sequences

467 block.columns = columns

468 return block

469

470 # vim: set ts=4 sts=4 sw=4 et: