allpy: 6ffdf051caea allpy/base.py

allpy

view allpy/base.py @ 721:6ffdf051caea

Mostly meaningless commit

author	Boris Burkov <BurkovBA@gmail.com>
date	Wed, 06 Jul 2011 21:16:41 +0400
parents	b0d3c9413cf9
children	b71f9c1d1509

line source

1 import sys

2 import re

4 import util

5 import fileio

6 import data.monomers

8 # import this very module as means of having all related classes in one place

9 import base

default_gaps = set([".", "-", "~"])
"""Set of characters to recognize as gaps when parsing an alignment."""

class Monomer(object):
    """Monomer object.

    Concrete monomer kinds are generated as subclasses by `_subclass` and
    registered in the `by_code1` / `by_code3` / `by_name` lookup tables;
    the `from_*` constructors instantiate the registered subclass.
    """

    type = None
    """Either of 'dna', 'rna', 'protein'."""

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    by_code1 = {}
    """A mapping from 1-letter code to Monomer subclass."""

    by_code3 = {}
    """A mapping from 3-letter code to Monomer subclass."""

    by_name = {}
    """A mapping from full monomer name to Monomer subclass."""

    @classmethod
    def _subclass(cls, name='', code1='', code3='', is_modified=False):
        """Create new subclass of Monomer for given monomer type.

        The name is stripped and capitalized, both codes are upper-cased.
        The new class is stored in the `data.monomers` submodule selected by
        `cls.type` and registered in the lookup tables of `cls` and of
        `Monomer` itself.
        """
        class TheMonomer(cls):
            pass

        name = name.strip().capitalize()
        code1 = code1.upper()
        code3 = code3.upper()
        module = vars(data.monomers)[cls.type]
        namespace = vars(module)

        # Class name must be a valid identifier: replace every non-word char.
        TheMonomer.__name__ = re.sub(r"\W", "_", name)
        TheMonomer.__module__ = module.__name__
        TheMonomer.name = name
        TheMonomer.code1 = code1
        TheMonomer.code3 = code3
        TheMonomer.is_modified = is_modified

        # Save the class in data.monomers so that it can be pickled.
        # Some names are not unique, we append underscores to them
        # in order to fix it.
        while TheMonomer.__name__ in namespace:
            TheMonomer.__name__ += "_"
        namespace[TheMonomer.__name__] = TheMonomer

        # NOTE(review): only the 1-letter table is assumed to be guarded by
        # `is_modified`; confirm against the upstream repository.
        if not is_modified:
            cls.by_code1[code1] = TheMonomer
        cls.by_code3[code3] = TheMonomer
        cls.by_name[name] = TheMonomer

        # We duplicate distinguished long names into Monomer itself, so that
        # we can use Monomer.from_code3 to create the relevant type of
        # monomer.
        Monomer.by_code3[code3] = TheMonomer
        Monomer.by_name[name] = TheMonomer

    @classmethod
    def _initialize(cls, codes=None):
        """Create all relevant subclasses of Monomer.

        `codes` is an iterable of (code1, is_modified, code3, name) tuples.
        When it is omitted (None) nothing is created.
        """
        # `codes or ()` keeps the documented default usable instead of
        # failing with TypeError on iterating None.
        for code1, is_modified, code3, name in codes or ():
            cls._subclass(name, code1, code3, is_modified)

    @classmethod
    def from_code1(cls, code1):
        """Create new monomer from 1-letter code (case-insensitive).

        Raises KeyError if the code is not registered in `cls.by_code1`.
        """
        return cls.by_code1[code1.upper()]()

    @classmethod
    def from_code3(cls, code3):
        """Create new monomer from 3-letter code (case-insensitive).

        Raises KeyError if the code is not registered in `cls.by_code3`.
        """
        return cls.by_code3[code3.upper()]()

    @classmethod
    def from_name(cls, name):
        """Create new monomer from full name.

        The name is normalized the same way as in `_subclass` (stripped and
        capitalized). Raises KeyError if it is not registered.
        """
        return cls.by_name[name.strip().capitalize()]()

    def __repr__(self):
        return "<Monomer %s>" % str(self.code1)

    def __str__(self):
        """Returns one-letter code"""
        return self.code1

    def __eq__(self, other):
        """Monomers within same monomer type are compared by code1.

        Any false value (e.g. None, used for gaps) compares unequal.
        """
        if not other:
            return False
        assert self.type == other.type
        return self.code1 == other.code1

    def __ne__(self, other):
        return not (self == other)

class Sequence(list):
    """Sequence of Monomers.

    This behaves like list of monomer objects. In addition to standard list
    behaviour, Sequence has the following attributes:

    *   name -- str with the name of the sequence
    *   description -- str with description of the sequence
    *   source -- str denoting source of the sequence

    Any of them may be empty (i.e. hold empty string)
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    name = ''
    description = ''
    source = ''

    @classmethod
    def from_monomers(cls, monomers=(), name=None, description=None, source=None):
        """Create sequence from a list of monomer objects.

        `name`, `description` and `source` are only stored on the result
        when they are true values; otherwise the class-level empty-string
        defaults stay in effect.
        """
        # The default is an immutable empty tuple rather than a shared list.
        result = cls(monomers)
        if name:
            result.name = name
        if description:
            result.description = description
        if source:
            result.source = source
        return result

    @classmethod
    def from_string(cls, string, name='', description='', source=''):
        """Create sequence from string of one-letter codes.

        Every character is converted with `cls.types.Monomer.from_code1`.
        """
        monomer = cls.types.Monomer.from_code1
        monomers = [monomer(letter) for letter in string]
        return cls.from_monomers(monomers, name, description, source)

    def __repr__(self):
        # Prefer the name; fall back to the one-letter-code string.
        if self.name:
            return '<Sequence %s>' % str(self.name)
        else:
            return '<Sequence %s>' % str(self)

    def __str__(self):
        """Returns sequence of one-letter codes."""
        return ''.join(monomer.code1 for monomer in self)

    def __hash__(self):
        """Hash sequence by identity."""
        # list is unhashable; identity hash lets sequences be used as keys
        # of Column dicts.
        return id(self)

153

154

class Alignment(object):
    """Alignment. It is a list of Columns.

    `self.sequences` holds the sequences (rows) and `self.columns` holds
    Column objects, each mapping sequence -> monomer. A gap is represented
    by the sequence being absent from the column.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    sequences = None
    """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

    def __init__(self):
        """Initialize empty alignment."""
        self.sequences = []
        self.columns = []

    # Alignment grow & IO methods
    # ==============================

    def append_sequence(self, sequence):
        """Add sequence to alignment. Return self.

        If sequence is too short, pad it with gaps on the right.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(sequence))
        for column, monomer in zip(self.columns, sequence):
            column[sequence] = monomer
        return self

    def append_row_from_string(self, string,
            name='', description='', source='', gaps=default_gaps):
        """Add row from a string of one-letter codes and gaps. Return self.

        Characters belonging to `gaps` leave the corresponding column empty
        for the new sequence; all other characters become monomers.
        """
        Sequence = self.types.Sequence
        # presumably removes every character of `gaps` from `string`;
        # verify against util.remove_each
        without_gaps = util.remove_each(string, gaps)
        sequence = Sequence.from_string(without_gaps, name, description, source)
        self._pad_to_width(len(string))
        non_gap_columns = [column
            for column, char in zip(self.columns, string)
            if char not in gaps
        ]
        for monomer, column in zip(sequence, non_gap_columns):
            column[sequence] = monomer
        self.sequences.append(sequence)
        return self

    def append_row_with_gaps(self, row, sequence):
        """Add row from row_as_list representation and sequence. Return self.

        `row` is a list of monomers with false values (None) at gap positions.
        """
        self.sequences.append(sequence)
        self._pad_to_width(len(row))
        for column, monomer in zip(self.columns, row):
            if monomer:
                column[sequence] = monomer
        return self

    def _pad_to_width(self, n):
        """Pad alignment with empty columns on the right to width n."""
        for i in range(len(self.columns), n):
            self.columns.append(Column(self))

    def append_file(self, file, format='fasta', gaps=default_gaps):
        """Append sequences from file to alignment. Return self.

        If sequences in file have gaps (detected as characters belonging to
        `gaps` set), treat them accordingly.

        `file.name` is recorded as the source of every appended sequence.
        """
        io = fileio.File(file, format)
        for name, description, body in io.read_strings():
            self.append_row_from_string(body, name, description, file.name, gaps)
        return self

    def to_file(self, file, format='fasta', gap='-'):
        """Write alignment to file as sequences with gaps.

        `gap` is the symbol written at gap positions.
        """
        # Pass `gap` on; otherwise the argument would be silently ignored.
        strings = [(s, s.sequence.name, s.sequence.description)
            for s in self.rows_as_strings(gap)]
        fileio.File(file, format).write_strings(strings)

    # Data access methods for alignment
    # =================================

    def rows(self):
        """Return list of rows (temporary objects) in alignment.

        Each row is a dictionary of { column : monomer }.

        For gap positions there is no key for the column in row.

        Each row has attribute `sequence` pointing to the sequence the row is
        describing.

        Modifications of row have no effect on the alignment.
        """
        # For now, the function returns a list rather than iterator.
        # It is yet to see, whether memory performance here becomes critical,
        # or is random access useful.
        rows = []
        for sequence in self.sequences:
            row = util.UserDict()
            row.sequence = sequence
            for column in self.columns:
                if sequence in column:
                    row[column] = column[sequence]
            rows.append(row)
        return rows

    def rows_as_lists(self):
        """Return list of rows (temporary objects) in alignment.

        Each row here is a list of either monomer or None (for gaps).

        Each row has attribute `sequence` pointing to the sequence of row.

        Modifications of row have no effect on the alignment.
        """
        rows = []
        for sequence in self.sequences:
            row = util.UserList()
            row.sequence = sequence
            for column in self.columns:
                row.append(column.get(sequence))
            rows.append(row)
        return rows

    def rows_as_strings(self, gap='-'):
        """Return list of string representation of rows in alignment.

        Each row has attribute `sequence` pointing to the sequence of row.

        `gap` is the symbol to use for gap.
        """
        rows = []
        for sequence in self.sequences:
            # Build the row in one join instead of repeated concatenation.
            string = "".join(
                column[sequence].code1 if sequence in column else gap
                for column in self.columns
            )
            string = util.UserString(string)
            string.sequence = sequence
            rows.append(string)
        return rows

    def row_as_list(self, sequence):
        """Return representation of row as list with `Monomers` and `None`s."""
        return [column.get(sequence) for column in self.columns]

    def row_as_string(self, sequence, gap='-'):
        """Return string representation of row in alignment.

        String will have gaps represented by `gap` symbol (defaults to '-').
        """
        def char(monomer):
            if monomer:
                return monomer.code1
            return gap
        row = self.row_as_list(sequence)
        return "".join(map(char, row))

    def columns_as_lists(self):
        """Return list of columns (temporary objects) in alignment.

        Each column here is a list of either monomer or None (for gaps).

        Items of column are sorted in the same way as alignment.sequences.

        Modifications of column have no effect on the alignment.
        """
        columns = []
        for column in self.columns:
            col = []
            for sequence in self.sequences:
                col.append(column.get(sequence))
            columns.append(col)
        return columns

    # Alignment / Block editing methods
    # =================================

    def flush(self, whence='left'):
        """Remove all gaps from alignment and flush results to one side.

        `whence` must be one of 'left', 'right' or 'center'

        Raises AssertionError for any other value.
        """
        if whence == 'left':
            from processors import Left as Flush
        elif whence == 'right':
            from processors import Right as Flush
        elif whence == 'center':
            from processors import Center as Flush
        else:
            # Call form of raise is valid in both Python 2 and Python 3.
            raise AssertionError("Whence must be left, right or center")
        self.realign(Flush())

    def remove_gap_columns(self):
        """Remove all empty columns."""
        # Walk backwards so that deletions do not shift pending indices.
        for n, column in reversed(list(enumerate(self.columns))):
            if column == {}:
                self.columns[n:n+1] = []

    def _wipe_row(self, sequence):
        """Turn all row positions into gaps (but keep sequences intact)."""
        for column in self.columns:
            if sequence in column:
                del column[sequence]

    def _merge(self, dst, new, merge):
        """Replace contents of `dst` with those of `new`.

        Replace contents of elements using function `merge(dst_el, new_el)`.
        """
        for el, new_el in zip(dst, new):
            merge(el, new_el)
        # Link the tail of `new` that `dst` lacks, then drop any excess.
        dst[len(dst):] = new[len(dst):]
        del dst[len(new):]

    def _replace_sequence_contents(self, new, copy_descriptions):
        """Replace contents of sequences with those of `new` alignment."""
        # XXX: we manually copy sequence contents here
        # XXX: we only copy, overlapping parts and link to the rest
        def merge_monomers(dst, new):
            # Monomer kind is carried by its class, so swap the class.
            dst.__class__ = new.__class__
        def merge_sequences(dst, new):
            if copy_descriptions:
                vars(dst).update(vars(new))
            self._merge(dst, new, merge_monomers)
        self._merge(self.sequences, new.sequences, merge_sequences)

    def _replace_column_contents(self, new):
        """Replace column contents with those of `new` alignment.

        In other words: copy gap patterns from `new` to `self`.

        `self.sequences` and `new.sequences` should have the same contents.
        """
        for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):
            sequence = row.sequence
            # Explicit lists: len() must work even where filter() is lazy.
            monomers = [monomer for monomer in row if monomer]
            new_monomers = [monomer for monomer in new_row if monomer]
            assert len(monomers) == len(new_monomers)
            self._wipe_row(sequence)
            non_gap_columns = [column
                for column, monomer in zip(self.columns, new_row)
                if monomer
            ]
            for monomer, column in zip(monomers, non_gap_columns):
                column[sequence] = monomer

    def _replace_contents(self, new, copy_descriptions, copy_contents):
        """Replace alignment contents with those of other alignment."""
        if copy_contents:
            self._replace_sequence_contents(new, copy_descriptions)
        self._replace_column_contents(new)

    def process(self, function, copy_descriptions=True, copy_contents=True):
        """Apply function to the alignment (or block); inject results back.

        - `function(block)` must return block with same line order.
        - if `copy_descriptions` is False, ignore new sequence names.
        - if `copy_contents` is False, don't copy sequence contents too.

        `function` (object) may have attributes `copy_descriptions` and
        `copy_contents`, which override the same named arguments.
        """
        new = function(self)
        if hasattr(function, 'copy_descriptions'):
            copy_descriptions = function.copy_descriptions
        if hasattr(function, 'copy_contents'):
            copy_contents = function.copy_contents
        self._replace_contents(new, copy_descriptions, copy_contents)

    def realign(self, function):
        """Realign self.

        I.e.: apply function to self to produce a new alignment, then update
        self to have the same gap patterns as the new alignment.

        This is the same as process(function, False, False)
        """
        new = function(self)
        self._replace_column_contents(new)

434

class Column(dict):
    """Column of alignment.

    Column is a dict of { sequence : monomer }.

    For sequences that have a gap in this column, the sequence key is not
    present in the column.
    """

    types = base
    """Mapping of related types. SHOULD be redefined in subclasses."""

    def __init__(self, alignment):
        """Create an empty column belonging to `alignment`."""
        self.alignment = alignment
        super(Column, self).__init__()

    def __hash__(self):
        """Return hash by identity."""
        return id(self)

    def MyIndex(self):
        """Return position of this column in `self.alignment.columns`.

        Raises ValueError if the column is not in that list.
        """
        # Search by identity: columns compare equal by dict contents, so an
        # equality-based search could return another column with the same
        # contents.
        for index, column in enumerate(self.alignment.columns):
            if column is self:
                return index
        raise ValueError("column does not belong to its alignment")

    def __repr__(self):
        return "<Column %s>" % (str(self.MyIndex()))

class Block(Alignment):
    """Block of alignment.

    A block is the intersection of some rows and some columns of an
    alignment. Both collections are kept as ordered lists, so the display
    order of the alignment is retained and may be tweaked. Most blocks look
    like a rectangular part of the alignment once its rows are shuffled the
    right way.
    """

    alignment = None
    """Alignment the block belongs to."""

    sequences = ()
    """List of sequences in block."""

    columns = ()
    """List of columns in block."""

    @classmethod
    def from_alignment(cls, alignment, sequences=None, columns=None):
        """Build new block from alignment.

        Omitted `sequences` default to all sequences of the alignment and
        omitted `columns` default to all of its columns.

        The defaults are the very list objects held by the alignment, not
        copies, so sequences or columns added to the alignment later show up
        in the block as well.
        """
        block = cls()
        block.alignment = alignment
        if sequences is None:
            block.sequences = alignment.sequences
        else:
            block.sequences = sequences
        if columns is None:
            block.columns = alignment.columns
        else:
            block.columns = columns
        return block

505

506 # vim: set ts=4 sts=4 sw=4 et: