allpy: 2f016bfda114 allpy/base.py

allpy

view allpy/base.py @ 669:2f016bfda114

Massive changes to my program

author	Boris Burkov <BurkovBA@gmail.com>
date	Fri, 01 Jul 2011 09:52:49 +0400
parents	c32e728b98d7
children	1a4a04829ba6

line source

1 import sys

2 import re

4 import util

5 import fileio

6 import data.monomers

8 # import this very module as means of having all related classes in one place

9 import base

11 default_gaps = set((".", "-", "~"))

12 """Set of characters to recoginze as gaps when parsing alignment."""

14 class Monomer(object):

15 """Monomer object."""

17 type = None

18 """Either of 'dna', 'rna', 'protein'."""

20 types = base

21 """Mapping of related types. SHOULD be redefined in subclasses."""

23 by_code1 = {}

24 """A mapping from 1-letter code to Monomer subclass."""

26 by_code3 = {}

27 """A mapping from 3-letter code to Monomer subclass."""

29 by_name = {}

30 """A mapping from full monomer name to Monomer subclass."""

32 @classmethod

33 def _subclass(cls, name='', code1='', code3='', is_modified=False):

34 """Create new subclass of Monomer for given monomer type."""

35 class TheMonomer(cls):

36 pass

37 name = name.strip().capitalize()

38 code1 = code1.upper()

39 code3 = code3.upper()

40 module = vars(data.monomers)[cls.type]

41 TheMonomer.__name__ = re.sub(r"\W", "_", name)

42 TheMonomer.__module__ = module.__name__

43 TheMonomer.name = name

44 TheMonomer.code1 = code1

45 TheMonomer.code3 = code3

46 TheMonomer.is_modified = is_modified

47 # Save the class in data.monomers so that it can be pickled

48 # Some names are not unique, we append underscores to them

49 # in order to fix it.

50 while TheMonomer.__name__ in vars(module):

51 TheMonomer.__name__ += "_"

52 vars(module)[TheMonomer.__name__] = TheMonomer

53 if not is_modified:

54 cls.by_code1[code1] = TheMonomer

55 cls.by_code3[code3] = TheMonomer

56 cls.by_name[name] = TheMonomer

57 # We duplicate distinguished long names into Monomer itself, so that we

58 # can use Monomer.from_code3 to create the relevant type of monomer.

59 Monomer.by_code3[code3] = TheMonomer

60 Monomer.by_name[name] = TheMonomer

62 @classmethod

63 def _initialize(cls, codes=None):

64 """Create all relevant subclasses of Monomer."""

65 for code1, is_modified, code3, name in codes:

66 cls._subclass(name, code1, code3, is_modified)

68 @classmethod

69 def from_code1(cls, code1):

70 """Create new monomer from 1-letter code."""

71 return cls.by_code1[code1.upper()]()

73 @classmethod

74 def from_code3(cls, code3):

75 """Create new monomer from 3-letter code."""

76 return cls.by_code3[code3.upper()]()

78 @classmethod

79 def from_name(cls, name):

80 """Create new monomer from full name."""

81 return cls.by_name[name.strip().capitalize()]()

83 def __repr__(self):

84 return "<Monomer %s>" % str(self.code1)

86 def __str__(self):

87 """Returns one-letter code"""

88 return self.code1

90 def __eq__(self, other):

91 """Monomers within same monomer type are compared by code1."""

92 #import traceback

93 #for entry in traceback.extract_stack():

94 # if "remove_contained_preblocks" in entry[2]:

95 # print traceback.extract_stack()

96 if not other:

97 return False

98 assert self.type == other.type

99 return self.code1 == other.code1

100

101 def __ne__(self, other):

102 return not (self == other)

103

104 class Sequence(list):

105 """Sequence of Monomers.

106

107 This behaves like list of monomer objects. In addition to standard list

108 behaviour, Sequence has the following attributes:

109

110 * name -- str with the name of the sequence

111 * description -- str with description of the sequence

112 * source -- str denoting source of the sequence

113

114 Any of them may be empty (i.e. hold empty string)

115 """

116

117 types = base

118 """Mapping of related types. SHOULD be redefined in subclasses."""

119

120 name = ''

121 description = ''

122 source = ''

123

124 @classmethod

125 def from_monomers(cls, monomers=[], name=None, description=None, source=None):

126 """Create sequence from a list of monomer objecst."""

127 result = cls(monomers)

128 if name:

129 result.name = name

130 if description:

131 result.description = description

132 if source:

133 result.source = source

134 return result

135

136 @classmethod

137 def from_string(cls, string, name='', description='', source=''):

138 """Create sequences from string of one-letter codes."""

139 monomer = cls.types.Monomer.from_code1

140 monomers = [monomer(letter) for letter in string]

141 return cls.from_monomers(monomers, name, description, source)

142

143 def __repr__(self):

144 if self.name:

145 return '<Sequence %s>' % str(self.name)

146 else:

147 return '<Sequence %s>' % str(self)

148

149

150 def __str__(self):

151 """Returns sequence of one-letter codes."""

152 return ''.join(monomer.code1 for monomer in self)

153

154 def __hash__(self):

155 """Hash sequence by identity."""

156 return id(self)

157

158

159 class Alignment(object):

160 """Alignment. It is a list of Columns."""

161

162 types = base

163 """Mapping of related types. SHOULD be redefined in subclasses."""

164

165 sequences = None

166 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

167

168 def __init__(self):

169 """Initialize empty alignment."""

170 self.sequences = []

171 self.columns = []

172

173 # Alignment grow & IO methods

174 # ==============================

175

176 def append_sequence(self, sequence):

177 """Add sequence to alignment. Return self.

178

179 If sequence is too short, pad it with gaps on the right.

180 """

181 self.sequences.append(sequence)

182 self._pad_to_width(len(sequence))

183 for column, monomer in zip(self.columns, sequence):

184 column[sequence] = monomer

185 return self

186

187 def append_row_from_string(self, string,

188 name='', description='', source='', gaps=default_gaps):

189 """Add row from a string of one-letter codes and gaps. Return self."""

190 Sequence = self.types.Sequence

191 without_gaps = util.remove_each(string, gaps)

192 sequence = Sequence.from_string(without_gaps, name, description, source)

193 self._pad_to_width(len(string))

194 non_gap_columns = [column

195 for column, char in zip(self.columns, string)

196 if char not in gaps

197 ]

198 for monomer, column in zip(sequence, non_gap_columns):

199 column[sequence] = monomer

200 self.sequences.append(sequence)

201 return self

202

203 def append_row_with_gaps(self, row, sequence):

204 """Add row from row_as_list representation and sequence. Return self."""

205 self.sequences.append(sequence)

206 self._pad_to_width(len(row))

207 for column, monomer in zip(self.columns, row):

208 if monomer:

209 column[sequence] = monomer

210 return self

211

212 def _pad_to_width(self, n):

213 """Pad alignment with empty columns on the right to width n."""

214 for i in range(len(self.columns), n):

215 self.columns.append(Column(self))

216

217 def append_file(self, file, format='fasta', gaps=default_gaps):

218 """Append sequences from file to alignment. Return self.

219

220 If sequences in file have gaps (detected as characters belonging to

221 `gaps` set), treat them accordingly.

222 """

223 sequences = []

224 io = fileio.File(file, format)

225 for name, description, body in io.read_strings():

226 self.append_row_from_string(body, name, description, file.name, gaps)

227 return self

228

229 def to_file(self, file, format='fasta', gap='-'):

230 """Write alignment in FASTA file as sequences with gaps."""

231 strings = [(s, s.sequence.name, s.sequence.description)

232 for s in self.rows_as_strings()]

233 fileio.File(file, format).write_strings(strings)

234

235 # Data access methods for alignment

236 # =================================

237

238 def rows(self):

239 """Return list of rows (temporary objects) in alignment.

240

241 Each row is a dictionary of { column : monomer }.

242

243 For gap positions there is no key for the column in row.

244

245 Each row has attribute `sequence` pointing to the sequence the row is

246 describing.

247

248 Modifications of row have no effect on the alignment.

249 """

250 # For now, the function returns a list rather than iterator.

251 # It is yet to see, whether memory performance here becomes critical,

252 # or is random access useful.

253 rows = []

254 for sequence in self.sequences:

255 row = util.UserDict()

256 row.sequence = sequence

257 for column in self.columns:

258 if sequence in column:

259 row[column] = column[sequence]

260 rows.append(row)

261 return rows

262

263 def rows_as_lists(self):

264 """Return list of rows (temporary objects) in alignment.

265

266 Each row here is a list of either monomer or None (for gaps).

267

268 Each row has attribute `sequence` pointing to the sequence of row.

269

270 Modifications of row have no effect on the alignment.

271 """

272 rows = []

273 for sequence in self.sequences:

274 row = util.UserList()

275 row.sequence = sequence

276 for column in self.columns:

277 row.append(column.get(sequence))

278 rows.append(row)

279 return rows

280

281 def rows_as_strings(self, gap='-'):

282 """Return list of string representation of rows in alignment.

283

284 Each row has attribute `sequence` pointing to the sequence of row.

285

286 `gap` is the symbol to use for gap.

287 """

288 rows = []

289 for sequence in self.sequences:

290 string = ""

291 for column in self.columns:

292 if sequence in column:

293 string += column[sequence].code1

294 else:

295 string += gap

296 string = util.UserString(string)

297 string.sequence = sequence

298 rows.append(string)

299 return rows

300

301 def row_as_list(self, sequence):

302 """Return representaion of row as list with `Monomers` and `None`s."""

303 return [column.get(sequence) for column in self.columns]

304

305 def row_as_string(self, sequence, gap='-'):

306 """Return string representaion of row in alignment.

307

308 String will have gaps represented by `gap` symbol (defaults to '-').

309 """

310 def char(monomer):

311 if monomer:

312 return monomer.code1

313 return gap

314 row = self.row_as_list(sequence)

315 return "".join(map(char, row))

316

317 def columns_as_lists(self):

318 """Return list of columns (temorary objects) in alignment.

319

320 Each column here is a list of either monomer or None (for gaps).

321

322 Items of column are sorted in the same way as alignment.sequences.

323

324 Modifications of column have no effect on the alignment.

325 """

326 columns = []

327 for column in self.columns:

328 col = []

329 for sequence in self.sequences:

330 col.append(column.get(sequence))

331 columns.append(col)

332 return columns

333

334 # Alignment / Block editing methods

335 # =================================

336

337 def flush(self, whence='left'):

338 """Remove all gaps from alignment and flush results to one side.

339

340 `whence` must be one of 'left', 'right' or 'center'

341 """

342 if whence == 'left':

343 from processors import Left as Flush

344 elif whence == 'right':

345 from processors import Right as Flush

346 elif whence == 'center':

347 from processors import Center as Flush

348 else:

349 raise AssertionError, "Whence must be left, right or center"

350 self.realign(Flush())

351

352 def remove_gap_columns(self):

353 """Remove all empty columns."""

354 for n, column in reversed(list(enumerate(self.columns))):

355 if column == {}:

356 self.columns[n:n+1] = []

357

358 def _wipe_row(self, sequence):

359 """Turn all row positions into gaps (but keep sequences intact)."""

360 for column in self.columns:

361 if sequence in column:

362 del column[sequence]

363

364 def _merge(self, dst, new, merge):

365 """Replace contents of `dst` with those of `new`.

366

367 Replace contents of elements using function `merge(dst_el, new_le)`.

368 """

369 for el, new_el in zip(dst, new):

370 merge(el, new_el)

371 dst[len(dst):] = new[len(dst):]

372 del dst[len(new):]

373

374 def _replace_sequence_contents(self, new, copy_descriptions):

375 """Replace contents of sequences with those of `new` alignment."""

376 # XXX: we manually copy sequence contents here

377 # XXX: we only copy, overlapping parts and link to the rest

378 def merge_monomers(dst, new):

379 dst.__class__ = new.__class__

380 def merge_sequences(dst, new):

381 if copy_descriptions:

382 vars(dst).update(vars(new))

383 self._merge(dst, new, merge_monomers)

384 self._merge(self.sequences, new.sequences, merge_sequences)

385

386 def _replace_column_contents(self, new):

387 """Replace column contents with those of `new` alignment.

388

389 In other words: copy gap patterns from `new` to `self`.

390

391 `self.sequences` and `new.sequences` should have the same contents.

392 """

393 for row, new_row in zip(self.rows_as_lists(), new.rows_as_lists()):

394 sequence = row.sequence

395 monomers = filter(None, row)

396 assert len(monomers) == len(filter(None, new_row))

397 self._wipe_row(sequence)

398 non_gap_columns = [column

399 for column, monomer in zip(self.columns, new_row)

400 if monomer

401 ]

402 for monomer, column in zip(monomers, non_gap_columns):

403 column[sequence] = monomer

404

405 def _replace_contents(self, new, copy_descriptions, copy_contents):

406 """Replace alignment contents with those of other alignment."""

407 if copy_contents:

408 self._replace_sequence_contents(new, copy_descriptions)

409 self._replace_column_contents(new)

410

411 def process(self, function, copy_descriptions=True, copy_contents=True):

412 """Apply function to the alignment (or block); inject results back.

413

414 - `function(block)` must return block with same line order.

415 - if `copy_descriptions` is False, ignore new sequence names.

416 - if `copy_contents` is False, don't copy sequence contents too.

417

418 `function` (object) may have attributes `copy_descriptions` and

419 `copy_contents`, which override the same named arguments.

420 """

421 new = function(self)

422 if hasattr(function, 'copy_descriptions'):

423 copy_descriptions = function.copy_descriptions

424 if hasattr(function, 'copy_contents'):

425 copy_contents = function.copy_contents

426 self._replace_contents(new, copy_descriptions, copy_contents)

427

428 def realign(self, function):

429 """Realign self.

430

431 I.e.: apply function to self to produce a new alignment, then update

432 self to have the same gap patterns as the new alignment.

433

434 This is the same as process(function, False, False)

435 """

436 new = function(self)

437 self._replace_column_contents(new)

438

439 class Column(dict):

440 """Column of alignment.

441

442 Column is a dict of { sequence : monomer }.

443

444 For sequences that have gaps in current row, given key is not present in

445 the column.

446 """

447

448 types = base

449 """Mapping of related types. SHOULD be redefined in subclasses."""

450

451 def __init__(self, alignment):

452 self.alignment = alignment

453 super(Column, self).__init__()

454

455 def __hash__(self):

456 """Return hash by identity."""

457 return id(self)

458

459 def MyIndex(self):

460 for index, column in enumerate(self.alignment.columns):

461 if column is self: return index

462 raise ValueException

463

464 def __repr__(self):

465 #!!!!!!!!! READ HOW index OF LIST COMPARES OBJECTS AND BASIC TYPES

466 return "<Column %s>"%(str(self.MyIndex()))

471 class Block(Alignment):

472 """Block of alignment.

473

474 Block is an intersection of several rows & columns. (The collections of

475 rows and columns are represented as ordered lists, to retain display order

476 of Alignment or add ability to tweak it). Most of blocks look like

477 rectangular part of alignment if you shuffle alignment rows the right way.

478 """

479

480 alignment = None

481 """Alignment the block belongs to."""

482

483 sequences = ()

484 """List of sequences in block."""

485

486 columns = ()

487 """List of columns in block."""

488

489 @classmethod

490 def from_alignment(cls, alignment, sequences=None, columns=None):

491 """Build new block from alignment.

492

493 If sequences are not given, the block uses all sequences in alignment.

494

495 If columns are not given, the block uses all columns in alignment.

496

497 In both cases we use exactly the list used in alignment, thus, if new

498 sequences or columns are added to alignment, the block tracks this too.

499 """

500 if sequences is None:

501 sequences = alignment.sequences

502 if columns is None:

503 columns = alignment.columns

504 block = cls()

505 block.alignment = alignment

506 block.sequences = sequences

507 block.columns = columns

508 return block

509

510 # vim: set ts=4 sts=4 sw=4 et: