Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/annotate/27733e1f2be0/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Mar 2 07:03:39 2014
Кодировка:
allpy: allpy/base.py annotate

allpy

annotate allpy/base.py @ 362:27733e1f2be0

Renamed Alignment.from_file -> Alignment.append_file
author Daniil Alexeyevsky <me.dendik@gmail.com>
date Wed, 26 Jan 2011 20:45:34 +0300
parents 0bdf8e55dd86
children 5643facbe8e7
rev   line source
me@261 1 import sys
bnagaev@357 2 import re
me@261 3
me@315 4 import util
me@284 5 import fasta
me@260 6
default_gaps = set([".", "-", "~"])
"""Characters that are treated as gaps while an alignment is being parsed."""
me@306 9
class Monomer(object):
    """Monomer object.

    Concrete kinds of monomers are subclasses generated by `_subclass`
    (usually through `_initialize`). Every generated subclass carries the
    class attributes `name`, `code1`, `code3` and `is_modified` and is
    registered in the lookup tables below.
    """

    type = None
    """Either of 'dna', 'rna', 'protein'."""

    by_code1 = {}
    """A mapping from 1-letter code to Monomer subclass."""

    by_code3 = {}
    """A mapping from 3-letter code to Monomer subclass."""

    by_name = {}
    """A mapping from full monomer name to Monomer subclass."""

    @classmethod
    def _subclass(cls, name='', code1='', code3='', is_modified=False):
        """Create new subclass of Monomer for given monomer type.

        The name is stripped and capitalized, both codes are upper-cased,
        and the new class is named after `name` with every non-word
        character replaced by an underscore.

        The class is registered in `cls.by_code3` and `cls.by_name`, and
        additionally in `cls.by_code1` unless `is_modified` is true.
        """
        class TheMonomer(cls):
            pass
        name = name.strip().capitalize()
        code1 = code1.upper()
        code3 = code3.upper()
        TheMonomer.__name__ = re.sub(r"[^\w]", "_", name)
        TheMonomer.name = name
        TheMonomer.code1 = code1
        TheMonomer.code3 = code3
        TheMonomer.is_modified = is_modified
        # Only unmodified monomers are looked up by 1-letter code.
        # NOTE(review): nesting reconstructed from a listing that lost its
        # indentation; presumably a modified monomer shares code1 with its
        # unmodified counterpart -- confirm against the repository.
        if not is_modified:
            cls.by_code1[code1] = TheMonomer
        cls.by_code3[code3] = TheMonomer
        cls.by_name[name] = TheMonomer
        # We duplicate distinguished long names into Monomer itself, so that we
        # can use Monomer.from_code3 to create the relevant type of monomer.
        Monomer.by_code3[code3] = TheMonomer
        Monomer.by_name[name] = TheMonomer

    @classmethod
    def _initialize(cls, codes=None):
        """Create all relevant subclasses of Monomer.

        `codes` is an iterable of (type, code1, is_modified, code3, name)
        tuples. Only entries whose type starts with the same letter as
        `cls.type` are used. With `codes` omitted (None) nothing is created.
        """
        # NB. The table uses letters d, r, p for types,
        # while we use full words; hence, we compare by first letter
        for kind, code1, is_modified, code3, name in codes or ():
            if kind[0] == cls.type[0]:
                cls._subclass(name, code1, code3, is_modified)

    @classmethod
    def from_code1(cls, code1):
        """Create new monomer from 1-letter code (case-insensitive).

        Raises KeyError if the code is not registered in `cls.by_code1`.
        """
        return cls.by_code1[code1.upper()]()

    @classmethod
    def from_code3(cls, code3):
        """Create new monomer from 3-letter code (case-insensitive).

        Raises KeyError if the code is not registered in `cls.by_code3`.
        """
        return cls.by_code3[code3.upper()]()

    @classmethod
    def from_name(cls, name):
        """Create new monomer from full name.

        The name is stripped and capitalized before lookup. Raises KeyError
        if it is not registered in `cls.by_name`.
        """
        return cls.by_name[name.strip().capitalize()]()

    def __repr__(self):
        return '<Monomer %s>' % self.code3

    def __str__(self):
        """Returns one-letter code"""
        return self.code1

    def __eq__(self, other):
        """Monomers within same monomer type are compared by code1.

        Anything that is not a Monomer (e.g. None standing for a gap) is
        never equal to a monomer.
        """
        if not isinstance(other, Monomer):
            return NotImplemented
        assert self.type == other.type
        return self.code1 == other.code1

    def __ne__(self, other):
        """Exact negation of `__eq__`.

        Python 2 does not derive `!=` from `==`, so it is spelled out here.
        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        """Hash monomer by identity.

        Stated explicitly because Python 3 drops the inherited hash from a
        class that defines `__eq__`.
        """
        return id(self)
bnagaev@239 82
class Sequence(list):
    """Sequence of Monomers.

    This behaves like list of monomer objects. In addition to standard list
    behaviour, Sequence has the following attributes:

    * name -- str with the name of the sequence
    * description -- str with description of the sequence
    * source -- str denoting source of the sequence

    Any of them may be empty (i.e. hold empty string)

    Class attributes:

    * monomer_type -- type of monomers in sequence, must be redefined when
      subclassing
    """

    monomer_type = Monomer

    name = ''
    description = ''
    source = ''

    @classmethod
    def from_monomers(cls, monomers=(), name=None, description=None,
            source=None):
        """Create sequence from a list of monomer objects.

        The monomers are copied into the new sequence in the given order.
        `name`, `description` and `source` are stored on the sequence only
        when non-empty; otherwise the class-level empty strings stay in
        effect.
        """
        # The monomers have to be handed to the list constructor; a bare
        # cls() would silently yield an empty sequence.
        result = cls(monomers)
        if name:
            result.name = name
        if description:
            result.description = description
        if source:
            result.source = source
        return result

    @classmethod
    def from_string(cls, string, name='', description='', source=''):
        """Create sequence from string of one-letter codes.

        Every letter is turned into a monomer with
        `cls.monomer_type.from_code1`.
        """
        monomer = cls.monomer_type.from_code1
        monomers = [monomer(letter) for letter in string]
        return cls.from_monomers(monomers, name, description, source)

    def __repr__(self):
        return '<Sequence %s>' % str(self)

    def __str__(self):
        """Returns sequence of one-letter codes."""
        return ''.join(monomer.code1 for monomer in self)

    def __hash__(self):
        """Hash sequence by identity."""
        return id(self)
me@316 136
class Alignment(object):
    """Alignment. It is a list of Columns."""

    sequence_type = Sequence
    """Type of sequences in alignment. SHOULD be redefined when subclassing."""

    sequences = None
    """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""

    def __init__(self):
        """Initialize empty alignment."""
        self.sequences = []
        self.columns = []

    # Alignment grow & IO methods
    # ==============================

    def append_sequence(self, sequence):
        """Add sequence to alignment.

        Monomer number i of the sequence is placed into column number i;
        missing columns are created on the way.

        If sequence is too short, it is padded with gaps on the right: it
        simply has no entry in the remaining columns.
        """
        self.sequences.append(sequence)
        for i, monomer in enumerate(sequence):
            self.column_at(i)[sequence] = monomer

    def append_row(self, string, name='', description='', source='',
            gaps=default_gaps):
        """Add row from a string of one-letter codes and gaps.

        Characters belonging to `gaps` are gaps; all other characters are
        converted into a sequence of type `self.sequence_type`, and each
        monomer is placed into the column matching the position of its
        letter in `string`.
        """
        without_gaps = util.remove_each(string, gaps)
        sequence = self.sequence_type.from_string(
            without_gaps, name, description, source)
        # Every non-gap character has two positions:
        # - its position in alignment (j, the index within `string`)
        # - its position in sequence (i, the index among non-gap characters)
        alignment_positions = (
            j for j, char in enumerate(string) if char not in gaps)
        for i, j in enumerate(alignment_positions):
            self.column_at(j)[sequence] = sequence[i]
        self.sequences.append(sequence)

    def column_at(self, n):
        """Return column by index. Create new columns if required.

        Do NOT use this method, unless you are sure it is what you want.
        """
        while len(self.columns) <= n:
            self.columns.append(Column())
        return self.columns[n]

    def append_file(self, file, format='fasta', gaps=default_gaps):
        """Append sequences from file to alignment.

        If sequences in file have gaps (detected as characters belonging to
        `gaps` set), treat them accordingly.

        The `name` attribute of `file` is recorded as the source of every
        sequence; file-like objects without a name yield an empty source.

        Only the 'fasta' format is supported.

        Return self.
        """
        assert format == 'fasta'
        # Not every file-like object (e.g. StringIO) has a name.
        source = getattr(file, 'name', '')
        for (name, description, body) in fasta.parse_file(file):
            self.append_row(body, name, description, source, gaps)
        return self

    def to_fasta(self, file):
        """Write alignment in FASTA file as sequences with gaps.

        Gaps are written as "-".
        """
        for row in self.rows_as_lists():
            seq = row.sequence
            line = "".join(
                monomer.code1 if monomer else "-" for monomer in row)
            fasta.save_file(file, line, seq.name, seq.description)

    # Data access methods for alignment
    # =================================

    def rows(self):
        """Return list of rows (temporary objects) in alignment.

        Each row is a dictionary of { column : monomer }.

        For gap positions there is no key for the column in row.

        Each row has attribute `sequence` pointing to the sequence the row is
        describing.

        Modifications of row have no effect on the alignment.
        """
        # For now, the function returns a list rather than iterator.
        # It is yet to see, whether memory performance here becomes critical,
        # or is random access useful.
        rows = []
        for sequence in self.sequences:
            row = util.UserDict()
            row.sequence = sequence
            for column in self.columns:
                if sequence in column:
                    row[column] = column[sequence]
            rows.append(row)
        return rows

    def rows_as_lists(self):
        """Return list of rows (temporary objects) in alignment.

        Each row here is a list of either monomer or None (for gaps).

        Each row has attribute `sequence` pointing to the sequence of row.

        Modifications of row have no effect on the alignment.
        """
        rows = []
        for sequence in self.sequences:
            row = util.UserList()
            row.sequence = sequence
            for column in self.columns:
                row.append(column.get(sequence))
            rows.append(row)
        return rows

    def columns_as_lists(self):
        """Return list of columns (temporary objects) in alignment.

        Each column here is a list of either monomer or None (for gaps).

        Items of column are sorted in the same way as alignment.sequences.

        Modifications of column have no effect on the alignment.
        """
        return [
            [column.get(sequence) for sequence in self.sequences]
            for column in self.columns
        ]
me@299 276
class Column(dict):
    """One column of an alignment.

    A column maps { sequence : monomer }.

    A sequence that has a gap at this position is simply not a key of the
    column.
    """

    def __hash__(self):
        """Hash the column by its identity, not by its contents."""
        identity = id(self)
        return identity
me@300 289
class Block(Alignment):
    """Block of alignment.

    A block is the intersection of some set of columns with some set of
    rows. With a suitable ordering of the alignment rows most blocks look
    like a rectangular part of the alignment.
    """

    alignment = None
    """Alignment the block belongs to."""

    sequences = ()
    """List of sequences in block."""

    columns = ()
    """List of columns in block."""

    @classmethod
    def from_alignment(cls, alignment, sequences=None, columns=None):
        """Build new block from alignment.

        When `sequences` is None the block takes all sequences of the
        alignment; when `columns` is None it takes all columns.

        In either case the very list object of the alignment is reused, so
        sequences or columns added to the alignment later show up in the
        block as well.
        """
        block = cls()
        block.alignment = alignment
        block.sequences = alignment.sequences if sequences is None else sequences
        block.columns = alignment.columns if columns is None else columns
        return block

    def flush_left(self):
        """Move all monomers to the left, gaps to the right within block."""
        for row in self.rows_as_lists():
            sequence = row.sequence
            # Monomers of the row with the gaps squeezed out.
            monomers = [monomer for monomer in row if monomer]
            for index, column in enumerate(self.columns):
                if index < len(monomers):
                    column[sequence] = monomers[index]
                else:
                    # Past the last monomer only gaps remain.
                    column.pop(sequence, None)
me@312 339
me@312 340
me@260 341 # vim: set ts=4 sts=4 sw=4 et: