Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/annotate/2a6c2cffa891/allpy/base.py
Дата изменения: Unknown
Дата индексирования: Sun Mar 2 07:33:19 2014
Кодировка:
allpy: allpy/base.py annotate

allpy

annotate allpy/base.py @ 323:2a6c2cffa891

Fixed allpy.fasta.parse_file to not produce bogus empty sequence
author Daniil Alexeyevsky <me.dendik@gmail.com>
date Fri, 17 Dec 2010 00:38:40 +0300
parents 05983300140a
children 6fdc4406e2fd
rev   line source
me@261 1 import sys
me@261 2
me@315 3 import util
me@284 4 import fasta
me@260 5 import data.codes
me@260 6
me@306 7 default_gaps = set((".", "-", "~"))
me@306 8 """Set of characters to recoginze as gaps when parsing alignment."""
me@306 9
me@260 10 class MonomerType(object):
me@260 11 """Class of monomer types.
me@260 12
me@260 13 Each MonomerType object represents a known monomer type, e.g. Valine,
me@260 14 and is referenced to by each instance of monomer in a given sequence.
me@260 15
me@260 16 - `name`: full name of monomer type
me@260 17 - `code1`: one-letter code
me@260 18 - `code3`: three-letter code
me@260 19 - `is_modified`: either of True or False
me@260 20
me@260 21 class atributes:
me@260 22
me@260 23 - `by_code1`: a mapping from one-letter code to MonomerType object
me@260 24 - `by_code3`: a mapping from three-letter code to MonomerType object
me@260 25 - `by_name`: a mapping from monomer name to MonomerType object
me@260 26 - `instance_type`: class of Monomer objects to use when creating new
me@260 27 objects; this must be redefined in descendent classes
me@260 28
me@260 29 All of the class attributes MUST be redefined when subclassing.
me@260 30 """
me@260 31
me@260 32 by_code1 = {}
me@260 33 by_code3 = {}
me@260 34 by_name = {}
me@260 35 instance_type = None
me@260 36
me@260 37 def __init__(self, name="", code1="", code3="", is_modified=False):
me@310 38 super(MonomerType, self).__init__()
me@260 39 self.name = name.capitalize()
me@260 40 self.code1 = code1.upper()
me@260 41 self.code3 = code3.upper()
me@260 42 self.is_modified = bool(is_modified)
me@260 43 if not is_modified:
me@260 44 self.by_code1[self.code1] = self
me@260 45 self.by_code3[code3] = self
me@260 46 self.by_name[name] = self
me@260 47 # We duplicate distinguished long names into MonomerType itself,
me@260 48 # so that we can use MonomerType.from_code3 to create the relevant
me@260 49 # type of monomer.
me@260 50 MonomerType.by_code3[code3] = self
me@260 51 MonomerType.by_name[name] = self
me@260 52
me@260 53 @classmethod
me@260 54 def _initialize(cls, type_letter, codes=data.codes.codes):
me@260 55 """Create all relevant instances of MonomerType.
me@260 56
me@260 57 `type_letter` is either of:
me@260 58
me@260 59 - 'p' for protein
me@260 60 - 'd' for DNA
me@260 61 - 'r' for RNA
me@260 62
me@260 63 `codes` is a table of monomer codes
me@260 64 """
me@260 65 for type, code1, is_modified, code3, name in codes:
me@260 66 if type == type_letter:
me@260 67 cls(name, code1, code3, is_modified)
me@260 68
me@260 69 @classmethod
me@260 70 def from_code1(cls, code1):
me@260 71 """Return monomer type by one-letter code."""
me@260 72 return cls.by_code1[code1.upper()]
me@260 73
me@260 74 @classmethod
me@260 75 def from_code3(cls, code3):
me@260 76 """Return monomer type by three-letter code."""
me@260 77 return cls.by_code3[code3.upper()]
me@260 78
me@260 79 @classmethod
me@260 80 def from_name(cls, name):
me@260 81 """Return monomer type by name."""
me@260 82 return cls.by_name[name.capitalize()]
me@260 83
me@260 84 def instance(self):
me@260 85 """Create a new monomer of given type."""
me@260 86 return self.instance_type(self)
me@260 87
me@260 88 def __eq__(self, other):
me@260 89 if hasattr(other, "type"):
me@260 90 return self is other.type
me@260 91 return self is other
me@260 92
me@260 93 class Monomer(object):
me@260 94 """Monomer object.
me@260 95
me@260 96 attributes:
me@260 97
me@260 98 - `type`: type of monomer (a MonomerType object)
me@260 99
me@282 100 class attributes:
me@282 101
me@282 102 - `monomer_type`: either MonomerType or one of it's subclasses, it is used
me@282 103 when creating new monomers. It SHOULD be redefined when subclassing
me@282 104 Monomer.
me@260 105 """
me@260 106 monomer_type = MonomerType
me@260 107
me@260 108 def __init__(self, type):
me@310 109 super(Monomer, self).__init__()
me@260 110 self.type = type
me@260 111
me@260 112 @classmethod
me@260 113 def from_code1(cls, code1):
me@260 114 return cls(cls.monomer_type.by_code1[code1.upper()])
me@260 115
me@260 116 @classmethod
me@260 117 def from_code3(cls, code3):
me@260 118 return cls(cls.monomer_type.by_code3[code3.upper()])
me@260 119
me@260 120 @classmethod
me@260 121 def from_name(cls, name):
me@260 122 return cls(cls.monomer_type.by_name[name.capitalize()])
me@260 123
me@322 124 @property
me@322 125 def code1(self):
me@322 126 return self.type.code1
me@322 127
me@322 128 @property
me@322 129 def code3(self):
me@322 130 return self.type.code3
me@322 131
me@322 132 @property
me@322 133 def name(self):
me@322 134 return self.type.name
me@322 135
me@322 136 @property
me@322 137 def is_modified(self):
me@322 138 return self.type.is_modified
me@322 139
me@260 140 def __eq__(self, other):
me@260 141 if hasattr(other, "type"):
me@260 142 return self.type is other.type
me@260 143 return self.type is other
bnagaev@239 144
bnagaev@239 145 class Sequence(list):
me@274 146 """Sequence of Monomers.
bnagaev@243 147
me@274 148 This behaves like list of monomer objects. In addition to standard list
me@274 149 behaviour, Sequence has the following attributes:
me@270 150
me@274 151 * name -- str with the name of the sequence
me@274 152 * description -- str with description of the sequence
me@274 153 * source -- str denoting source of the sequence
me@266 154
me@274 155 Any of them may be empty (i.e. hold empty string)
me@275 156
me@275 157 Class attributes:
me@282 158
me@275 159 * monomer_type -- type of monomers in sequence, must be redefined when
me@275 160 subclassing
me@274 161 """
me@270 162
me@275 163 monomer_type = Monomer
me@270 164
me@275 165 name = ''
me@275 166 description = ''
me@275 167 source = ''
me@275 168
me@275 169 def __init__(self, sequence=[], name=None, description=None, source=None):
me@275 170 super(Sequence, self).__init__(sequence)
me@275 171 if hasattr(sequence, 'name'):
me@275 172 vars(self).update(vars(sequence))
me@275 173 if name:
me@275 174 self.name = name
me@275 175 if description:
me@275 176 self.description = description
me@275 177 if source:
me@275 178 self.source = source
me@270 179
me@262 180 def __str__(self):
me@275 181 """Returns sequence in one-letter code."""
me@275 182 return ''.join(monomer.code1 for monomer in self)
me@270 183
me@316 184 def __hash__(self):
me@316 185 """Hash sequence by identity."""
me@316 186 return id(self)
me@316 187
me@273 188 @classmethod
me@321 189 def from_string(cls, string, name='', description='', source=''):
me@273 190 """Create sequences from string of one-letter codes."""
me@273 191 monomer = cls.monomer_type.from_code1
me@273 192 monomers = [monomer(letter) for letter in string]
me@321 193 return cls(monomers, name, description, source)
me@262 194
me@284 195 @classmethod
me@284 196 def from_fasta(cls, file):
me@284 197 """Read sequence from FASTA file.
me@286 198
me@284 199 File must contain exactly one sequence.
me@284 200 """
me@313 201 sequence, = fasta.parse_file(file)
me@313 202 name, description, body = sequence
me@321 203 return cls.from_string(body, name, description, file.name)
me@284 204
me@295 205 class Alignment(object):
me@295 206 """Alignment. It is a list of Columns."""
bnagaev@249 207
me@287 208 sequence_type = Sequence
me@289 209 """Type of sequences in alignment. SHOULD be redefined when subclassing."""
me@288 210
me@289 211 sequences = None
me@289 212 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!"""
bnagaev@249 213
me@287 214 def __init__(self):
me@287 215 """Initialize empty alignment."""
me@287 216 super(Alignment, self).__init__()
me@287 217 self.sequences = []
me@295 218 self.columns = []
me@282 219
me@299 220 # Alignment modification methods
me@299 221 # ==============================
me@299 222
me@294 223 def append_sequence(self, sequence):
me@294 224 """Add sequence to alignment.
me@294 225
me@294 226 If sequence is too short, pad it with gaps on the right.
me@294 227 """
me@294 228 self.sequences.append(sequence)
me@294 229 for i, monomer in enumerate(sequence):
me@302 230 self.column_at(i)[sequence] = monomer
me@294 231
me@306 232 def append_gapped_line(self, line, name='', description='', source='',
me@306 233 gaps=default_gaps):
me@287 234 """Add row from a line of one-letter codes and gaps."""
me@313 235 Sequence = self.sequence_type
me@306 236 not_gap = lambda (i, char): char not in gaps
me@306 237 without_gaps = util.remove_each(line, gaps)
me@321 238 sequence = Sequence.from_string(without_gaps, name, description, source)
me@303 239 # The following line has some simple magic:
me@303 240 # 1. attach natural numbers to monomers
me@303 241 # 2. delete gaps
me@303 242 # 3. attach numbers again
me@303 243 # This way we have a pair of numbers attached to monomer:
me@303 244 # - it's position in alignment (the first attached number, j)
me@303 245 # - it's position in sequence (the second attached number, i)
me@287 246 for i, (j, char) in enumerate(filter(not_gap, enumerate(line))):
me@313 247 self.column_at(j)[sequence] = sequence[i]
me@287 248 self.sequences.append(sequence)
me@287 249
me@302 250 def column_at(self, n):
me@302 251 """Return column by index. Create required new columns if required.
me@302 252
me@302 253 Do NOT use this method, unless you are sure it is what you want.
me@302 254 """
me@302 255 for i in range(len(self.columns), n + 1):
me@302 256 self.columns.append(Column())
me@302 257 return self.columns[n]
me@302 258
me@299 259 # Alignment IO methods
me@299 260 # ====================
me@299 261
me@287 262 @classmethod
me@306 263 def from_fasta(cls, file, gaps=default_gaps):
me@287 264 """Create new alignment from FASTA file."""
me@287 265 self = cls()
me@313 266 for (name, description, body) in fasta.parse_file(file):
me@321 267 self.append_gapped_line(body, name, description, file.name, gaps)
me@287 268 return self
bnagaev@249 269
me@292 270 def to_fasta(self, file):
me@292 271 """Write alignment in FASTA file as sequences with gaps."""
me@292 272 def char(monomer):
me@292 273 if monomer:
me@292 274 return monomer.code1
me@292 275 return "-"
me@292 276 for row in self.rows_as_lists():
me@292 277 seq = row.sequence
me@292 278 line = "".join(map(char, row))
me@292 279 fasta.save_file(file, line, seq.name, seq.description)
me@292 280
me@299 281 # Data access methods for alignment
me@299 282 # =================================
me@299 283
me@299 284 def rows(self):
me@299 285 """Return list of rows (temporary objects) in alignment.
me@299 286
me@299 287 Each row is a dictionary of { column : monomer }.
me@299 288
me@299 289 For gap positions there is no key for the column in row.
me@299 290
me@299 291 Each row has attribute `sequence` pointing to the sequence the row is
me@299 292 describing.
me@299 293
me@299 294 Modifications of row have no effect on the alignment.
me@299 295 """
me@299 296 # For now, the function returns a list rather than iterator.
me@299 297 # It is yet to see, whether memory performance here becomes critical,
me@299 298 # or is random access useful.
me@299 299 rows = []
me@299 300 for sequence in self.sequences:
me@299 301 row = util.UserDict()
me@299 302 row.sequence = sequence
me@299 303 for column in self.columns:
me@299 304 if sequence in column:
me@299 305 row[column] = column[sequence]
me@299 306 rows.append(row)
me@299 307 return rows
me@299 308
me@299 309 def rows_as_lists(self):
me@299 310 """Return list of rows (temporary objects) in alignment.
me@299 311
me@299 312 Each row here is a list of either monomer or None (for gaps).
me@299 313
me@299 314 Each row has attribute `sequence` pointing to the sequence of row.
me@299 315
me@299 316 Modifications of row have no effect on the alignment.
me@299 317 """
me@299 318 rows = []
me@299 319 for sequence in self.sequences:
me@299 320 row = util.UserList()
me@299 321 row.sequence = sequence
me@299 322 for column in self.columns:
me@299 323 row.append(column.get(sequence))
me@299 324 rows.append(row)
me@299 325 return rows
me@299 326
me@299 327 def columns_as_lists(self):
me@299 328 """Return list of columns (temorary objects) in alignment.
me@299 329
me@299 330 Each column here is a list of either monomer or None (for gaps).
me@299 331
me@299 332 Items of column are sorted in the same way as alignment.sequences.
me@299 333
me@299 334 Modifications of column have no effect on the alignment.
me@299 335 """
me@299 336 columns = []
me@299 337 for column in self.columns:
me@299 338 col = []
me@299 339 for sequence in self.sequences:
me@299 340 col.append(column.get(sequence))
me@299 341 columns.append(col)
me@299 342 return columns
me@299 343
me@300 344 class Column(dict):
me@300 345 """Column of alignment.
me@300 346
me@300 347 Column is a dict of { sequence : monomer }.
me@300 348
me@300 349 For sequences that have gaps in current row, given key is not present in
me@300 350 the column.
me@300 351 """
me@300 352 pass
me@300 353
me@317 354 class Block(Alignment):
me@307 355 """Block of alignment.
me@301 356
me@307 357 Block is intersection of a set of columns & a set of rows. Most of blocks
me@307 358 look like rectangular part of alignment if you shuffle alignment rows the
me@307 359 right way.
me@261 360 """
me@270 361
me@307 362 alignment = None
me@307 363 """Alignment the block belongs to."""
me@270 364
me@307 365 sequences = ()
me@307 366 """List of sequences in block."""
me@307 367
me@307 368 columns = ()
me@307 369 """List of columns in block."""
me@307 370
me@317 371 @classmethod
me@317 372 def from_alignment(cls, alignment, sequences=None, columns=None):
me@307 373 """Build new block from alignment.
me@307 374
me@307 375 If sequences are not given, the block uses all sequences in alignment.
me@307 376
me@307 377 If columns are not given, the block uses all columns in alignment.
me@307 378
me@307 379 In both cases we use exactly the list used in alignment, thus, if new
me@307 380 sequences or columns are added to alignment, the block tracks this too.
me@261 381 """
me@307 382 if sequences is None:
me@307 383 sequences = alignment.sequences
me@318 384 if columns is None:
me@307 385 columns = alignment.columns
me@320 386 block = cls()
me@320 387 block.alignment = alignment
me@320 388 block.sequences = sequences
me@320 389 block.columns = columns
me@320 390 return block
me@270 391
me@312 392 def flush_left(self):
me@312 393 """Move all monomers to the left, gaps to the right within block."""
me@312 394 padding = [None] * len(self.columns)
me@312 395 for row in self.rows_as_lists():
me@312 396 sequence = row.sequence
me@312 397 row = filter(None, row) + padding
me@312 398 for monomer, column in zip(row, self.columns):
me@312 399 if monomer:
me@312 400 column[sequence] = monomer
me@312 401 elif sequence in column:
me@312 402 del column[sequence]
me@312 403
me@312 404
me@260 405 # vim: set ts=4 sts=4 sw=4 et: