rev |
line source |
me@261
|
1 import sys |
me@261
|
2 |
me@315
|
3 import util |
me@284
|
4 import fasta |
me@260
|
5 import data.codes |
me@260
|
6 |
me@306
|
7 default_gaps = set((".", "-", "~")) |
me@306
|
8 """Set of characters to recoginze as gaps when parsing alignment.""" |
me@306
|
9 |
me@328
|
10 class Monomer(object): |
me@328
|
11 """Monomer object.""" |
me@260
|
12 |
me@328
|
13 type = None |
me@328
|
14 """Either of 'dna', 'rna', 'protein'.""" |
me@260
|
15 |
me@260
|
16 by_code1 = {} |
me@328
|
17 """A mapping from 1-letter code to Monomer subclass.""" |
me@328
|
18 |
me@260
|
19 by_code3 = {} |
me@328
|
20 """A mapping from 3-letter code to Monomer subclass.""" |
me@328
|
21 |
me@260
|
22 by_name = {} |
me@328
|
23 """A mapping from full monomer name to Monomer subclass.""" |
me@260
|
24 |
me@260
|
25 @classmethod |
me@328
|
26 def _subclass(cls, name='', code1='', code3='', is_modified=False): |
me@328
|
27 """Create new subclass of Monomer for given monomer type.""" |
me@328
|
28 class TheMonomer(cls): |
me@328
|
29 pass |
me@328
|
30 name = name.strip().capitalize() |
me@328
|
31 code1 = code1.upper() |
me@328
|
32 code3 = code3.upper() |
me@328
|
33 TheMonomer.__name__ = "Monomer" |
me@328
|
34 TheMonomer.name = name |
me@328
|
35 TheMonomer.code1 = code1 |
me@328
|
36 TheMonomer.code3 = code3 |
me@328
|
37 TheMonomer.is_modified = is_modified |
me@328
|
38 if not is_modified: |
me@328
|
39 cls.by_code1[code1] = TheMonomer |
me@328
|
40 cls.by_code3[code3] = TheMonomer |
me@328
|
41 cls.by_name[name] = TheMonomer |
me@328
|
42 # We duplicate distinguished long names into Monomer itself, so that we |
me@328
|
43 # can use Monomer.from_code3 to create the relevant type of monomer. |
me@328
|
44 Monomer.by_code3[code3] = TheMonomer |
me@328
|
45 Monomer.by_name[name] = TheMonomer |
me@260
|
46 |
me@328
|
47 @classmethod |
me@328
|
48 def _initialize(cls, codes=data.codes.codes): |
me@328
|
49 """Create all relevant subclasses of Monomer.""" |
me@328
|
50 # NB. The table uses letters d, r, p for types, |
me@328
|
51 # while we use full words; hence, we compare by first letter |
me@260
|
52 for type, code1, is_modified, code3, name in codes: |
me@328
|
53 if type[0] == cls.type[0]: |
me@328
|
54 cls._subclass(name, code1, code3, is_modified) |
me@260
|
55 |
me@260
|
56 @classmethod |
me@260
|
57 def from_code1(cls, code1): |
me@328
|
58 """Create new monomer from 1-letter code.""" |
me@328
|
59 return cls.by_code1[code1.upper()]() |
me@260
|
60 |
me@260
|
61 @classmethod |
me@260
|
62 def from_code3(cls, code3): |
me@328
|
63 """Create new monomer from 3-letter code.""" |
me@328
|
64 return cls.by_code3[code3.upper()]() |
me@260
|
65 |
me@260
|
66 @classmethod |
me@260
|
67 def from_name(cls, name): |
me@328
|
68 """Create new monomer from full name.""" |
me@328
|
69 return cls.by_name[name.strip().capitalize()]() |
me@260
|
70 |
me@260
|
71 def __eq__(self, other): |
me@328
|
72 """Monomers within same monomer type are compared by code1.""" |
me@328
|
73 assert self.type == other.type |
me@328
|
74 return self.code1 == other.code1 |
bnagaev@239
|
75 |
bnagaev@239
|
76 class Sequence(list): |
me@274
|
77 """Sequence of Monomers. |
bnagaev@243
|
78 |
me@274
|
79 This behaves like list of monomer objects. In addition to standard list |
me@274
|
80 behaviour, Sequence has the following attributes: |
me@270
|
81 |
me@274
|
82 * name -- str with the name of the sequence |
me@274
|
83 * description -- str with description of the sequence |
me@274
|
84 * source -- str denoting source of the sequence |
me@266
|
85 |
me@274
|
86 Any of them may be empty (i.e. hold empty string) |
me@275
|
87 |
me@275
|
88 Class attributes: |
me@282
|
89 |
me@275
|
90 * monomer_type -- type of monomers in sequence, must be redefined when |
me@275
|
91 subclassing |
me@274
|
92 """ |
me@270
|
93 |
me@275
|
94 monomer_type = Monomer |
me@270
|
95 |
me@275
|
96 name = '' |
me@275
|
97 description = '' |
me@275
|
98 source = '' |
me@275
|
99 |
me@275
|
100 def __init__(self, sequence=[], name=None, description=None, source=None): |
me@275
|
101 super(Sequence, self).__init__(sequence) |
me@275
|
102 if hasattr(sequence, 'name'): |
me@275
|
103 vars(self).update(vars(sequence)) |
me@275
|
104 if name: |
me@275
|
105 self.name = name |
me@275
|
106 if description: |
me@275
|
107 self.description = description |
me@275
|
108 if source: |
me@275
|
109 self.source = source |
me@270
|
110 |
me@262
|
111 def __str__(self): |
me@275
|
112 """Returns sequence in one-letter code.""" |
me@275
|
113 return ''.join(monomer.code1 for monomer in self) |
me@270
|
114 |
me@316
|
115 def __hash__(self): |
me@316
|
116 """Hash sequence by identity.""" |
me@316
|
117 return id(self) |
me@316
|
118 |
me@273
|
119 @classmethod |
me@321
|
120 def from_string(cls, string, name='', description='', source=''): |
me@273
|
121 """Create sequences from string of one-letter codes.""" |
me@273
|
122 monomer = cls.monomer_type.from_code1 |
me@273
|
123 monomers = [monomer(letter) for letter in string] |
me@321
|
124 return cls(monomers, name, description, source) |
me@262
|
125 |
me@284
|
126 @classmethod |
me@284
|
127 def from_fasta(cls, file): |
me@284
|
128 """Read sequence from FASTA file. |
me@286
|
129 |
me@284
|
130 File must contain exactly one sequence. |
me@284
|
131 """ |
me@313
|
132 sequence, = fasta.parse_file(file) |
me@313
|
133 name, description, body = sequence |
me@321
|
134 return cls.from_string(body, name, description, file.name) |
me@284
|
135 |
me@295
|
136 class Alignment(object): |
me@295
|
137 """Alignment. It is a list of Columns.""" |
bnagaev@249
|
138 |
me@287
|
139 sequence_type = Sequence |
me@289
|
140 """Type of sequences in alignment. SHOULD be redefined when subclassing.""" |
me@288
|
141 |
me@289
|
142 sequences = None |
me@289
|
143 """Ordered list of sequences in alignment. Read, but DO NOT FIDDLE!""" |
bnagaev@249
|
144 |
me@287
|
145 def __init__(self): |
me@287
|
146 """Initialize empty alignment.""" |
me@287
|
147 super(Alignment, self).__init__() |
me@287
|
148 self.sequences = [] |
me@295
|
149 self.columns = [] |
me@282
|
150 |
me@299
|
151 # Alignment modification methods |
me@299
|
152 # ============================== |
me@299
|
153 |
me@294
|
154 def append_sequence(self, sequence): |
me@294
|
155 """Add sequence to alignment. |
me@294
|
156 |
me@294
|
157 If sequence is too short, pad it with gaps on the right. |
me@294
|
158 """ |
me@294
|
159 self.sequences.append(sequence) |
me@294
|
160 for i, monomer in enumerate(sequence): |
me@302
|
161 self.column_at(i)[sequence] = monomer |
me@294
|
162 |
me@306
|
163 def append_gapped_line(self, line, name='', description='', source='', |
me@306
|
164 gaps=default_gaps): |
me@287
|
165 """Add row from a line of one-letter codes and gaps.""" |
me@313
|
166 Sequence = self.sequence_type |
me@306
|
167 not_gap = lambda (i, char): char not in gaps |
me@306
|
168 without_gaps = util.remove_each(line, gaps) |
me@321
|
169 sequence = Sequence.from_string(without_gaps, name, description, source) |
me@303
|
170 # The following line has some simple magic: |
me@303
|
171 # 1. attach natural numbers to monomers |
me@303
|
172 # 2. delete gaps |
me@303
|
173 # 3. attach numbers again |
me@303
|
174 # This way we have a pair of numbers attached to monomer: |
me@303
|
175 # - it's position in alignment (the first attached number, j) |
me@303
|
176 # - it's position in sequence (the second attached number, i) |
me@287
|
177 for i, (j, char) in enumerate(filter(not_gap, enumerate(line))): |
me@313
|
178 self.column_at(j)[sequence] = sequence[i] |
me@287
|
179 self.sequences.append(sequence) |
me@287
|
180 |
me@302
|
181 def column_at(self, n): |
me@302
|
182 """Return column by index. Create required new columns if required. |
me@302
|
183 |
me@302
|
184 Do NOT use this method, unless you are sure it is what you want. |
me@302
|
185 """ |
me@302
|
186 for i in range(len(self.columns), n + 1): |
me@302
|
187 self.columns.append(Column()) |
me@302
|
188 return self.columns[n] |
me@302
|
189 |
me@299
|
190 # Alignment IO methods |
me@299
|
191 # ==================== |
me@299
|
192 |
me@287
|
193 @classmethod |
me@306
|
194 def from_fasta(cls, file, gaps=default_gaps): |
me@287
|
195 """Create new alignment from FASTA file.""" |
me@287
|
196 self = cls() |
me@313
|
197 for (name, description, body) in fasta.parse_file(file): |
me@321
|
198 self.append_gapped_line(body, name, description, file.name, gaps) |
me@287
|
199 return self |
bnagaev@249
|
200 |
me@292
|
201 def to_fasta(self, file): |
me@292
|
202 """Write alignment in FASTA file as sequences with gaps.""" |
me@292
|
203 def char(monomer): |
me@292
|
204 if monomer: |
me@292
|
205 return monomer.code1 |
me@292
|
206 return "-" |
me@292
|
207 for row in self.rows_as_lists(): |
me@292
|
208 seq = row.sequence |
me@292
|
209 line = "".join(map(char, row)) |
me@292
|
210 fasta.save_file(file, line, seq.name, seq.description) |
me@292
|
211 |
me@299
|
212 # Data access methods for alignment |
me@299
|
213 # ================================= |
me@299
|
214 |
me@299
|
215 def rows(self): |
me@299
|
216 """Return list of rows (temporary objects) in alignment. |
me@299
|
217 |
me@299
|
218 Each row is a dictionary of { column : monomer }. |
me@299
|
219 |
me@299
|
220 For gap positions there is no key for the column in row. |
me@299
|
221 |
me@299
|
222 Each row has attribute `sequence` pointing to the sequence the row is |
me@299
|
223 describing. |
me@299
|
224 |
me@299
|
225 Modifications of row have no effect on the alignment. |
me@299
|
226 """ |
me@299
|
227 # For now, the function returns a list rather than iterator. |
me@299
|
228 # It is yet to see, whether memory performance here becomes critical, |
me@299
|
229 # or is random access useful. |
me@299
|
230 rows = [] |
me@299
|
231 for sequence in self.sequences: |
me@299
|
232 row = util.UserDict() |
me@299
|
233 row.sequence = sequence |
me@299
|
234 for column in self.columns: |
me@299
|
235 if sequence in column: |
me@299
|
236 row[column] = column[sequence] |
me@299
|
237 rows.append(row) |
me@299
|
238 return rows |
me@299
|
239 |
me@299
|
240 def rows_as_lists(self): |
me@299
|
241 """Return list of rows (temporary objects) in alignment. |
me@299
|
242 |
me@299
|
243 Each row here is a list of either monomer or None (for gaps). |
me@299
|
244 |
me@299
|
245 Each row has attribute `sequence` pointing to the sequence of row. |
me@299
|
246 |
me@299
|
247 Modifications of row have no effect on the alignment. |
me@299
|
248 """ |
me@299
|
249 rows = [] |
me@299
|
250 for sequence in self.sequences: |
me@299
|
251 row = util.UserList() |
me@299
|
252 row.sequence = sequence |
me@299
|
253 for column in self.columns: |
me@299
|
254 row.append(column.get(sequence)) |
me@299
|
255 rows.append(row) |
me@299
|
256 return rows |
me@299
|
257 |
me@299
|
258 def columns_as_lists(self): |
me@299
|
259 """Return list of columns (temorary objects) in alignment. |
me@299
|
260 |
me@299
|
261 Each column here is a list of either monomer or None (for gaps). |
me@299
|
262 |
me@299
|
263 Items of column are sorted in the same way as alignment.sequences. |
me@299
|
264 |
me@299
|
265 Modifications of column have no effect on the alignment. |
me@299
|
266 """ |
me@299
|
267 columns = [] |
me@299
|
268 for column in self.columns: |
me@299
|
269 col = [] |
me@299
|
270 for sequence in self.sequences: |
me@299
|
271 col.append(column.get(sequence)) |
me@299
|
272 columns.append(col) |
me@299
|
273 return columns |
me@299
|
274 |
me@300
|
275 class Column(dict): |
me@300
|
276 """Column of alignment. |
me@300
|
277 |
me@300
|
278 Column is a dict of { sequence : monomer }. |
me@300
|
279 |
me@300
|
280 For sequences that have gaps in current row, given key is not present in |
me@300
|
281 the column. |
me@300
|
282 """ |
me@325
|
283 |
me@325
|
284 def __hash__(self): |
me@325
|
285 """Return hash by identity.""" |
me@325
|
286 return id(self) |
me@300
|
287 |
me@317
|
288 class Block(Alignment): |
me@307
|
289 """Block of alignment. |
me@301
|
290 |
me@307
|
291 Block is intersection of a set of columns & a set of rows. Most of blocks |
me@307
|
292 look like rectangular part of alignment if you shuffle alignment rows the |
me@307
|
293 right way. |
me@261
|
294 """ |
me@270
|
295 |
me@307
|
296 alignment = None |
me@307
|
297 """Alignment the block belongs to.""" |
me@270
|
298 |
me@307
|
299 sequences = () |
me@307
|
300 """List of sequences in block.""" |
me@307
|
301 |
me@307
|
302 columns = () |
me@307
|
303 """List of columns in block.""" |
me@307
|
304 |
me@317
|
305 @classmethod |
me@317
|
306 def from_alignment(cls, alignment, sequences=None, columns=None): |
me@307
|
307 """Build new block from alignment. |
me@307
|
308 |
me@307
|
309 If sequences are not given, the block uses all sequences in alignment. |
me@307
|
310 |
me@307
|
311 If columns are not given, the block uses all columns in alignment. |
me@307
|
312 |
me@307
|
313 In both cases we use exactly the list used in alignment, thus, if new |
me@307
|
314 sequences or columns are added to alignment, the block tracks this too. |
me@261
|
315 """ |
me@307
|
316 if sequences is None: |
me@307
|
317 sequences = alignment.sequences |
me@318
|
318 if columns is None: |
me@307
|
319 columns = alignment.columns |
me@320
|
320 block = cls() |
me@320
|
321 block.alignment = alignment |
me@320
|
322 block.sequences = sequences |
me@320
|
323 block.columns = columns |
me@320
|
324 return block |
me@270
|
325 |
me@312
|
326 def flush_left(self): |
me@312
|
327 """Move all monomers to the left, gaps to the right within block.""" |
me@312
|
328 padding = [None] * len(self.columns) |
me@312
|
329 for row in self.rows_as_lists(): |
me@312
|
330 sequence = row.sequence |
me@312
|
331 row = filter(None, row) + padding |
me@312
|
332 for monomer, column in zip(row, self.columns): |
me@312
|
333 if monomer: |
me@312
|
334 column[sequence] = monomer |
me@312
|
335 elif sequence in column: |
me@312
|
336 del column[sequence] |
me@312
|
337 |
me@312
|
338 |
me@260
|
339 # vim: set ts=4 sts=4 sw=4 et: |