Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/3d40d7b9a123/lib/project.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 23:23:47 2013
Кодировка:
allpy: 3d40d7b9a123 lib/project.py

allpy

view lib/project.py @ 98:3d40d7b9a123

lib::project::muscle_align project::save_fasta sequence::operator== fixes test -- alignment done
author boris <bnagaev@gmail.com>
date Wed, 20 Oct 2010 23:33:29 +0400
parents 7dd461d17f96
children ffc102ed0249
line source
1 #!/usr/bin/python
3 """
4 "I will not use abbrev."
5 "I will always finish what I st"
6 - Bart Simpson
8 """
10 import sequence
11 from monomer import AminoAcidType
12 import allpy_data
13 import os
14 from tempfile import mkstemp
16 class Project(object):
17 """
18 Mandatory data:
19 * sequences -- list of Sequence objects. Sequences don't contain gaps
20 - see sequence.py module
21 * alignment -- dict
22 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}
23 keys are the Sequence objects, values are the lists, which
24 contain monomers of those sequences or None for gaps in the
25 corresponding sequence of
26 alignment
28 """
29 def __init__(self, *args):
30 """overloaded constructor
32 Project() -> new empty Project
33 Project(sequences, alignment) -> new Project with sequences and
34 alignment initialized from arguments
35 Project(fasta_file) -> new Project, read alignment and sequences
36 from fasta file
38 """
39 if len(args)>1:#overloaded constructor
40 self.sequences=args[0]
41 self.alignment=args[1]
42 elif len(args)==0:
43 self.sequences=[]
44 self.alignment={}
45 else:
46 self.sequences,self.alignment=Project.from_fasta(args[0])
48 def __len__(self):
49 return max([len(line) for line in self.alignment.values()])
51 def thickness(self):
52 """ The number of sequences in alignment (it's thickness).
53 """
54 return len(self.alignment)
56 def calc_identity(self):
57 """ Calculate the identity of alignment positions for colouring.
59 For every (row, column) in alignment the percentage of the exactly
60 same residue in the same column in the alignment is calculated.
61 The data structure is just like the Project.alignment, but istead of
62 monomers it contains float percentages.
63 """
64 # Oh, God, that's awful! Absolutely not understandable.
65 # First, calculate percentages of amino acids in every column
66 contribution = 1.0 / len(self.sequences)
67 all_columns = []
68 for position in range(len(self)):
69 column_percentage = {}
70 for seq in self.alignment:
71 if self.alignment[seq][position] is not None:
72 aa = self.alignment[seq][position].code
73 else:
74 aa = None
75 if aa in allpy_data.amino_acids:
76 if aa in column_percentage.keys():
77 column_percentage[aa] += contribution
78 else:
79 column_percentage[aa] = contribution
80 all_columns.append(column_percentage)
81 # Second, map these percentages onto the alignment
82 self.identity_percentages = {}
83 for seq in self.sequences:
84 self.identity_percentages[seq] = []
85 for seq in self.identity_percentages:
86 line = self.identity_percentages[seq]
87 for position in range(len(self)):
88 if self.alignment[seq][position] is not None:
89 aa = self.alignment[seq][position].code
90 else:
91 aa = None
92 line.append(all_columns[position].get(aa))
93 return self.identity_percentages
95 @staticmethod
96 def from_fasta(file, monomer_type=AminoAcidType):
97 """
98 >>> import project
99 >>> sequences,alignment=project.Project.from_fasta(open("test.fasta"))
100 """
101 import re
103 sequences=[]
104 alignment={}
106 content=file.read()
107 raw_sequences=content.split(">")[1:]#ignore everything before the first >
108 for raw in raw_sequences:
109 parsed_raw_sequence = raw.split("\n")
110 for counter,piece in enumerate(parsed_raw_sequence):
111 parsed_raw_sequence[counter]=piece.strip()#cut \r or whitespaces
112 name_and_description = parsed_raw_sequence[0]
113 if len(name_and_description.split(" ",1))==2:
114 name, description = name_and_description.split(" ",1)
115 elif len(name_and_description.split(" ", 1)) == 1:#if there is description
116 name = name_and_description
117 else:
118 raise "Wrong name of sequence in fasta file"
119 string=""
120 for piece in parsed_raw_sequence[1:]:
121 piece_without_whitespace_chars=re.sub("\s","",piece)
122 string+=piece_without_whitespace_chars
123 monomers=[]#convert into Monomer objects
124 alignment_list=[]#create the respective list in alignment dict
125 for current_monomer in string:
126 if current_monomer!="-" and current_monomer!="." and current_monomer!="~":
127 monomers.append(monomer_type.from_code1(current_monomer).instance())
128 alignment_list.append(monomers[-1])
129 else:
130 alignment_list.append(None)
131 if "description" in vars():#if there's no description
132 sequences.append(sequence.Sequence(name,monomers,description))
133 else:
134 sequences.append(sequence.Sequence(name,monomers))
135 alignment[sequences[-1]]=alignment_list
136 return sequences,alignment
139 @staticmethod
140 def from_sequences(*sequences):
141 """
142 Constructs new alignment from sequences
143 Add gaps to right end to make equal lengthes of alignment sequences
144 """
145 project = Project()
146 project.sequences = sequences
147 max_length = max(len(sequence) for sequence in sequences)
148 for sequence in sequences:
149 gaps_count = max_length - len(sequence)
150 project.alignment[sequence] = sequence.monomers + [None] * gaps_count
151 return project
153 def save_fasta(self, out_file, long_line=60):
154 """
155 Saves alignment to given file
156 Splits long lines to substrings of length=long_line
157 To prevent this, set long_line=None
158 """
159 for sequence in self.sequences:
160 out_file.write(">%(name)s %(description)s \n" % sequence.__dict__)
161 string = str(sequence)
162 if long_line:
163 for i in range(0, len(string) // long_line + 1):
164 out_file.write("%s \n" % string[i*long_line : i*long_line + long_line])
165 else:
166 out_file.write("%s \n" % string)
168 def muscle_align(self):
169 """
170 Simple align ths alignment using sequences (muscle)
171 uses old Monomers and Sequences objects
172 """
173 tmp_file, tmp_filename = mkstemp()
174 os.close(tmp_file) # this is file descriptor, not normal file object. security issue )
175 tmp_file = open(tmp_filename, 'w')
176 self.save_fasta(tmp_file)
177 tmp_file.close()
178 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_filename})
179 sequences, alignment = Project.from_fasta(open(tmp_filename))
180 for sequence in self.sequences:
181 try:
182 new_sequence = [i for i in sequences if sequence==i][0]
183 except:
184 raise Exception("Align: Cann't find sequence %s in muscle output" % sequence.name)
185 old_monomers = sequence.monomers.__iter__()
186 self.alignment[sequence] = []
187 for monomer in alignment[new_sequence]:
188 if not monomer:
189 self.alignment[sequence].append(monomer)
190 else:
191 self.alignment[sequence].append(old_monomers.next())