Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/c0689204093e/lib/project.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 23:23:56 2013
Кодировка:
allpy: c0689204093e lib/project.py

allpy

view lib/project.py @ 91:c0689204093e

from_fasta with new monomer class and monomer_type
author boris <bnagaev@gmail.com>
date Wed, 20 Oct 2010 10:05:43 +0400
parents ea3113da42ca
children 581c1f71c4ea
line source
1 #!/usr/bin/python
3 """
4 "I will not use abbrev."
5 "I will always finish what I st"
6 - Bart Simpson
8 """
10 import sequence
11 import monomer
12 from monomer import MonomerType
13 import allpy_data
15 class Project(object):
16 """
17 Mandatory data:
18 * sequences -- list of Sequence objects. Sequences don't contain gaps
19 - see sequence.py module
20 * alignment -- dict
21 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}
22 keys are the Sequence objects, values are the lists, which
23 contain monomers of those sequences or None for gaps in the
24 corresponding sequence of
25 alignment
27 """
28 def __init__(self, *args):
29 """overloaded constructor
31 Project() -> new empty Project
32 Project(sequences, alignment) -> new Project with sequences and
33 alignment initialized from arguments
34 Project(fasta_file) -> new Project, read alignment and sequences
35 from fasta file
37 """
38 if len(args)>1:#overloaded constructor
39 self.sequences=args[0]
40 self.alignment=args[1]
41 elif len(args)==0:
42 self.sequences=[]
43 self.alignment={}
44 else:
45 self.sequences,self.alignment=Project.from_fasta(args[0])
47 def __len__(self):
48 return max([len(line) for line in self.alignment.values()])
50 def thickness(self):
51 """ The number of sequences in alignment (it's thickness).
52 """
53 return len(self.alignment)
55 def calc_identity(self):
56 """ Calculate the identity of alignment positions for colouring.
58 For every (row, column) in alignment the percentage of the exactly
59 same residue in the same column in the alignment is calculated.
60 The data structure is just like the Project.alignment, but istead of
61 monomers it contains float percentages.
62 """
63 # Oh, God, that's awful! Absolutely not understandable.
64 # First, calculate percentages of amino acids in every column
65 contribution = 1.0 / len(self.sequences)
66 all_columns = []
67 for position in range(len(self)):
68 column_percentage = {}
69 for seq in self.alignment:
70 if self.alignment[seq][position] is not None:
71 aa = self.alignment[seq][position].code
72 else:
73 aa = None
74 if aa in allpy_data.amino_acids:
75 if aa in column_percentage.keys():
76 column_percentage[aa] += contribution
77 else:
78 column_percentage[aa] = contribution
79 all_columns.append(column_percentage)
80 # Second, map these percentages onto the alignment
81 self.identity_percentages = {}
82 for seq in self.sequences:
83 self.identity_percentages[seq] = []
84 for seq in self.identity_percentages:
85 line = self.identity_percentages[seq]
86 for position in range(len(self)):
87 if self.alignment[seq][position] is not None:
88 aa = self.alignment[seq][position].code
89 else:
90 aa = None
91 line.append(all_columns[position].get(aa))
92 return self.identity_percentages
94 @staticmethod
95 def from_fasta(file):
96 """
97 >>> import project
98 >>> sequences,alignment=project.Project.from_fasta(open("test.fasta"))
99 """
100 import re
102 sequences=[]
103 alignment={}
105 content=file.read()
106 raw_sequences=content.split(">")[1:]#ignore everything before the first >
107 for raw in raw_sequences:
108 parsed_raw_sequence = raw.split("\n")
109 for counter,piece in enumerate(parsed_raw_sequence):
110 parsed_raw_sequence[counter]=piece.strip()#cut \r or whitespaces
111 name_and_description = parsed_raw_sequence[0]
112 if len(name_and_description.split(" ",1))==2:
113 name,description=name_and_description.split(" ",1)
114 elif len(name_and_description.split(" ",1))==1:#if there is description
115 name=name_and_description
116 else:
117 raise "Wrong name of sequence in fasta file"
118 string=""
119 for piece in parsed_raw_sequence[1:]:
120 piece_without_whitespace_chars=re.sub("\s","",piece)
121 string+=piece_without_whitespace_chars
122 monomers=[]#convert into Monomer objects
123 alignment_list=[]#create the respective list in alignment dict
124 for current_monomer in string:
125 if current_monomer!="-" and current_monomer!="." and current_monomer!="~":
126 monomer_type = MonomerType.from_code1(current_monomer)
127 monomers.append(monomer.Monomer(monomer_type))
128 alignment_list.append(monomers[-1])
129 else:
130 alignment_list.append(None)
131 if "description" in vars():#if there's no description
132 sequences.append(sequence.Sequence(name,monomers,description))
133 else:
134 sequences.append(sequence.Sequence(name,monomers))
135 alignment[sequences[-1]]=alignment_list
136 return sequences,alignment