Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/file/76816df24fe2/lib/project.py
Дата изменения: Unknown
Дата индексирования: Sun Feb 3 23:28:35 2013
Кодировка:
allpy: 76816df24fe2 lib/project.py

allpy

view lib/project.py @ 5:76816df24fe2

dummy_pytale.py: file added Can do: display an alignment, colour by identity. Features: Both sequence names and sequences are TextCtrl. Extremely slow.
author grishin@gorilla
date Wed, 09 Jun 2010 14:46:02 +0400
parents 4ed6440f4f70
children a185e7e255b3
line source
1 #!/usr/bin/python
3 """
4 "I will not use abbrev."
5 "I will always finish what I st"
6 - Bart Simpson
8 """
10 import sequence
11 import monomer
12 import allpy_data
class Project(object):
    """A set of sequences together with their gapped alignment.

    Mandatory data:
    * sequences -- list of Sequence objects. Sequences don't contain gaps
      - see sequence.py module
    * alignment -- dict
      {<Sequence object>: [<Monomer object>, None, <Monomer object>]}
      keys are the Sequence objects, values are the lists, which
      contain monomers of those sequences or None for gaps in the
      corresponding sequence of alignment

    Optional data:
    * identity_percentages -- set by calc_identity(); same layout as
      alignment, but holding floats (or None) instead of monomers
    """

    # Characters treated as gaps when reading an aligned FASTA file.
    _GAP_CHARACTERS = frozenset("-.~")

    def __init__(self, *args):
        """Overloaded constructor.

        Project() -> new empty Project
        Project(sequences, alignment) -> new Project with sequences and
            alignment initialized from arguments
        Project(fasta_file) -> new Project, read alignment and sequences
            from an open fasta file (any object with a read() method)
        """
        if len(args) > 1:
            # Only the first two positional arguments are used.
            self.sequences = args[0]
            self.alignment = args[1]
        elif not args:
            self.sequences = []
            self.alignment = {}
        else:
            self.sequences, self.alignment = Project.get_from_fasta(args[0])

    def __len__(self):
        """Return the number of alignment columns.

        This is the length of the longest row of the alignment; an empty
        alignment has length 0.
        """
        if not self.alignment:
            # max() of an empty sequence raises ValueError.
            return 0
        return max(len(row) for row in self.alignment.values())

    @staticmethod
    def _code_at(row, position):
        """Return the residue code at `position` of `row`, None for a gap."""
        residue = row[position]
        if residue is None:
            return None
        return residue.code

    def calc_identity(self):
        """Calculate the identity of alignment positions for colouring.

        For every (row, column) in the alignment, the share of rows that
        carry exactly the same residue in that column is calculated. Each
        occurrence contributes 1.0 / len(self.sequences). Only residue
        codes listed in allpy_data.amino_acids are counted; gaps and any
        other codes get None.

        The result has the same layout as Project.alignment, but instead
        of monomers it contains floats (or None). It is stored in
        self.identity_percentages and also returned. A project without
        sequences yields an empty dict.

        Every row is indexed up to len(self), so all rows are expected to
        have the same length.
        """
        self.identity_percentages = {}
        if not self.sequences:
            # Nothing to colour; also avoids dividing by zero below.
            return self.identity_percentages

        contribution = 1.0 / len(self.sequences)
        width = len(self)

        # First, accumulate the share of every amino acid in every column.
        all_columns = []
        for position in range(width):
            column_percentage = {}
            for row in self.alignment.values():
                aa = self._code_at(row, position)
                if aa in allpy_data.amino_acids:
                    column_percentage[aa] = (
                        column_percentage.get(aa, 0.0) + contribution)
            all_columns.append(column_percentage)

        # Second, map these shares back onto each sequence's row.
        for seq in self.sequences:
            row = self.alignment[seq]
            self.identity_percentages[seq] = [
                all_columns[position].get(self._code_at(row, position))
                for position in range(width)
            ]
        return self.identity_percentages

    @staticmethod
    def _parse_fasta(content):
        """Split FASTA text into (name, description, residues) tuples.

        Everything before the first ">" is ignored. The header line is
        split at the first space into name and description; description
        is None when the header contains no space. `residues` is the
        record's remaining lines joined with all whitespace removed; gap
        characters are left in place.
        """
        import re
        whitespace = re.compile(r"\s")

        records = []
        for raw in content.split(">")[1:]:
            # strip() also removes the "\r" left over from "\r\n" endings.
            lines = [piece.strip() for piece in raw.split("\n")]
            header = lines[0].split(" ", 1)
            name = header[0]
            # Decided per record, so a description never carries over
            # from a previous record.
            if len(header) == 2:
                description = header[1]
            else:
                description = None
            residues = "".join(
                whitespace.sub("", piece) for piece in lines[1:])
            records.append((name, description, residues))
        return records

    @staticmethod
    def get_from_fasta(file):
        """Read sequences and their alignment from an open fasta file.

        `file` is any object with a read() method returning the whole
        text. Returns a (sequences, alignment) pair laid out as described
        in the class docstring. The characters "-", "." and "~" are read
        as gaps (None in the alignment, absent from the sequence). A
        record without a description gets None as its description.

        >>> import project
        >>> sequences,alignment=project.Project.get_from_fasta(open("test.fasta"))
        """
        sequences = []
        alignment = {}
        for name, description, residues in Project._parse_fasta(file.read()):
            monomers = []        # gap-free sequence as Monomer objects
            alignment_list = []  # same monomers, with None at gap positions
            for character in residues:
                if character in Project._GAP_CHARACTERS:
                    alignment_list.append(None)
                else:
                    current_monomer = monomer.Monomer(character)
                    monomers.append(current_monomer)
                    alignment_list.append(current_monomer)
            new_sequence = sequence.Sequence(name, description, monomers)
            sequences.append(new_sequence)
            alignment[new_sequence] = alignment_list
        return sequences, alignment