allpy
changeset 647:b35116e13f35
Changed the __repr__for sequence to <Sequence code1>. Renamed row() to row_as_list().
author | Boris Burkov <BurkovBA@gmail.com> |
---|---|
date | Wed, 08 Jun 2011 15:15:57 +0400 |
parents | 612c618fb7b0 |
children | f4f658d86ca1 |
files | allpy/base.py sequence_based_blocks_search/blocks_finder.py sequence_based_blocks_search/functional_groups.py |
diffstat | 3 files changed, 325 insertions(+), 4 deletions(-) [+] |
line diff
1.1 --- a/allpy/base.py Tue Jun 07 17:37:06 2011 +0400 1.2 +++ b/allpy/base.py Wed Jun 08 15:15:57 2011 +0400 1.3 @@ -82,7 +82,7 @@ 1.4 return cls.by_name[name.strip().capitalize()]() 1.5 1.6 def __repr__(self): 1.7 - return str(self.code1) 1.8 + return "<Monomer %s>" % str(self.code1) 1.9 1.10 def __str__(self): 1.11 """Returns one-letter code""" 1.12 @@ -139,7 +139,7 @@ 1.13 1.14 def __repr__(self): 1.15 if self.name: 1.16 - return str(self.name) 1.17 + return '<Sequence %s>' % str(self.name) 1.18 else: 1.19 return '<Sequence %s>' % str(self) 1.20 1.21 @@ -231,7 +231,7 @@ 1.22 # Data access methods for alignment 1.23 # ================================= 1.24 1.25 - def row(self, sequence): 1.26 + def row_as_list(self, sequence): 1.27 """Creates and returns temporary list of monomers and Nones""" 1.28 output=[] 1.29 for column in self.columns: 1.30 @@ -247,7 +247,7 @@ 1.31 if monomer: 1.32 return monomer.code1 1.33 return "-" 1.34 - row = self.row(sequence) 1.35 + row = self.row_as_list(sequence) 1.36 list_of_letters = map(char, row) 1.37 output="" 1.38 for letter in list_of_letters: output+=letter
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 2.2 +++ b/sequence_based_blocks_search/blocks_finder.py Wed Jun 08 15:15:57 2011 +0400 2.3 @@ -0,0 +1,262 @@ 2.4 +#!/usr/bin/python 2.5 + 2.6 +import sys 2.7 +sys.path.append("../") 2.8 +from allpy import protein 2.9 +import copy 2.10 +import math 2.11 +from functional_groups import functional_groups 2.12 + 2.13 +class Preblock(object): 2.14 + """Attributes: 2.15 + first_column_index=self.alignment.columns[column1] 2.16 + last_column_index=self.aligmnent.columns[column2] 2.17 + sequences=[sequence1, sequence2...] 2.18 + alignment 2.19 + 2.20 + A preblock score is calculated as follows: 2.21 + Suppose, that a preblock contains 2 conserved and 3 inconserved positions. 2.22 + Preblock score is sum of scores of its conserved and inconserved columns: 2.23 + Score(conserved column) = log(p(group|70% conservation)) - n*log(f(group)) 2.24 + Score(inconserved column) = log(0.3) - log(1) 2.25 + Thus score of preblock contains 3 terms: sum of scores of inconserved column 2.26 + sum of log(p(group|70% conservation)) and sum of log(f(group)). 2.27 + """ 2.28 + def __init__(self, first_column_index, last_column_index, sequences, term1, term2, term3, alignment): 2.29 + self.first_column_index = first_column_index 2.30 + self.last_column_index = last_column_index 2.31 + self.sequences = sequences 2.32 + self.term1 = term1 2.33 + self.term2 = term2 2.34 + self.term3 = term3 2.35 + self.alignment = alignment 2.36 + 2.37 + def calculate_score(self): 2.38 + return self.term1 - len(self.sequences)*self.term2 + self.term3 2.39 + 2.40 + def __repr__(self): 2.41 + return "preblock:%s, %s, %s, %s"%(self.first_column_index, self.last_column_index, self.calculate_score(), self.sequences) 2.42 + 2.43 + 2.44 +class BreakCycleException(Exception): 2.45 + pass 2.46 + 2.47 + 2.48 +def main(file): 2.49 + alignment = protein.Alignment().append_file(file) 2.50 + groups_content = find_groups_content(alignment, functional_groups) 2.51 + find_links(groups_content, alignment) 2.52 + blocks = mark_blocks(alignment) 2.53 + delete_links(alignment) 2.54 + return blocks 2.55 + 2.56 + 2.57 +def find_groups_content(alignment, functional_groups): 2.58 + groups_content = {} 2.59 + for column in alignment.columns: 2.60 + groups2sequences = find_sequences_that_belong_to_groups_in_column(column, functional_groups) 2.61 + groups_content[column] = groups2sequences 2.62 + return groups_content 2.63 + 2.64 + 2.65 +def find_sequences_that_belong_to_groups_in_column(column, functional_groups): 2.66 + output = {} 2.67 + for group_name in functional_groups.keys(): 2.68 + output[group_name] = [] 2.69 + for sequence in column.keys(): 2.70 + if column[sequence].code1 in functional_groups[group_name]["aminoacids"]: 2.71 + output[group_name].append(sequence) 2.72 + return output 2.73 + 2.74 + 2.75 +def find_links(groups_content, alignment): 2.76 + candidate_preblocks = [] 2.77 + for index_of_column, column in enumerate(alignment.columns): 2.78 + candidate_preblocks, addition_to_candidate_preblocks = split_preblocks_by_presense_of_gaps(candidate_preblocks, column) 2.79 + #create new preblocks based on the groups 2.80 + try: 2.81 + for group_name in functional_groups: 2.82 + if len(groups_content[column][group_name])>1: 2.83 + addition_to_candidate_preblocks.append(Preblock(index_of_column, index_of_column, copy.copy(groups_content[column][group_name]), functional_groups[group_name]["70%"], functional_groups[group_name]["100%"], 0.0, alignment)) 2.84 + except KeyError as ke: 2.85 + print "Found non-canonical aminoacid: %s"%(ke) 2.86 + #create new preblocks as combinations of the old ones 2.87 + try: 2.88 + for group_name in functional_groups: 2.89 + for preblock in candidate_preblocks: 2.90 + if len([sequence_in_block for sequence_in_block in groups_content[column][group_name] if sequence_in_block in preblock.sequences]) > 1: 2.91 + addition_to_candidate_preblocks.append(Preblock(preblock.first_column_index, index_of_column, [sequence_in_block for sequence_in_block in groups_content[column][group_name] if sequence_in_block in preblock.sequences], preblock.term1+functional_groups[group_name]["70%"], preblock.term2+functional_groups[group_name]["100%"], preblock.term3, alignment)) 2.92 + except KeyError as ke: 2.93 + print "Found non-canonical aminoacid: %s"%(ke) 2.94 + #add one inconserved column to the old ones, merge all together 2.95 + for preblock in candidate_preblocks: preblock.term3 += math.log(0.3) - math.log(1) 2.96 + candidate_preblocks+=addition_to_candidate_preblocks 2.97 + #remove from candidates those preblocks that share the same set of sequences with others, but are shorter and have smaller score 2.98 + print index_of_column, len(candidate_preblocks), 2.99 + remove_contained_preblocks(candidate_preblocks) 2.100 + print len(candidate_preblocks) 2.101 + #for those preblocks ending with a conserved position and having the score above threshold, add new links 2.102 + for preblock in candidate_preblocks: 2.103 + if preblock.alignment.columns[preblock.last_column_index] is column and preblock.calculate_score() > calculate_threshold(len(preblock.sequences), len(alignment.sequences), functional_groups): 2.104 + create_links(preblock, alignment) #links are not cliques, just connected components 2.105 + 2.106 + 2.107 +def split_preblocks_by_presense_of_gaps(preblocks, column): 2.108 + """Each preblock is split into up to two preblocks vertically: 2.109 + those, with sequences, that contain gaps and those, that 2.110 + don't. 2.111 + """ 2.112 + preblocks_without_gap = [] 2.113 + preblocks_with_gap = [] 2.114 + for preblock in preblocks: 2.115 + preblock_sequences_without_gap = [] 2.116 + preblock_sequences_with_gap = [] 2.117 + for sequence in preblock.sequences: 2.118 + if sequence in column.keys(): 2.119 + preblock_sequences_without_gap.append(sequence) 2.120 + else: 2.121 + preblock_sequences_with_gap.append(sequence) 2.122 + if len(preblock_sequences_without_gap)>=2: 2.123 + preblocks_without_gap.append(Preblock(preblock.first_column_index, preblock.last_column_index, preblock_sequences_without_gap, preblock.term1, preblock.term2, preblock.term3, preblock.alignment))#added one inconserved position 2.124 + if len(preblock_sequences_with_gap)>=2: 2.125 + preblocks_with_gap.append(Preblock(preblock.first_column_index, preblock.last_column_index, preblock_sequences_with_gap, preblock.term1, preblock.term2, preblock.term3, preblock.alignment)) 2.126 + return preblocks_without_gap, preblocks_with_gap 2.127 + 2.128 + 2.129 +def remove_contained_preblocks(preblocks): 2.130 + """removes blocks that have rivals with same sets of sequences, 2.131 + but shorter and with inferior scores, normalized to the number 2.132 + of sequences within the block 2.133 + 2.134 + """ 2.135 + output=[] 2.136 + counter = len(preblocks)-1 2.137 + while counter!=-1: 2.138 + for preblock in preblocks: 2.139 + try: 2.140 + if preblock is not preblocks[counter]: 2.141 + for sequence in preblocks[counter].sequences: 2.142 + if sequence not in preblock.sequences: raise BreakCycleException 2.143 + #if preblocks passes the threshold of weight, discard the preblocks[counter] whatever, cause, it or at least its part should've been already accepted. 2.144 + if preblock.calculate_score() > calculate_threshold(len(preblock.sequences), len(preblock.alignment.sequences), functional_groups)\ 2.145 + and preblocks[counter].last_column_index<=preblock.last_column_index: 2.146 + preblocks.remove(preblocks[counter]) 2.147 + break 2.148 + #create normalized preblock with same set of sequences as the examined one 2.149 + normalized_preblock = Preblock(preblock.first_column_index, preblock.last_column_index, preblocks[counter].sequences, preblock.term1, preblock.term2, preblock.term3, preblock.alignment) 2.150 + if preblocks[counter].first_column_index>=preblock.first_column_index\ 2.151 + and preblocks[counter].last_column_index<=preblock.last_column_index\ 2.152 + and preblocks[counter].calculate_score() <= normalized_preblock.calculate_score(): 2.153 + preblocks.remove(preblocks[counter]) 2.154 + break 2.155 + except BreakCycleException: 2.156 + pass 2.157 + counter-=1 2.158 + 2.159 + 2.160 +def calculate_threshold(number_of_sequences_in_preblock, number_of_sequences_in_alignment, functional_groups): 2.161 + """ 2.162 + We seek to find the probability, that given n randomly generated 2.163 + sequences we won't find alignment of them, such that there is a 2.164 + block of k sequences with score not less than the observed. 2.165 + 2.166 + First, we have to pick 5 sequences out of 20; 2.167 + then iterate over all the ways to shift them against each other. 2.168 + 2.169 + The number of shifts of length 1 is roughly 5*4*len(alignment)-1)^3 2.170 + The number of shifts of length 2 is roughly 5*4*len(alignment)-2)^3 2.171 + ... 2.172 + 2.173 + Considering these shifts independent, we seek to find the p-values 2.174 + for each of the scores to be not found in all the stretches. 2.175 + 2.176 + Thus, for each shift we calculate the weights of all groups and 2.177 + inconserved positions, create the transition matrix, calculate 2.178 + its eigenvectors and eigenvalues, round the weights, find coefficient 2.179 + of [1,0,0,...] decomposition in the basis of eigenvectors. 2.180 + 2.181 + 2.182 + """ 2.183 + 2.184 + #length_of_stretch = math.factorial(number_of_sequences_in_alignment)/math.factorial(number_of_sequences_in_preblock)/math.factorial(number_of_sequences_in_preblock)*^number_of_sequences_in_alignment 2.185 + return (-1)*functional_groups["u"]["70%"]*1.5*number_of_sequences_in_preblock 2.186 + 2.187 + 2.188 +def create_links(preblock, alignment): 2.189 + """links are lists of sequences""" 2.190 + for column in alignment.columns[preblock.first_column_index:preblock.last_column_index+1]: 2.191 + if preblock.sequences[0] not in column: continue #ignore gap columns 2.192 + for sequence in preblock.sequences: 2.193 + if "links" not in column[sequence].__dict__: 2.194 + column[sequence].links=[] 2.195 + remaining_sequences = copy.copy(preblock.sequences) 2.196 + remaining_sequences.remove(sequence) 2.197 + for iterated_sequence in remaining_sequences:#create links both for sequence and iterated sequence 2.198 + if iterated_sequence not in column[sequence].links: column[sequence].links.append(iterated_sequence) 2.199 + if "links" not in column[iterated_sequence].__dict__: 2.200 + column[iterated_sequence].links=[sequence] 2.201 + else: 2.202 + if sequence not in column[iterated_sequence].links: column[iterated_sequence].links.append(sequence) 2.203 + 2.204 + 2.205 +def mark_blocks(alignment): 2.206 + output=[] 2.207 + blocks_in_previous_column = [] 2.208 + for column in alignment.columns: 2.209 + blocks_in_this_column = [] 2.210 + remaining_sequences=copy.copy(alignment.sequences) 2.211 + while remaining_sequences !=[]: 2.212 + connected_component=find_connected_component(remaining_sequences, column) 2.213 + if connected_component==[]: break 2.214 + #if connected_component extends existing block, extend that block, else create new one 2.215 + appended_to_block_flag = False 2.216 + for block in blocks_in_previous_column: 2.217 + if set(connected_component)==set(block.sequences): 2.218 + block.columns.append(column) 2.219 + blocks_in_this_column.append(block) 2.220 + appended_to_block_flag = True 2.221 + if appended_to_block_flag==False: 2.222 + blocks_in_this_column.append(protein.Block.from_alignment(alignment, connected_component, [column])) 2.223 + for sequence in connected_component: 2.224 + remaining_sequences.remove(sequence) 2.225 + for block in blocks_in_previous_column: 2.226 + if block not in blocks_in_this_column: 2.227 + output.append(block) 2.228 + blocks_in_previous_column = blocks_in_this_column 2.229 + if alignment.columns.index(column) == len(alignment.columns)-1: output+=blocks_in_this_column 2.230 + return output 2.231 + 2.232 + 2.233 +def find_connected_component(sequences, column): 2.234 + #find one sequence that has blocks 2.235 + output=[] 2.236 + if sequences!=[]: 2.237 + for sequence in sequences: 2.238 + if sequence not in column: continue 2.239 + if "links" in column[sequence].__dict__: 2.240 + output.append(sequence) 2.241 + break 2.242 + if output == []: return output 2.243 + #find all sequences in the same connected component 2.244 + not_exhausted_flag=True 2.245 + sequences_copy=copy.copy(sequences) 2.246 + output_counter=0 2.247 + while not_exhausted_flag: 2.248 + not_exhausted_flag = False 2.249 + for sequence in column[output[output_counter]].links: 2.250 + if sequence not in output and sequence in sequences: 2.251 + output.append(sequence) 2.252 + output_counter+=1 2.253 + not_exhausted_flag=True 2.254 + return output 2.255 + 2.256 + 2.257 +def delete_links(alignment): 2.258 + for sequence in alignment.sequences: 2.259 + for monomer in sequence: 2.260 + if "links" in monomer.__dict__: 2.261 + del monomer.links 2.262 + 2.263 + 2.264 +if __name__== '__main__': 2.265 + main(open(sys.argv[1]))
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/sequence_based_blocks_search/functional_groups.py Wed Jun 08 15:15:57 2011 +0400 3.3 @@ -0,0 +1,59 @@ 3.4 +# group name: aminoacids in group, logprobability of group for 100% identical sequences, >60%, >30% 3.5 +functional_groups={ 3.6 + "b":{"aminoacids":"IVLMFA","100%":-1.671163536, "70%":-3, "30%":-4.0}, 3.7 + "t":{"aminoacids":"AGSTC", "100%":-1.625934282, "70%":-3, "30%":-4.0}, 3.8 + "u":{"aminoacids":"HKR", "100%":-2.805912948, "70%":-4, "30%":-4.5}, 3.9 + "l":{"aminoacids":"IVLM", "100%":-2.321928095, "70%":-4, "30%":-4.5}, 3.10 + "c":{"aminoacids":"IV", "100%":-3.23786383, "70%":-4, "30%":-4.5}, 3.11 + "a":{"aminoacids":"FWY", "100%":-3.53951953, "70%":-4, "30%":-4.5}, 3.12 + "d":{"aminoacids":"DE", "100%":-3.095419565, "70%":-4, "30%":-4.5}, 3.13 + "e":{"aminoacids":"KR", "100%":-3.13289427, "70%":-4, "30%":-4.5}, 3.14 + "g":{"aminoacids":"DN", "100%":-3.279283757, "70%":-4, "30%":-4.5}, 3.15 + "f":{"aminoacids":"EQ", "100%":-3.395928676, "70%":-4, "30%":-4.5}, 3.16 + "i":{"aminoacids":"ST", "100%":-2.805912948, "70%":-4, "30%":-4.5}, 3.17 + "h":{"aminoacids":"AG", "100%":-2.756330919, "70%":-4, "30%":-4.5}, 3.18 + "A":{"aminoacids":"A", "100%":-3.756330919, "70%":-5, "30%":-5.5}, 3.19 + "G":{"aminoacids":"G", "100%":-3.756330919, "70%":-5, "30%":-5.5}, 3.20 + "S":{"aminoacids":"S", "100%":-3.625934282, "70%":-5, "30%":-5.5}, 3.21 + "T":{"aminoacids":"T", "100%":-4.011587974, "70%":-5, "30%":-5.5}, 3.22 + "C":{"aminoacids":"C", "100%":-4.921390165, "70%":-6, "30%":-6.5}, 3.23 + "P":{"aminoacids":"P", "100%":-4.321928095, "70%":-5, "30%":-6.5}, 3.24 + "N":{"aminoacids":"N", "100%":-4.506352666, "70%":-5, "30%":-6.5}, 3.25 + "H":{"aminoacids":"H", "100%":-5.10780329, "70%":-6, "30%":-7 }, 3.26 + "K":{"aminoacids":"K", "100%":-3.795859283, "70%":-5, "30%":-5.5}, 3.27 + "R":{"aminoacids":"R", "100%":-4.573466862, "70%":-5, "30%":-5.5}, 3.28 + "D":{"aminoacids":"D", "100%":-4.083141235, "70%":-5, "30%":-5.5}, 3.29 + "E":{"aminoacids":"E", "100%":-4.10780329, "70%":-5, "30%":-5.5}, 3.30 + "Q":{"aminoacids":"Q", "100%":-4.756330919, "70%":-5, "30%":-5.5}, 3.31 + "I":{"aminoacids":"I", "100%":-4.717856771, "70%":-5, "30%":-5.5}, 3.32 + "L":{"aminoacids":"L", "100%":-3.717856771, "70%":-5, "30%":-5.5}, 3.33 + "V":{"aminoacids":"V", "100%":-3.878321443, "70%":-5, "30%":-5.5}, 3.34 + "F":{"aminoacids":"F", "100%":-4.64385619, "70%":-5, "30%":-5.5}, 3.35 + "W":{"aminoacids":"W", "100%":-6.265344567, "70%":-7, "30%":-7.5}, 3.36 + "Y":{"aminoacids":"Y", "100%":-4.921390165, "70%":-5, "30%":-5.5}, 3.37 + "M":{"aminoacids":"M", "100%":-5.795859283, "70%":-6.5,"30%": -7} 3.38 + } 3.39 + 3.40 +aminoacids2functional_groups={ 3.41 + "A":["A", "b", "h", "t"], 3.42 + "C":["C", "t"], 3.43 + "E":["E", "d", "f"], 3.44 + "D":["D", "d", "g"], 3.45 + "G":["G", "h", "t"], 3.46 + "F":["F", "a", "b"], 3.47 + "I":["I", "c", "b", "l"], 3.48 + "H":["H", "u"], 3.49 + "K":["K", "e", "u"], 3.50 + "M":["M", "b", "l"], 3.51 + "L":["L", "b", "l"], 3.52 + "N":["N", "g"], 3.53 + "Q":["Q", "f"], 3.54 + "P":["P"], 3.55 + "S":["S", "i", "t"], 3.56 + "R":["R", "e", "u"], 3.57 + "T":["T", "i", "t"], 3.58 + "W":["W", "a"], 3.59 + "V":["V", "c", "b", "l"], 3.60 + "Y":["Y", "a"] 3.61 +} 3.62 +