allpy

changeset 647:b35116e13f35
Changed the __repr__for sequence to <Sequence code1>. Renamed row() to row_as_list().
author: Boris Burkov <BurkovBA@gmail.com>
date: Wed, 08 Jun 2011 15:15:57 +0400
parents: 612c618fb7b0
children: f4f658d86ca1
files: allpy/base.py sequence_based_blocks_search/blocks_finder.py sequence_based_blocks_search/functional_groups.py
diffstat: 3 files changed, 325 insertions(+), 4 deletions(-) [+]
[-]

allpy/base.py 8

sequence_based_blocks_search/blocks_finder.py 262

sequence_based_blocks_search/functional_groups.py 59 allpy/base.py 8 sequence_based_blocks_search/blocks_finder.py 262 sequence_based_blocks_search/functional_groups.py 59
allpy/base.py 8
sequence_based_blocks_search/blocks_finder.py 262
     1.1 --- a/allpy/base.py	Tue Jun 07 17:37:06 2011 +0400
     1.2 +++ b/allpy/base.py	Wed Jun 08 15:15:57 2011 +0400
     1.3 @@ -82,7 +82,7 @@
     1.4          return cls.by_name[name.strip().capitalize()]()
     1.5  
     1.6      def __repr__(self):
     1.7 -        return str(self.code1)
     1.8 +        return "<Monomer %s>"  % str(self.code1)
     1.9  
    1.10      def __str__(self):
    1.11          """Returns one-letter code"""
    1.12 @@ -139,7 +139,7 @@
    1.13  
    1.14      def __repr__(self):
    1.15          if self.name:
    1.16 -            return str(self.name)
    1.17 +            return '<Sequence %s>' % str(self.name)
    1.18          else:
    1.19              return '<Sequence %s>' % str(self)
    1.20  
    1.21 @@ -231,7 +231,7 @@
    1.22      # Data access methods for alignment
    1.23      # =================================
    1.24  
    1.25 -    def row(self, sequence):
    1.26 +    def row_as_list(self, sequence):
    1.27          """Creates and returns temporary list of monomers and Nones"""
    1.28          output=[]
    1.29          for column in self.columns:
    1.30 @@ -247,7 +247,7 @@
    1.31              if monomer:
    1.32                  return monomer.code1
    1.33              return "-"
    1.34 -        row = self.row(sequence)
    1.35 +        row = self.row_as_list(sequence)
    1.36          list_of_letters = map(char, row)
    1.37          output=""
    1.38          for letter in list_of_letters: output+=letter

     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/sequence_based_blocks_search/blocks_finder.py	Wed Jun 08 15:15:57 2011 +0400
     2.3 @@ -0,0 +1,262 @@
     2.4 +#!/usr/bin/python
     2.5 +
     2.6 +import sys
     2.7 +sys.path.append("../")
     2.8 +from allpy import protein
     2.9 +import copy
    2.10 +import math
    2.11 +from functional_groups import functional_groups
    2.12 +
    2.13 +class Preblock(object):
    2.14 +    """Attributes:
    2.15 +    first_column_index=self.alignment.columns[column1]
    2.16 +    last_column_index=self.aligmnent.columns[column2]
    2.17 +    sequences=[sequence1, sequence2...]
    2.18 +    alignment
    2.19 +
    2.20 +    A preblock score is calculated as follows:
    2.21 +    Suppose, that a preblock contains 2 conserved and 3 inconserved positions.
    2.22 +    Preblock score is sum of scores of its conserved and inconserved columns:
    2.23 +    Score(conserved column) = log(p(group|70% conservation)) - n*log(f(group))
    2.24 +    Score(inconserved column) = log(0.3) - log(1)
    2.25 +    Thus score of preblock contains 3 terms: sum of scores of inconserved column
    2.26 +    sum of log(p(group|70% conservation)) and sum of log(f(group)).
    2.27 +    """
    2.28 +    def __init__(self, first_column_index, last_column_index, sequences, term1, term2, term3, alignment):
    2.29 +        self.first_column_index = first_column_index
    2.30 +        self.last_column_index = last_column_index
    2.31 +        self.sequences = sequences
    2.32 +        self.term1 = term1
    2.33 +        self.term2 = term2
    2.34 +        self.term3 = term3
    2.35 +        self.alignment = alignment
    2.36 +
    2.37 +    def calculate_score(self):
    2.38 +        return self.term1 - len(self.sequences)*self.term2 + self.term3
    2.39 +
    2.40 +    def __repr__(self):
    2.41 +        return "preblock:%s, %s, %s, %s"%(self.first_column_index, self.last_column_index, self.calculate_score(), self.sequences)
    2.42 +
    2.43 +
    2.44 +class BreakCycleException(Exception):
    2.45 +    pass
    2.46 +
    2.47 +
    2.48 +def main(file):
    2.49 +    alignment = protein.Alignment().append_file(file)
    2.50 +    groups_content = find_groups_content(alignment, functional_groups)
    2.51 +    find_links(groups_content, alignment)
    2.52 +    blocks = mark_blocks(alignment)
    2.53 +    delete_links(alignment)
    2.54 +    return blocks
    2.55 +
    2.56 +
    2.57 +def find_groups_content(alignment, functional_groups):
    2.58 +    groups_content = {}
    2.59 +    for column in alignment.columns:
    2.60 +        groups2sequences = find_sequences_that_belong_to_groups_in_column(column, functional_groups)
    2.61 +        groups_content[column] = groups2sequences
    2.62 +    return groups_content
    2.63 +
    2.64 +
    2.65 +def find_sequences_that_belong_to_groups_in_column(column, functional_groups):
    2.66 +    output = {}
    2.67 +    for group_name in functional_groups.keys():
    2.68 +        output[group_name] = []
    2.69 +        for sequence in column.keys():
    2.70 +            if column[sequence].code1 in functional_groups[group_name]["aminoacids"]:
    2.71 +                output[group_name].append(sequence)
    2.72 +    return output
    2.73 +
    2.74 +
    2.75 +def find_links(groups_content, alignment):
    2.76 +    candidate_preblocks = []
    2.77 +    for index_of_column, column in enumerate(alignment.columns):
    2.78 +        candidate_preblocks, addition_to_candidate_preblocks = split_preblocks_by_presense_of_gaps(candidate_preblocks, column)
    2.79 +        #create new preblocks based on the groups
    2.80 +        try:
    2.81 +            for group_name in functional_groups:
    2.82 +                if len(groups_content[column][group_name])>1:
    2.83 +                    addition_to_candidate_preblocks.append(Preblock(index_of_column, index_of_column, copy.copy(groups_content[column][group_name]), functional_groups[group_name]["70%"], functional_groups[group_name]["100%"], 0.0, alignment))
    2.84 +        except KeyError as ke:
    2.85 +            print "Found non-canonical aminoacid: %s"%(ke)
    2.86 +        #create new preblocks as combinations of the old ones 
    2.87 +        try:
    2.88 +            for group_name in functional_groups:
    2.89 +                for preblock in candidate_preblocks:
    2.90 +                    if len([sequence_in_block for sequence_in_block in groups_content[column][group_name] if sequence_in_block in preblock.sequences]) > 1:
    2.91 +                        addition_to_candidate_preblocks.append(Preblock(preblock.first_column_index, index_of_column, [sequence_in_block for sequence_in_block in groups_content[column][group_name] if sequence_in_block in preblock.sequences], preblock.term1+functional_groups[group_name]["70%"], preblock.term2+functional_groups[group_name]["100%"], preblock.term3, alignment))
    2.92 +        except KeyError as ke:
    2.93 +            print "Found non-canonical aminoacid: %s"%(ke)
    2.94 +        #add one inconserved column to the old ones, merge all together
    2.95 +        for preblock in candidate_preblocks: preblock.term3 += math.log(0.3) - math.log(1)
    2.96 +        candidate_preblocks+=addition_to_candidate_preblocks
    2.97 +        #remove from candidates those preblocks that share the same set of sequences with others, but are shorter and have smaller score
    2.98 +        print index_of_column, len(candidate_preblocks),
    2.99 +        remove_contained_preblocks(candidate_preblocks)
   2.100 +        print len(candidate_preblocks)
   2.101 +        #for those preblocks ending with a conserved position and having the score above threshold, add new links
   2.102 +        for preblock in candidate_preblocks:
   2.103 +            if preblock.alignment.columns[preblock.last_column_index] is column and preblock.calculate_score() > calculate_threshold(len(preblock.sequences), len(alignment.sequences), functional_groups):
   2.104 +                create_links(preblock, alignment) #links are not cliques, just connected components 
   2.105 +
   2.106 +
   2.107 +def split_preblocks_by_presense_of_gaps(preblocks, column):
   2.108 +    """Each preblock is split into up to two preblocks vertically:
   2.109 +       those, with sequences, that contain gaps and those, that
   2.110 +       don't.
   2.111 +    """
   2.112 +    preblocks_without_gap = []
   2.113 +    preblocks_with_gap = []
   2.114 +    for preblock in preblocks:
   2.115 +        preblock_sequences_without_gap = []
   2.116 +        preblock_sequences_with_gap = []
   2.117 +        for sequence in preblock.sequences:
   2.118 +            if sequence in column.keys():
   2.119 +                preblock_sequences_without_gap.append(sequence)
   2.120 +            else:
   2.121 +                preblock_sequences_with_gap.append(sequence)
   2.122 +        if len(preblock_sequences_without_gap)>=2:
   2.123 +            preblocks_without_gap.append(Preblock(preblock.first_column_index, preblock.last_column_index, preblock_sequences_without_gap, preblock.term1, preblock.term2, preblock.term3, preblock.alignment))#added one inconserved position
   2.124 +        if len(preblock_sequences_with_gap)>=2:
   2.125 +            preblocks_with_gap.append(Preblock(preblock.first_column_index, preblock.last_column_index, preblock_sequences_with_gap, preblock.term1, preblock.term2, preblock.term3, preblock.alignment))
   2.126 +    return preblocks_without_gap, preblocks_with_gap
   2.127 +
   2.128 +
   2.129 +def remove_contained_preblocks(preblocks):
   2.130 +    """removes blocks that have rivals with same sets of sequences,
   2.131 +    but shorter and with inferior scores, normalized to the number
   2.132 +    of sequences within the block
   2.133 +
   2.134 +    """
   2.135 +    output=[]
   2.136 +    counter = len(preblocks)-1
   2.137 +    while counter!=-1:
   2.138 +        for preblock in preblocks:
   2.139 +            try:
   2.140 +                if preblock is not preblocks[counter]:
   2.141 +                    for sequence in preblocks[counter].sequences:
   2.142 +                        if sequence not in preblock.sequences: raise BreakCycleException
   2.143 +                    #if preblocks passes the threshold of weight, discard the preblocks[counter] whatever, cause, it or at least its part should've been already accepted.
   2.144 +                    if preblock.calculate_score() > calculate_threshold(len(preblock.sequences), len(preblock.alignment.sequences), functional_groups)\
   2.145 +                        and preblocks[counter].last_column_index<=preblock.last_column_index:
   2.146 +                        preblocks.remove(preblocks[counter])
   2.147 +                        break
   2.148 +                    #create normalized preblock with same set of sequences as the examined one
   2.149 +                    normalized_preblock = Preblock(preblock.first_column_index, preblock.last_column_index, preblocks[counter].sequences, preblock.term1, preblock.term2, preblock.term3, preblock.alignment)
   2.150 +                    if preblocks[counter].first_column_index>=preblock.first_column_index\
   2.151 +                        and preblocks[counter].last_column_index<=preblock.last_column_index\
   2.152 +                        and preblocks[counter].calculate_score() <= normalized_preblock.calculate_score():
   2.153 +                        preblocks.remove(preblocks[counter])
   2.154 +                        break
   2.155 +            except BreakCycleException:
   2.156 +                pass
   2.157 +        counter-=1
   2.158 +
   2.159 +
   2.160 +def calculate_threshold(number_of_sequences_in_preblock, number_of_sequences_in_alignment, functional_groups):
   2.161 +    """
   2.162 +    We seek to find the probability, that given n randomly generated
   2.163 +    sequences we won't find alignment of them, such that there is a
   2.164 +    block of k sequences with score not less than the observed.
   2.165 +
   2.166 +    First, we have to pick 5 sequences out of 20;
   2.167 +    then iterate over all the ways to shift them against each other.
   2.168 +
   2.169 +    The number of shifts of length 1 is roughly 5*4*len(alignment)-1)^3
   2.170 +    The number of shifts of length 2 is roughly 5*4*len(alignment)-2)^3
   2.171 +    ...
   2.172 +
   2.173 +    Considering these shifts independent, we seek to find the p-values
   2.174 +    for each of the scores to be not found in all the stretches.
   2.175 +
   2.176 +    Thus, for each shift we calculate the weights of all groups and
   2.177 +    inconserved positions, create the transition matrix, calculate
   2.178 +    its eigenvectors and eigenvalues, round the weights, find coefficient
   2.179 +    of [1,0,0,...] decomposition in the basis of eigenvectors.
   2.180 +    
   2.181 +
   2.182 +    """
   2.183 +
   2.184 +    #length_of_stretch = math.factorial(number_of_sequences_in_alignment)/math.factorial(number_of_sequences_in_preblock)/math.factorial(number_of_sequences_in_preblock)*^number_of_sequences_in_alignment
   2.185 +    return (-1)*functional_groups["u"]["70%"]*1.5*number_of_sequences_in_preblock
   2.186 +
   2.187 +
   2.188 +def create_links(preblock, alignment):
   2.189 +    """links are lists of sequences"""
   2.190 +    for column in alignment.columns[preblock.first_column_index:preblock.last_column_index+1]:
   2.191 +        if preblock.sequences[0] not in column: continue #ignore gap columns
   2.192 +        for sequence in preblock.sequences:
   2.193 +            if "links" not in column[sequence].__dict__:
   2.194 +                column[sequence].links=[]
   2.195 +            remaining_sequences = copy.copy(preblock.sequences)
   2.196 +            remaining_sequences.remove(sequence)
   2.197 +            for iterated_sequence in remaining_sequences:#create links both for sequence and iterated sequence
   2.198 +                if iterated_sequence not in column[sequence].links: column[sequence].links.append(iterated_sequence)
   2.199 +                if "links" not in column[iterated_sequence].__dict__:
   2.200 +                    column[iterated_sequence].links=[sequence]
   2.201 +                else:
   2.202 +                    if sequence not in column[iterated_sequence].links: column[iterated_sequence].links.append(sequence)
   2.203 +
   2.204 +
   2.205 +def mark_blocks(alignment):
   2.206 +    output=[]
   2.207 +    blocks_in_previous_column = []
   2.208 +    for column in alignment.columns:
   2.209 +        blocks_in_this_column = []
   2.210 +        remaining_sequences=copy.copy(alignment.sequences)
   2.211 +        while remaining_sequences !=[]:
   2.212 +            connected_component=find_connected_component(remaining_sequences, column)
   2.213 +            if connected_component==[]: break
   2.214 +            #if connected_component extends existing block, extend that block, else create new one
   2.215 +            appended_to_block_flag = False
   2.216 +            for block in blocks_in_previous_column:
   2.217 +                if set(connected_component)==set(block.sequences):
   2.218 +                    block.columns.append(column)
   2.219 +                    blocks_in_this_column.append(block)
   2.220 +                    appended_to_block_flag = True
   2.221 +            if appended_to_block_flag==False:
   2.222 +               blocks_in_this_column.append(protein.Block.from_alignment(alignment, connected_component, [column]))
   2.223 +            for sequence in connected_component:
   2.224 +                remaining_sequences.remove(sequence)
   2.225 +        for block in blocks_in_previous_column:
   2.226 +            if block not in blocks_in_this_column:
   2.227 +                output.append(block)
   2.228 +        blocks_in_previous_column = blocks_in_this_column
   2.229 +        if alignment.columns.index(column) == len(alignment.columns)-1: output+=blocks_in_this_column
   2.230 +    return output
   2.231 +               
   2.232 +                
   2.233 +def find_connected_component(sequences, column):
   2.234 +    #find one sequence that has blocks
   2.235 +    output=[]
   2.236 +    if sequences!=[]:
   2.237 +        for sequence in sequences:
   2.238 +            if sequence not in column: continue
   2.239 +            if "links" in column[sequence].__dict__:
   2.240 +                output.append(sequence)
   2.241 +                break
   2.242 +        if output == []: return output
   2.243 +        #find all sequences in the same connected component
   2.244 +        not_exhausted_flag=True
   2.245 +        sequences_copy=copy.copy(sequences)
   2.246 +        output_counter=0
   2.247 +        while not_exhausted_flag:
   2.248 +            not_exhausted_flag = False
   2.249 +            for sequence in column[output[output_counter]].links:
   2.250 +                if sequence not in output and sequence in sequences:
   2.251 +                    output.append(sequence)
   2.252 +                    output_counter+=1
   2.253 +                    not_exhausted_flag=True
   2.254 +    return output
   2.255 +                
   2.256 +
   2.257 +def delete_links(alignment):
   2.258 +    for sequence in alignment.sequences:
   2.259 +        for monomer in sequence:
   2.260 +            if "links" in monomer.__dict__:
   2.261 +                del monomer.links
   2.262 +
   2.263 +   
   2.264 +if __name__== '__main__':
   2.265 +    main(open(sys.argv[1]))

     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/sequence_based_blocks_search/functional_groups.py	Wed Jun 08 15:15:57 2011 +0400
     3.3 @@ -0,0 +1,59 @@
     3.4 +# group name: aminoacids in group, logprobability of group for 100% identical sequences, >60%, >30%
     3.5 +functional_groups={
     3.6 +                "b":{"aminoacids":"IVLMFA","100%":-1.671163536, "70%":-3,  "30%":-4.0},
     3.7 +                "t":{"aminoacids":"AGSTC", "100%":-1.625934282, "70%":-3,  "30%":-4.0},
     3.8 +                "u":{"aminoacids":"HKR",   "100%":-2.805912948, "70%":-4,  "30%":-4.5},
     3.9 +                "l":{"aminoacids":"IVLM",  "100%":-2.321928095, "70%":-4,  "30%":-4.5},
    3.10 +                "c":{"aminoacids":"IV",    "100%":-3.23786383,  "70%":-4,  "30%":-4.5},
    3.11 +                "a":{"aminoacids":"FWY",   "100%":-3.53951953,  "70%":-4,  "30%":-4.5},
    3.12 +                "d":{"aminoacids":"DE",    "100%":-3.095419565, "70%":-4,  "30%":-4.5},
    3.13 +                "e":{"aminoacids":"KR",    "100%":-3.13289427,  "70%":-4,  "30%":-4.5},
    3.14 +                "g":{"aminoacids":"DN",    "100%":-3.279283757, "70%":-4,  "30%":-4.5},
    3.15 +                "f":{"aminoacids":"EQ",    "100%":-3.395928676, "70%":-4,  "30%":-4.5},
    3.16 +                "i":{"aminoacids":"ST",    "100%":-2.805912948, "70%":-4,  "30%":-4.5},
    3.17 +                "h":{"aminoacids":"AG",    "100%":-2.756330919, "70%":-4,  "30%":-4.5},
    3.18 +                "A":{"aminoacids":"A",     "100%":-3.756330919, "70%":-5,  "30%":-5.5},
    3.19 +                "G":{"aminoacids":"G",     "100%":-3.756330919, "70%":-5,  "30%":-5.5},
    3.20 +                "S":{"aminoacids":"S",     "100%":-3.625934282, "70%":-5,  "30%":-5.5},
    3.21 +                "T":{"aminoacids":"T",     "100%":-4.011587974, "70%":-5,  "30%":-5.5},
    3.22 +                "C":{"aminoacids":"C",     "100%":-4.921390165, "70%":-6,  "30%":-6.5},
    3.23 +                "P":{"aminoacids":"P",     "100%":-4.321928095, "70%":-5,  "30%":-6.5},
    3.24 +                "N":{"aminoacids":"N",     "100%":-4.506352666, "70%":-5,  "30%":-6.5},
    3.25 +                "H":{"aminoacids":"H",     "100%":-5.10780329,  "70%":-6,  "30%":-7  },
    3.26 +                "K":{"aminoacids":"K",     "100%":-3.795859283, "70%":-5,  "30%":-5.5},
    3.27 +                "R":{"aminoacids":"R",     "100%":-4.573466862, "70%":-5,  "30%":-5.5},
    3.28 +                "D":{"aminoacids":"D",     "100%":-4.083141235, "70%":-5,  "30%":-5.5},
    3.29 +                "E":{"aminoacids":"E",     "100%":-4.10780329,  "70%":-5,  "30%":-5.5},
    3.30 +                "Q":{"aminoacids":"Q",     "100%":-4.756330919, "70%":-5,  "30%":-5.5},
    3.31 +                "I":{"aminoacids":"I",     "100%":-4.717856771, "70%":-5,  "30%":-5.5},
    3.32 +                "L":{"aminoacids":"L",     "100%":-3.717856771, "70%":-5,  "30%":-5.5},
    3.33 +                "V":{"aminoacids":"V",     "100%":-3.878321443, "70%":-5,  "30%":-5.5},
    3.34 +                "F":{"aminoacids":"F",     "100%":-4.64385619,  "70%":-5,  "30%":-5.5},
    3.35 +                "W":{"aminoacids":"W",     "100%":-6.265344567, "70%":-7,  "30%":-7.5},
    3.36 +                "Y":{"aminoacids":"Y",     "100%":-4.921390165, "70%":-5,  "30%":-5.5},
    3.37 +                "M":{"aminoacids":"M",     "100%":-5.795859283, "70%":-6.5,"30%": -7}
    3.38 +                }
    3.39 +
    3.40 +aminoacids2functional_groups={
    3.41 +                "A":["A", "b", "h", "t"],
    3.42 +                "C":["C", "t"],
    3.43 +                "E":["E", "d", "f"],
    3.44 +                "D":["D", "d", "g"],
    3.45 +                "G":["G", "h", "t"],
    3.46 +                "F":["F", "a", "b"],
    3.47 +                "I":["I", "c", "b", "l"],
    3.48 +                "H":["H", "u"],
    3.49 +                "K":["K", "e", "u"],
    3.50 +                "M":["M", "b", "l"],
    3.51 +                "L":["L", "b", "l"],
    3.52 +                "N":["N", "g"],
    3.53 +                "Q":["Q", "f"],
    3.54 +                "P":["P"],
    3.55 +                "S":["S", "i", "t"],
    3.56 +                "R":["R", "e", "u"],
    3.57 +                "T":["T", "i", "t"],
    3.58 +                "W":["W", "a"],
    3.59 +                "V":["V", "c", "b", "l"],
    3.60 +                "Y":["Y", "a"]
    3.61 +}
    3.62 +