allpy

changeset 1050:00d6145cba0f 1.4.2
Changes in blocks_finder transferred from default to 1.4.2.
author: Boris Burkov <BurkovBA@gmail.com>
date: Mon, 26 Mar 2012 22:08:08 +0400
parents: 73579ca7c1b5
children: 027f0f2f5417
files: sequence_based_blocks_search/blocks_finder.py
diffstat: 1 files changed, 84 insertions(+), 13 deletions(-) [+]
[-]

sequence_based_blocks_search/blocks_finder.py 97 sequence_based_blocks_search/blocks_finder.py 97
sequence_based_blocks_search/blocks_finder.py 97
     1.1 --- a/sequence_based_blocks_search/blocks_finder.py	Fri Mar 23 01:34:48 2012 +0400
     1.2 +++ b/sequence_based_blocks_search/blocks_finder.py	Mon Mar 26 22:08:08 2012 +0400
     1.3 @@ -4,10 +4,47 @@
     1.4  sys.path.append("../")
     1.5  from allpy import protein
     1.6  from allpy.homology import *
     1.7 +from allpy import data
     1.8 +from allpy import base
     1.9  import copy
    1.10  import math
    1.11  from functional_groups import functional_groups
    1.12  from allpy.util import *
    1.13 +from optparse import OptionParser
    1.14 +
    1.15 +
    1.16 +class SbbsProteinMonomer(protein.Monomer):
    1.17 +    def __eq__(self, other):
    1.18 +        return self is other
    1.19 +
    1.20 +    def __ne__(self, other):
    1.21 +        return not self is other
    1.22 +
    1.23 +SbbsProteinMonomer._initialize(data.codes.protein)
    1.24 +
    1.25 +class SbbsProteinAlignment(protein.Alignment):
    1.26 +    def rows_as_strings(self, gaps='-'):
    1.27 +        rows = []
    1.28 +        for sequence in self.sequences:
    1.29 +            string = ""
    1.30 +            for column in self.columns:
    1.31 +                if sequence in column:
    1.32 +                    if hasattr(column[sequence], "input_code1"):
    1.33 +                        string += column[sequence].input_code1
    1.34 +                    else:
    1.35 +                        string += column[sequence].code1
    1.36 +                else:
    1.37 +                    string += gap
    1.38 +            string = util.UserString(string)
    1.39 +            string.sequence = sequence
    1.40 +            rows.append(string)
    1.41 +        return rows
    1.42 +
    1.43 +class SbbsProteinColumn(base.Column):
    1.44 +    def __eq__(self, other):
    1.45 +        return self is other
    1.46 +
    1.47 +vars(base)['Column'] = SbbsProteinColumn
    1.48  
    1.49  class Preblock(object):
    1.50      """Attributes:
    1.51 @@ -51,7 +88,10 @@
    1.52  
    1.53  
    1.54  def main(file):
    1.55 -    alignment = protein.Alignment().append_file(file)
    1.56 +    try:
    1.57 +        alignment = SbbsProteinAlignment().append_file(file)
    1.58 +    except Exception as E:
    1.59 +        raise Exception("Failed to read alignment\nCheck alignment format")
    1.60      groups_content = find_groups_content(alignment, functional_groups)
    1.61      find_links(groups_content, alignment)
    1.62      blocks = mark_blocks(alignment)
    1.63 @@ -88,7 +128,7 @@
    1.64                      addition_to_candidate_preblocks.append(Preblock(index_of_column, index_of_column, copy.copy(groups_content[column][group_name]), functional_groups[group_name]["70%"], functional_groups[group_name]["100%"], 0.0, alignment))
    1.65          except KeyError as ke:
    1.66              print "Found non-canonical aminoacid: %s"%(ke)
    1.67 -        #create new preblocks as combinations of the old ones 
    1.68 +        #create new preblocks as combinations of the old ones
    1.69          try:
    1.70              for preblock in candidate_preblocks:
    1.71                  preblock_extensions = []
    1.72 @@ -117,7 +157,7 @@
    1.73          overlapping_preblocks=[]
    1.74          for preblock in candidate_preblocks:
    1.75              if preblock.alignment.columns[preblock.last_column_index] is column and preblock.calculate_score() > calculate_threshold(len(preblock.sequences), len(alignment.sequences), functional_groups):
    1.76 -                create_links(preblock, alignment) #links are not cliques, just connected components 
    1.77 +                create_links(preblock, alignment) #links are not cliques, just connected components
    1.78                  overlapping_preblocks+=accept_and_return_overlapping_preblocks_before(preblock, alignment, candidate_preblocks)
    1.79              if preblock not in overlapping_preblocks: new_candidate_preblocks.append(preblock)
    1.80          candidate_preblocks = new_candidate_preblocks
    1.81 @@ -218,7 +258,7 @@
    1.82      inconserved positions, create the transition matrix, calculate
    1.83      its eigenvectors and eigenvalues, round the weights, find coefficient
    1.84      of [1,0,0,...] decomposition in the basis of eigenvectors.
    1.85 -    
    1.86 +
    1.87  
    1.88      """
    1.89  
    1.90 @@ -244,13 +284,13 @@
    1.91  #                    column[iterated_sequence].links=[sequence]
    1.92  #                else:
    1.93  #                    if sequence not in column[iterated_sequence].links: column[iterated_sequence].links.append(sequence)
    1.94 -       
    1.95 +
    1.96          if "links" not in column[any_sequence].__dict__:
    1.97              column[any_sequence].links=[]
    1.98          remaining_sequences = copy.copy(preblock.sequences)
    1.99          remaining_sequences.remove(any_sequence)
   1.100          for sequence in preblock.sequences:#links are bidirectional
   1.101 -            if sequence not in column[any_sequence].links: 
   1.102 +            if sequence not in column[any_sequence].links:
   1.103                  column[any_sequence].links.append(sequence)
   1.104                  if "links" not in column[sequence].__dict__:
   1.105                      column[sequence].links=[any_sequence]
   1.106 @@ -304,8 +344,8 @@
   1.107          #print index_of_column, [gblock.columns for gblock in blocks_in_this_column]
   1.108          if alignment.columns.index(column) == len(alignment.columns)-1: output+=blocks_in_this_column
   1.109      return output
   1.110 -               
   1.111 -                
   1.112 +
   1.113 +
   1.114  def find_connected_component(sequences, column):
   1.115      """output is a list of sequences"""
   1.116      #find one sequence that has blocks
   1.117 @@ -326,7 +366,7 @@
   1.118                  if another_sequence not in queue and another_sequence not in output and another_sequence in sequences:
   1.119                      queue.append(another_sequence)
   1.120      return output
   1.121 -                
   1.122 +
   1.123  
   1.124  def delete_links(alignment):
   1.125      for sequence in alignment.sequences:
   1.126 @@ -334,7 +374,7 @@
   1.127              if "links" in monomer.__dict__:
   1.128                  del monomer.links
   1.129  
   1.130 -#[gblock for gblock in blocks_in_previous_column for gcolumn in gblock.columns if alignment.columns.index(gcolumn)==147]   
   1.131 +#[gblock for gblock in blocks_in_previous_column for gcolumn in gblock.columns if alignment.columns.index(gcolumn)==147]
   1.132  
   1.133  def create_monomer_homology(alignment):
   1.134      """returns MonomerHomology object, given alignment
   1.135 @@ -380,8 +420,16 @@
   1.136          sys.exit()
   1.137      #creating markups
   1.138      for sequence in alignment.sequences:
   1.139 -        sequence.add_markup('index')
   1.140 -    aim = alignment.add_markup('index')
   1.141 +        if 'markups' in sequence.__dict__:
   1.142 +            if 'index' in sequence.markups:
   1.143 +                pass
   1.144 +            else:
   1.145 +                sequence.add_markup('index')
   1.146 +    if 'markups' in alignment.__dict__:
   1.147 +        if 'index' in alignment.markups:
   1.148 +            aim = alignment.markups['index']
   1.149 +        else:
   1.150 +            aim = alignment.add_markup('index')
   1.151      #inferring classes_of_equivalence = homologous monomers = connected_components from links
   1.152      class_number = 0
   1.153      for column in alignment.columns:
   1.154 @@ -408,4 +456,27 @@
   1.155  
   1.156  
   1.157  if __name__== '__main__':
   1.158 -    main(open(sys.argv[1]))
   1.159 +    usage = "usage: %prog [options] arg"
   1.160 +    parser = OptionParser(usage)
   1.161 +    parser.add_option("-c", "--classes", dest="output_classes", help="Name of optional output file with homology classes")
   1.162 +    (options, args) = parser.parse_args()
   1.163 +    input_file = open(args[0])
   1.164 +    blocks = main(input_file)
   1.165 +    if blocks == []:
   1.166 +        sys.exit()
   1.167 +    aim = blocks[0].alignment.add_markup('index') #create markup
   1.168 +    file_content=""
   1.169 +    for index, block in enumerate(blocks):
   1.170 +        for sequence in block.sequences:
   1.171 +            file_content+=str(str(args[1]))+"\t"+str(index)+"\t"+str(sequence.name)+"\t"
   1.172 +            buffer=""
   1.173 +            for column in block.columns:
   1.174 +                buffer+=str(aim[column])+" "
   1.175 +            if (len(buffer)!=0): buffer=buffer[:-1]
   1.176 +            file_content+=buffer+"\n"
   1.177 +
   1.178 +    output_file = open(args[1], 'w')
   1.179 +    output_file.write(file_content)
   1.180 +
   1.181 +    if options.output_classes:
   1.182 +        create_file_with_monomer_homology(blocks[0].alignment, options.output_classes)