allpy

changeset 648:f4f658d86ca1
Some testing done for the time-consumption optimisation of blocks_finder.remove_contained_preblocks.
author: Boris Burkov <BurkovBA@gmail.com>
date: Fri, 10 Jun 2011 15:33:16 +0400
parents: b35116e13f35
children: e3f06e45de03
files: sequence_based_blocks_search/blocks_finder.py
diffstat: 1 files changed, 24 insertions(+), 3 deletions(-) [+]
[-]

sequence_based_blocks_search/blocks_finder.py | 27 ++++++++++++++++++++++++---
     1.1 --- a/sequence_based_blocks_search/blocks_finder.py	Wed Jun 08 15:15:57 2011 +0400
     1.2 +++ b/sequence_based_blocks_search/blocks_finder.py	Fri Jun 10 15:33:16 2011 +0400
     1.3 @@ -35,7 +35,13 @@
     1.4          return self.term1 - len(self.sequences)*self.term2 + self.term3
     1.5  
     1.6      def __repr__(self):
     1.7 -        return "preblock:%s, %s, %s, %s"%(self.first_column_index, self.last_column_index, self.calculate_score(), self.sequences)
     1.8 +        representation_of_sequences = "["
     1.9 +        for sequence in self.sequences:
    1.10 +            if sequence.name: representation_of_sequences += ""+str(sequence.name)+", "
    1.11 +            else: representation_of_sequences += ""+repr(sequence.name)+", "
    1.12 +        if representation_of_sequences!="[": representation_of_sequences=representation_of_sequences[:-2]+"]"
    1.13 +        else: representation_of_sequences+="]"
    1.14 +        return "<Preblock %s, %s, %s, %s>"%(self.first_column_index, self.last_column_index, self.calculate_score(), representation_of_sequences)
    1.15  
    1.16  
    1.17  class BreakCycleException(Exception):
    1.18 @@ -91,10 +97,15 @@
    1.19          #add one inconserved column to the old ones, merge all together
    1.20          for preblock in candidate_preblocks: preblock.term3 += math.log(0.3) - math.log(1)
    1.21          candidate_preblocks+=addition_to_candidate_preblocks
    1.22 -        #remove from candidates those preblocks that share the same set of sequences with others, but are shorter and have smaller score
    1.23 +        #remove from candidates those preblocks that share the same set of sequences with others, but are shorter and have lower scores
    1.24          print index_of_column, len(candidate_preblocks),
    1.25 -        remove_contained_preblocks(candidate_preblocks)
    1.26 +        if len(candidate_preblocks)>1000:
    1.27 +            import cProfile
    1.28 +            cProfile.runctx('remove_contained_preblocks(candidate_preblocks)',globals(),locals())
    1.29 +        else:
    1.30 +            remove_contained_preblocks(candidate_preblocks)
    1.31          print len(candidate_preblocks)
    1.32 +        remove_bad_candidates(candidate_preblocks, index_of_column, 0.5, 10)
    1.33          #for those preblocks ending with a conserved position and having the score above threshold, add new links
    1.34          for preblock in candidate_preblocks:
    1.35              if preblock.alignment.columns[preblock.last_column_index] is column and preblock.calculate_score() > calculate_threshold(len(preblock.sequences), len(alignment.sequences), functional_groups):
    1.36 @@ -154,6 +165,16 @@
    1.37          counter-=1
    1.38  
    1.39  
    1.40 +def remove_bad_candidates(preblocks, index_of_column, relative_weight_threshold, distance_threshold):
    1.41 +    for preblock in preblocks:
    1.42 +        if preblock.calculate_score() > calculate_threshold(len(preblock.sequences), len(preblock.alignment.sequences), functional_groups):
    1.43 +            pass
    1.44 +        else:
    1.45 +            if preblock.calculate_score() < calculate_threshold(len(preblock.sequences), len(preblock.alignment.sequences), functional_groups)\
    1.46 +                and index_of_column - preblock.first_column_index > distance_threshold:
    1.47 +                preblocks.remove(preblock)
    1.48 +
    1.49 +
    1.50  def calculate_threshold(number_of_sequences_in_preblock, number_of_sequences_in_alignment, functional_groups):
    1.51      """
    1.52      We seek to find the probability, that given n randomly generated