Документ взят из кэша поисковой машины. Адрес оригинального документа: http://kodomo.fbb.msu.ru/hg/allpy/raw-rev/2f016bfda114
Дата изменения: Unknown
Дата индексирования: Tue Oct 2 08:09:00 2012
Кодировка:

# HG changeset patch
# User Boris Burkov
# Date 1309499569 -14400
# Node ID 2f016bfda1145d5fbacae1ba90d9db0879c574fe
# Parent c32e728b98d713bdd5f388bb4d7707044e35308e
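Massive changes to my program: look up columns and sequences by identity, create N links per preblock instead of N^2, accept overlapping preblocks, rewrite connected-component search with a queue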

diff -r c32e728b98d7 -r 2f016bfda114 allpy/base.py
--- a/allpy/base.py Tue Jun 14 13:31:18 2011 +0400
+++ b/allpy/base.py Fri Jul 01 09:52:49 2011 +0400
@@ -89,6 +89,10 @@

def __eq__(self, other):
"""Monomers within same monomer type are compared by code1."""
+ #import traceback
+ #for entry in traceback.extract_stack():
+ # if "remove_contained_preblocks" in entry[2]:
+ # print traceback.extract_stack()
if not other:
return False
assert self.type == other.type
@@ -151,6 +155,7 @@
"""Hash sequence by identity."""
return id(self)

+
class Alignment(object):
"""Alignment. It is a list of Columns."""

@@ -451,8 +456,16 @@
"""Return hash by identity."""
return id(self)

+ def MyIndex(self):
+ for index, column in enumerate(self.alignment.columns):
+ if column is self: return index
+ raise ValueException
+
def __repr__(self):
- return ""%(str(self.alignment.columns.index(self)))
+ #!!!!!!!!! READ HOW index OF LIST COMPARES OBJECTS AND BASIC TYPES
+ return ""%(str(self.MyIndex()))
+
+

class Block(Alignment):
diff -r c32e728b98d7 -r 2f016bfda114 sequence_based_blocks_search/blocks_finder.py
--- a/sequence_based_blocks_search/blocks_finder.py Tue Jun 14 13:31:18 2011 +0400
+++ b/sequence_based_blocks_search/blocks_finder.py Fri Jul 01 09:52:49 2011 +0400
@@ -101,18 +101,24 @@
for preblock in candidate_preblocks: preblock.term3 += math.log(0.3) - math.log(1)
candidate_preblocks+=addition_to_candidate_preblocks
#remove from candidates those preblocks that share the same set of sequences with others, but are shorter and have lower scores
- print index_of_column, len(candidate_preblocks),
+ #print index_of_column, len(candidate_preblocks),
+ print index_of_column, len(candidate_preblocks)
if len(candidate_preblocks)>1000:
import cProfile
cProfile.runctx('remove_contained_preblocks(candidate_preblocks)',globals(),locals())
else:
remove_contained_preblocks(candidate_preblocks)
- print len(candidate_preblocks)
- remove_bad_candidates(candidate_preblocks, index_of_column, 0.5, 10)
+ #print len(candidate_preblocks)
+ #remove_bad_candidates(candidate_preblocks, index_of_column, 0.5, 10)
#for those preblocks ending with a conserved position and having the score above threshold, add new links
+ new_candidate_preblocks=[]
+ overlapping_preblocks=[]
for preblock in candidate_preblocks:
if preblock.alignment.columns[preblock.last_column_index] is column and preblock.calculate_score() > calculate_threshold(len(preblock.sequences), len(alignment.sequences), functional_groups):
create_links(preblock, alignment) #links are not cliques, just connected components
+ overlapping_preblocks+=accept_and_return_overlapping_preblocks_before(preblock, alignment, candidate_preblocks)
+ if preblock not in overlapping_preblocks: new_candidate_preblocks.append(preblock)
+ candidate_preblocks = new_candidate_preblocks

def split_preblocks_by_presense_of_gaps(preblocks, column):
@@ -146,11 +152,21 @@
output=[]
counter = len(preblocks)-1
while counter!=-1:
- for preblock in preblocks:
- try:
- if preblock is not preblocks[counter]:
+ for index_of_preblock, preblock in enumerate(preblocks):
+ if index_of_preblock != counter:
+ try:
for sequence in preblocks[counter].sequences:
- if sequence not in preblock.sequences: raise BreakCycleException
+ sequence_present=False
+ for another_sequence in preblock.sequences:
+ if sequence is another_sequence:
+ sequence_present=True
+ break
+ if not sequence_present: raise BreakCycleException
+# for sequence in preblocks[counter].sequences:
+# if sequence not in preblock.sequences:
+# raise BreakCycleException
+
+ #if sequence not in preblock.sequences: raise BreakCycleException
#if preblocks passes the threshold of weight, discard the preblocks[counter] whatever, cause, it or at least its part should've been already accepted.
if preblock.calculate_score() > calculate_threshold(len(preblock.sequences), len(preblock.alignment.sequences), functional_groups)\
and preblocks[counter].last_column_index<=preblock.last_column_index:
@@ -163,8 +179,8 @@
and preblocks[counter].calculate_score() <= normalized_preblock.calculate_score():
preblocks.remove(preblocks[counter])
break
- except BreakCycleException:
- pass
+ except BreakCycleException:
+ pass
counter-=1

@@ -207,26 +223,60 @@

def create_links(preblock, alignment):
- """links are lists of sequences"""
+ """links are lists of sequences, we create N links rather than N^2 - no cliques!"""
for column in alignment.columns[preblock.first_column_index:preblock.last_column_index+1]:
- if preblock.sequences[0] not in column: continue #ignore gap columns
- for sequence in preblock.sequences:
- if "links" not in column[sequence].__dict__:
- column[sequence].links=[]
- remaining_sequences = copy.copy(preblock.sequences)
- remaining_sequences.remove(sequence)
- for iterated_sequence in remaining_sequences:#create links both for sequence and iterated sequence
- if iterated_sequence not in column[sequence].links: column[sequence].links.append(iterated_sequence)
- if "links" not in column[iterated_sequence].__dict__:
- column[iterated_sequence].links=[sequence]
+ #any_sequence = preblock.sequences.pop()
+ #preblock.sequences.add(any_sequence)
+ any_sequence = preblock.sequences[0]
+ if any_sequence not in column: continue #ignore gap columns
+# for sequence in preblock.sequences:
+# if "links" not in column[sequence].__dict__:
+# column[sequence].links=[]
+# remaining_sequences = copy.copy(preblock.sequences)
+# remaining_sequences.remove(sequence)
+# for iterated_sequence in remaining_sequences:#create links both for sequence and iterated sequence
+# if iterated_sequence not in column[sequence].links: column[sequence].links.append(iterated_sequence)
+# if "links" not in column[iterated_sequence].__dict__:
+# column[iterated_sequence].links=[sequence]
+# else:
+# if sequence not in column[iterated_sequence].links: column[iterated_sequence].links.append(sequence)
+
+ if "links" not in column[any_sequence].__dict__:
+ column[any_sequence].links=[]
+ remaining_sequences = copy.copy(preblock.sequences)
+ remaining_sequences.remove(any_sequence)
+ for sequence in preblock.sequences:#links are bidirectional
+ if sequence not in column[any_sequence].links:
+ column[any_sequence].links.append(sequence)
+ if "links" not in column[sequence].__dict__:
+ column[sequence].links=[any_sequence]
else:
- if sequence not in column[iterated_sequence].links: column[iterated_sequence].links.append(sequence)
+ if any_sequence not in column[sequence].links:
+ column[sequence].links.append(any_sequence)
+
+
+
+def accept_and_return_overlapping_preblocks_before(preblock, alignment, preblocks):
+ def check_sublist(greater_list, smaller_list):
+ for element in smaller_list:
+ if element not in greater_list: return False
+ return True
+
+ output=[]
+ for current_preblock in preblocks:
+ if current_preblock.last_column_index >= preblock.first_column_index and\
+ current_preblock.last_column_index <= preblock.last_column_index and\
+ check_sublist(preblock.sequences, current_preblock.sequences) and\
+ current_preblock is not preblock:
+ create_links(current_preblock, alignment)
+ output.append(current_preblock)
+ return output

def mark_blocks(alignment):
output=[]
blocks_in_previous_column = []
- for column in alignment.columns:
+ for index_of_column, column in enumerate(alignment.columns):
blocks_in_this_column = []
remaining_sequences=copy.copy(alignment.sequences)
while remaining_sequences !=[]:
@@ -247,31 +297,29 @@
if block not in blocks_in_this_column:
output.append(block)
blocks_in_previous_column = blocks_in_this_column
+ #print index_of_column, [gblock.columns for gblock in blocks_in_this_column]
if alignment.columns.index(column) == len(alignment.columns)-1: output+=blocks_in_this_column
return output

def find_connected_component(sequences, column):
#find one sequence that has blocks
+ queue=[]
output=[]
if sequences!=[]:
for sequence in sequences:
if sequence not in column: continue
if "links" in column[sequence].__dict__:
- output.append(sequence)
+ queue.append(sequence)
break
- if output == []: return output
+ if queue == []: return output
#find all sequences in the same connected component
- not_exhausted_flag=True
- sequences_copy=copy.copy(sequences)
- output_counter=0
- while not_exhausted_flag:
- not_exhausted_flag = False
- for sequence in column[output[output_counter]].links:
- if sequence not in output and sequence in sequences:
- output.append(sequence)
- output_counter+=1
- not_exhausted_flag=True
+ while len(queue)>0:
+ sequence = queue.pop()
+ output.append(sequence)
+ for another_sequence in column[sequence].links:
+ if another_sequence not in queue and another_sequence not in output and another_sequence in sequences:
+ queue.append(another_sequence)
return output

@@ -281,6 +329,8 @@
if "links" in monomer.__dict__:
del monomer.links

-
+#[gblock for gblock in blocks_in_previous_column for gcolumn in gblock.columns if alignment.columns.index(gcolumn)==147]
+
+
if __name__== '__main__':
main(open(sys.argv[1]))