allpy

changeset 833:2060c8ef164b
Added script utils/convert_homology.py to convert homology files for similar alignments.
author: Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date: Mon, 18 Jul 2011 19:26:03 +0400
parents: 241c4dfb1383
children: 0dc6d90af0cf
files: utils/convert_homology.py
diffstat: 1 files changed, 78 insertions(+), 0 deletions(-) [+]
[-]

utils/convert_homology.py 78 utils/convert_homology.py 78
utils/convert_homology.py 78
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/utils/convert_homology.py	Mon Jul 18 19:26:03 2011 +0400
     1.3 @@ -0,0 +1,78 @@
     1.4 +#!/usr/bin/python
     1.5 +import sys
     1.6 +from allpy import protein, homology, processors
     1.7 +
     1.8 +aln1 = protein.Alignment().append_file(open(sys.argv[1]), format="markup")
     1.9 +aln2 = protein.Alignment().append_file(open(sys.argv[2]), format="markup")
    1.10 +hmg = open(sys.argv[3])
    1.11 +outfile_name = sys.argv[4]
    1.12 +
    1.13 +mon_map = {}
    1.14 +for seq1, seq2 in zip(aln1.sequences, aln2.sequences):
    1.15 +    assert seq1.name == seq2.name
    1.16 +    assert seq1.description == seq2.description
    1.17 +    seq2.name += "_2"
    1.18 +    aln = protein.Alignment().append_sequence(seq1).append_sequence(seq2)
    1.19 +    aln.realign(processors.Muscle())
    1.20 +    seq1.add_markup('number')
    1.21 +    seq2.add_markup('number')
    1.22 +    for column in aln.columns:
    1.23 +        if len(column) != 2:
    1.24 +            continue
    1.25 +        seq1_number = str(column[seq1].number)
    1.26 +        seq2_number = str(column[seq2].number)
    1.27 +        mon_map[seq1.name, seq1_number] = seq2_number
    1.28 +
    1.29 +col_map = {}
    1.30 +aln1_num = aln1.add_markup('number')
    1.31 +aln2_num = aln2.add_markup('number')
    1.32 +for seq1 in aln1.sequences:
    1.33 +    for col1 in aln1.columns:
    1.34 +        num1 = str(aln1_num[col1])
    1.35 +        if num1 in col_map:
    1.36 +            continue
    1.37 +        if seq1 not in col1:
    1.38 +            continue
    1.39 +        key = seq1.name, str(col1[seq1].number)
    1.40 +        if key not in mon_map:
    1.41 +            continue
    1.42 +        num2 = None
    1.43 +        try:
    1.44 +            for seq2 in aln2.sequences:
    1.45 +                if seq2.name.startswith(seq1.name):
    1.46 +                    mon2 = seq2[col1[seq1].number]
    1.47 +                    for col2 in aln2.columns:
    1.48 +                        if col2[seq2] is mon2:
    1.49 +                            num2 = str(aln2_num[col2])
    1.50 +                            raise Exception
    1.51 +        except Exception:
    1.52 +            pass
    1.53 +        if num2 is not None:
    1.54 +            col_map[num1] = num2
    1.55 +
    1.56 +print sorted(col_map.items(), key=lambda (a,b): int(a))
    1.57 +
    1.58 +out = open(outfile_name, "w")
    1.59 +skipped = set()
    1.60 +for line in hmg:
    1.61 +    line = line.strip()
    1.62 +    if line == "" or line.startswith("#"):
    1.63 +        continue
    1.64 +    class_id, seq_id, monomer_id, column_id = line.split()
    1.65 +    key = seq_id, monomer_id
    1.66 +    if key not in mon_map:
    1.67 +        if key not in skipped:
    1.68 +            print "Skipping monomer", key
    1.69 +            skipped.add(key)
    1.70 +        continue
    1.71 +    if column_id not in col_map:
    1.72 +        if column_id not in skipped:
    1.73 +            print "Skipping column", repr(column_id)
    1.74 +            skipped.add(column_id)
    1.75 +        continue
    1.76 +    monomer_id = mon_map[key]
    1.77 +    column_id = col_map[column_id]
    1.78 +    out.write("\t".join([class_id, seq_id, monomer_id, column_id]) + "\n")
    1.79 +out.close()
    1.80 +
    1.81 +# vim: set et ts=4 sts=4 sw=4: