allpy

changeset 726:dc3e9ff89ab5
Merge
author: Boris Burkov <BurkovBA@gmail.com>
date: Fri, 08 Jul 2011 18:38:31 +0400
parents: 0a7917b4ca71 890fc647f69c
children: 633a961e7586 3c09dabe0e96
diffstat: 5 files changed, 122 insertions(+), 40 deletions(-) [+]
[-]

docs/source/allpy/data.monomers.rst 5

docs/source/allpy/homology.rst 12

docs/source/allpy/markups.rst 12

test/Makefile 2

utils/extract_pfam.py 131 docs/source/allpy/data.monomers.rst 5 docs/source/allpy/homology.rst 12 docs/source/allpy/markups.rst 12 test/Makefile 2 utils/extract_pfam.py 131
docs/source/allpy/data.monomers.rst 5
docs/source/allpy/homology.rst 12
docs/source/allpy/markups.rst 12
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/docs/source/allpy/data.monomers.rst	Fri Jul 08 18:38:31 2011 +0400
     1.3 @@ -0,0 +1,5 @@
     1.4 +Monomers Documentation
     1.5 +======================
     1.6 +
     1.7 +This page contains the Monomers Package documentation.
     1.8 +

     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/docs/source/allpy/homology.rst	Fri Jul 08 18:38:31 2011 +0400
     2.3 @@ -0,0 +1,12 @@
     2.4 +Homology Documentation
     2.5 +======================
     2.6 +
     2.7 +This page contains the Homology Module documentation.
     2.8 +
     2.9 +The :mod:`homology` Module
    2.10 +--------------------------
    2.11 +
    2.12 +.. automodule:: allpy.homology
    2.13 +    :members:
    2.14 +    :undoc-members:
    2.15 +    :show-inheritance:

     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/docs/source/allpy/markups.rst	Fri Jul 08 18:38:31 2011 +0400
     3.3 @@ -0,0 +1,12 @@
     3.4 +Markups Documentation
     3.5 +=====================
     3.6 +
     3.7 +This page contains the Markups Module documentation.
     3.8 +
     3.9 +The :mod:`markups` Module
    3.10 +-------------------------
    3.11 +
    3.12 +.. automodule:: allpy.markups
    3.13 +    :members:
    3.14 +    :undoc-members:
    3.15 +    :show-inheritance:

     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/test/Makefile	Fri Jul 08 18:38:31 2011 +0400
     4.3 @@ -0,0 +1,2 @@
     4.4 +all:
     4.5 +	$(MAKE) -C .. tests

     5.1 --- a/utils/extract_pfam.py	Fri Jul 08 18:30:04 2011 +0400
     5.2 +++ b/utils/extract_pfam.py	Fri Jul 08 18:38:31 2011 +0400
     5.3 @@ -1,53 +1,104 @@
     5.4  #!/usr/bin/python
     5.5 -import optparse
     5.6  import gzip
     5.7 -import itertools
     5.8  import inspect
     5.9 +import os
    5.10 +import re
    5.11  from Bio import AlignIO, Align
    5.12 +from allpy import protein
    5.13  
    5.14 -name_by = "ac-id"
    5.15 +class Struct(object):
    5.16 +    pass
    5.17  
    5.18 -parser = optparse.OptionParser()
    5.19 -options, args = parser.parse_args()
    5.20 +options = Struct()
    5.21 +options.min_depth = 2
    5.22 +options.min_width = 20
    5.23 +options.infile_name = '/srv/databases/pfam/Pfam-A.full.gz'
    5.24 +options.outdir_name = '/srv/databases/pfam/test'
    5.25  
    5.26 -infile_name, = args
    5.27 -infile = gzip.open(infile_name)
    5.28 +re_range = re.compile(r'(-?\d+)-(-?\d+)')
    5.29 +def parse_dbxref(dbxref):
    5.30 +    db, pdb_chain, range, null = (part.strip() for part in dbxref.split(";"))
    5.31 +    assert db == "PDB", "DBxref not for PDB"
    5.32 +    assert null.strip() == "", "Too many fields in GF DB PDB"
    5.33 +    result = Struct()
    5.34 +    result.pdb_id, result.chain = pdb_chain.split()
    5.35 +    range = re_range.match(range)
    5.36 +    result.begin = int(range.group(1))
    5.37 +    result.end = int(range.group(2))
    5.38 +    result.len = result.end - result.begin
    5.39 +    return result
    5.40  
    5.41 -if name_by == "number":
    5.42 -    numbers = iter(itertools.count(1))
    5.43 -    def aln_name(aln):
    5.44 -        return "pfam_%05d.fasta" % numbers.next()
    5.45 -elif name_by == "ac-id":
    5.46 -    # Hack Biopython so that it contains some info from GF fields of stockholm
    5.47 -    # Class MultipleSeqAlignment is only created in StockholmIO once: when
    5.48 -    # parsing alignment file; we know locals() of function it is called from
    5.49 -    # variable 'gf' there contains dictionary of GF fields in the file
    5.50 -    class MultipleSeqAlignment(Align.MultipleSeqAlignment):
    5.51 -        def __init__(self, *args, **kw):
    5.52 -            f_locals = inspect.stack()[1][0].f_locals
    5.53 -            self.gf = f_locals['gf']
    5.54 -            self.ac, = self.gf['AC']
    5.55 -            self.id, = self.gf['ID']
    5.56 -            Align.MultipleSeqAlignment.__init__(self, *args, **kw)
    5.57 -    AlignIO.StockholmIO.MultipleSeqAlignment = MultipleSeqAlignment
    5.58 +def parse_pdb_codes(seq):
    5.59 +    codes = [parse_dbxref(dbxref)
    5.60 +        for dbxref in seq.dbxrefs if 'PDB' in dbxref]
    5.61 +    assert codes, "No PDB data"
    5.62 +    return list(reversed(sorted(codes, key=lambda r: r.len)))
    5.63  
    5.64 -    def aln_name(aln):
    5.65 -        return "pfam_%s_%s.fasta" % (aln.ac, aln.id)
    5.66 +def append_seq(aln, seq, pdb_codes):
    5.67 +    code = pdb_codes[0]
    5.68 +    name = '%s_%s' % (code.pdb_id, code.chain)
    5.69 +    description = '%s %s %s' % (
    5.70 +        seq.id,
    5.71 +        seq.description,
    5.72 +        [(c.pdb_id, c.chain, c.begin, c.end) for c in pdb_codes]
    5.73 +    )
    5.74 +    aln.append_row_from_string(
    5.75 +        str(seq.seq), name=name, description=description)
    5.76 +    return aln.sequences[-1]
    5.77  
    5.78 -for aln in AlignIO.parse(infile, "stockholm"):
    5.79 -    out_aln = Align.MultipleSeqAlignment([], aln._alphabet)
    5.80 +def clean_aln(aln):
    5.81 +    aln.remove_gap_columns()
    5.82 +    aln.sequences = [
    5.83 +        sequence
    5.84 +        for sequence in aln.sequences
    5.85 +        if len(sequence) >= options.min_width
    5.86 +    ]
    5.87 +    assert len(aln.sequences) >= options.min_depth, "Alignment too thin"
    5.88 +
    5.89 +def output_aln(aln):
    5.90 +    filename = "pfam_%s_%s.fasta" % (aln.ac, aln.id)
    5.91 +    with open(filename, "w") as outfile:
    5.92 +        aln.to_file(outfile, format="fasta")
    5.93 +        print "Wrote %s" % filename
    5.94 +
    5.95 +def process_aln(aln):
    5.96 +    out_aln = protein.Alignment()
    5.97 +    out_aln.ac = aln.ac
    5.98 +    out_aln.id = aln.id
    5.99      for seq in aln:
   5.100 -        pdbcodes = [xref for xref in seq.dbxrefs if 'PDB' in xref]
   5.101 -        if pdbcodes != []:
   5.102 -            pdb_id, pdb_chain = xref.split()[1:3]
   5.103 -            seq.description = "%s %s %s" % (seq.id, seq.description, pdbcodes)
   5.104 -            seq.id = "%s_%s" % (pdb_id, pdb_chain.rstrip(";"))
   5.105 -            out_aln.append(seq)
   5.106 -    if len(out_aln) == 0:
   5.107 -        print "Skipping empty alignment"
   5.108 -        continue
   5.109 -    outfile_name = aln_name(aln)
   5.110 -    print "Writing %s sequences to %s" % (len(out_aln), outfile_name)
   5.111 -    AlignIO.write(out_aln, outfile_name, "fasta")
   5.112 +        try:
   5.113 +            pdb_codes = parse_pdb_codes(seq)
   5.114 +            seq = append_seq(out_aln, seq, pdb_codes)
   5.115 +        except AssertionError:
   5.116 +            pass
   5.117 +    clean_aln(out_aln)
   5.118 +    output_aln(out_aln)
   5.119 +
   5.120 +def process_pfam():
   5.121 +    if not os.path.exists(options.outdir_name):
   5.122 +        os.mkdir(options.outdir_name)
   5.123 +    os.chdir(options.outdir_name)
   5.124 +    pfam = gzip.open(options.infile_name)
   5.125 +    for aln in AlignIO.parse(pfam, "stockholm"):
   5.126 +        try:
   5.127 +            process_aln(aln)
   5.128 +        except AssertionError, e:
   5.129 +            print "Skipping %s %s : %s" % (aln.ac, aln.id, e)
   5.130 +
   5.131 +# Hack Biopython so that it contains some info from GF fields of stockholm
   5.132 +# Class MultipleSeqAlignment is only created in StockholmIO once: when
   5.133 +# parsing alignment file; we know locals() of function it is called from
   5.134 +# variable 'gf' there contains dictionary of GF fields in the file
   5.135 +class MultipleSeqAlignment(Align.MultipleSeqAlignment):
   5.136 +    def __init__(self, *args, **kw):
   5.137 +        f_locals = inspect.stack()[1][0].f_locals
   5.138 +        self.gf = f_locals['gf']
   5.139 +        self.ac, = self.gf['AC']
   5.140 +        self.id, = self.gf['ID']
   5.141 +        Align.MultipleSeqAlignment.__init__(self, *args, **kw)
   5.142 +AlignIO.StockholmIO.MultipleSeqAlignment = MultipleSeqAlignment
   5.143 +
   5.144 +if __name__ == "__main__":
   5.145 +    process_pfam()
   5.146  
   5.147  # vim: set ts=4 sts=4 sw=4 et: