allpy
changeset 726:dc3e9ff89ab5
Merge
author | Boris Burkov <BurkovBA@gmail.com> |
---|---|
date | Fri, 08 Jul 2011 18:38:31 +0400 |
parents | 0a7917b4ca71 890fc647f69c |
children | 633a961e7586 3c09dabe0e96 |
files | |
diffstat | 5 files changed, 122 insertions(+), 40 deletions(-) [+] |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/docs/source/allpy/data.monomers.rst Fri Jul 08 18:38:31 2011 +0400 1.3 @@ -0,0 +1,5 @@ 1.4 +Monomers Documentation 1.5 +====================== 1.6 + 1.7 +This page contains the Monomers Package documentation. 1.8 +
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 2.2 +++ b/docs/source/allpy/homology.rst Fri Jul 08 18:38:31 2011 +0400 2.3 @@ -0,0 +1,12 @@ 2.4 +Homology Documentation 2.5 +====================== 2.6 + 2.7 +This page contains the Homology Module documentation. 2.8 + 2.9 +The :mod:`homology` Module 2.10 +-------------------------- 2.11 + 2.12 +.. automodule:: allpy.homology 2.13 + :members: 2.14 + :undoc-members: 2.15 + :show-inheritance:
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/docs/source/allpy/markups.rst Fri Jul 08 18:38:31 2011 +0400 3.3 @@ -0,0 +1,12 @@ 3.4 +Markups Documentation 3.5 +===================== 3.6 + 3.7 +This page contains the Markups Module documentation. 3.8 + 3.9 +The :mod:`markups` Module 3.10 +------------------------- 3.11 + 3.12 +.. automodule:: allpy.markups 3.13 + :members: 3.14 + :undoc-members: 3.15 + :show-inheritance:
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 4.2 +++ b/test/Makefile Fri Jul 08 18:38:31 2011 +0400 4.3 @@ -0,0 +1,2 @@ 4.4 +all: 4.5 + $(MAKE) -C .. tests
5.1 --- a/utils/extract_pfam.py Fri Jul 08 18:30:04 2011 +0400 5.2 +++ b/utils/extract_pfam.py Fri Jul 08 18:38:31 2011 +0400 5.3 @@ -1,53 +1,104 @@ 5.4 #!/usr/bin/python 5.5 -import optparse 5.6 import gzip 5.7 -import itertools 5.8 import inspect 5.9 +import os 5.10 +import re 5.11 from Bio import AlignIO, Align 5.12 +from allpy import protein 5.13 5.14 -name_by = "ac-id" 5.15 +class Struct(object): 5.16 + pass 5.17 5.18 -parser = optparse.OptionParser() 5.19 -options, args = parser.parse_args() 5.20 +options = Struct() 5.21 +options.min_depth = 2 5.22 +options.min_width = 20 5.23 +options.infile_name = '/srv/databases/pfam/Pfam-A.full.gz' 5.24 +options.outdir_name = '/srv/databases/pfam/test' 5.25 5.26 -infile_name, = args 5.27 -infile = gzip.open(infile_name) 5.28 +re_range = re.compile(r'(-?\d+)-(-?\d+)') 5.29 +def parse_dbxref(dbxref): 5.30 + db, pdb_chain, range, null = (part.strip() for part in dbxref.split(";")) 5.31 + assert db == "PDB", "DBxref not for PDB" 5.32 + assert null.strip() == "", "Too many fields in GF DB PDB" 5.33 + result = Struct() 5.34 + result.pdb_id, result.chain = pdb_chain.split() 5.35 + range = re_range.match(range) 5.36 + result.begin = int(range.group(1)) 5.37 + result.end = int(range.group(2)) 5.38 + result.len = result.end - result.begin 5.39 + return result 5.40 5.41 -if name_by == "number": 5.42 - numbers = iter(itertools.count(1)) 5.43 - def aln_name(aln): 5.44 - return "pfam_%05d.fasta" % numbers.next() 5.45 -elif name_by == "ac-id": 5.46 - # Hack Biopython so that it contains some info from GF fields of stockholm 5.47 - # Class MultipleSeqAlignment is only created in StockholmIO once: when 5.48 - # parsing alignment file; we know locals() of function it is called from 5.49 - # variable 'gf' there contains dictionary of GF fields in the file 5.50 - class MultipleSeqAlignment(Align.MultipleSeqAlignment): 5.51 - def __init__(self, *args, **kw): 5.52 - f_locals = inspect.stack()[1][0].f_locals 5.53 - self.gf = f_locals['gf'] 5.54 - self.ac, = self.gf['AC'] 5.55 - self.id, = self.gf['ID'] 5.56 - Align.MultipleSeqAlignment.__init__(self, *args, **kw) 5.57 - AlignIO.StockholmIO.MultipleSeqAlignment = MultipleSeqAlignment 5.58 +def parse_pdb_codes(seq): 5.59 + codes = [parse_dbxref(dbxref) 5.60 + for dbxref in seq.dbxrefs if 'PDB' in dbxref] 5.61 + assert codes, "No PDB data" 5.62 + return list(reversed(sorted(codes, key=lambda r: r.len))) 5.63 5.64 - def aln_name(aln): 5.65 - return "pfam_%s_%s.fasta" % (aln.ac, aln.id) 5.66 +def append_seq(aln, seq, pdb_codes): 5.67 + code = pdb_codes[0] 5.68 + name = '%s_%s' % (code.pdb_id, code.chain) 5.69 + description = '%s %s %s' % ( 5.70 + seq.id, 5.71 + seq.description, 5.72 + [(c.pdb_id, c.chain, c.begin, c.end) for c in pdb_codes] 5.73 + ) 5.74 + aln.append_row_from_string( 5.75 + str(seq.seq), name=name, description=description) 5.76 + return aln.sequences[-1] 5.77 5.78 -for aln in AlignIO.parse(infile, "stockholm"): 5.79 - out_aln = Align.MultipleSeqAlignment([], aln._alphabet) 5.80 +def clean_aln(aln): 5.81 + aln.remove_gap_columns() 5.82 + aln.sequences = [ 5.83 + sequence 5.84 + for sequence in aln.sequences 5.85 + if len(sequence) >= options.min_width 5.86 + ] 5.87 + assert len(aln.sequences) >= options.min_depth, "Alignment too thin" 5.88 + 5.89 +def output_aln(aln): 5.90 + filename = "pfam_%s_%s.fasta" % (aln.ac, aln.id) 5.91 + with open(filename, "w") as outfile: 5.92 + aln.to_file(outfile, format="fasta") 5.93 + print "Wrote %s" % filename 5.94 + 5.95 +def process_aln(aln): 5.96 + out_aln = protein.Alignment() 5.97 + out_aln.ac = aln.ac 5.98 + out_aln.id = aln.id 5.99 for seq in aln: 5.100 - pdbcodes = [xref for xref in seq.dbxrefs if 'PDB' in xref] 5.101 - if pdbcodes != []: 5.102 - pdb_id, pdb_chain = xref.split()[1:3] 5.103 - seq.description = "%s %s %s" % (seq.id, seq.description, pdbcodes) 5.104 - seq.id = "%s_%s" % (pdb_id, pdb_chain.rstrip(";")) 5.105 - out_aln.append(seq) 5.106 - if len(out_aln) == 0: 5.107 - print "Skipping empty alignment" 5.108 - continue 5.109 - outfile_name = aln_name(aln) 5.110 - print "Writing %s sequences to %s" % (len(out_aln), outfile_name) 5.111 - AlignIO.write(out_aln, outfile_name, "fasta") 5.112 + try: 5.113 + pdb_codes = parse_pdb_codes(seq) 5.114 + seq = append_seq(out_aln, seq, pdb_codes) 5.115 + except AssertionError: 5.116 + pass 5.117 + clean_aln(out_aln) 5.118 + output_aln(out_aln) 5.119 + 5.120 +def process_pfam(): 5.121 + if not os.path.exists(options.outdir_name): 5.122 + os.mkdir(options.outdir_name) 5.123 + os.chdir(options.outdir_name) 5.124 + pfam = gzip.open(options.infile_name) 5.125 + for aln in AlignIO.parse(pfam, "stockholm"): 5.126 + try: 5.127 + process_aln(aln) 5.128 + except AssertionError, e: 5.129 + print "Skipping %s %s : %s" % (aln.ac, aln.id, e) 5.130 + 5.131 +# Hack Biopython so that it contains some info from GF fields of stockholm 5.132 +# Class MultipleSeqAlignment is only created in StockholmIO once: when 5.133 +# parsing alignment file; we know locals() of function it is called from 5.134 +# variable 'gf' there contains dictionary of GF fields in the file 5.135 +class MultipleSeqAlignment(Align.MultipleSeqAlignment): 5.136 + def __init__(self, *args, **kw): 5.137 + f_locals = inspect.stack()[1][0].f_locals 5.138 + self.gf = f_locals['gf'] 5.139 + self.ac, = self.gf['AC'] 5.140 + self.id, = self.gf['ID'] 5.141 + Align.MultipleSeqAlignment.__init__(self, *args, **kw) 5.142 +AlignIO.StockholmIO.MultipleSeqAlignment = MultipleSeqAlignment 5.143 + 5.144 +if __name__ == "__main__": 5.145 + process_pfam() 5.146 5.147 # vim: set ts=4 sts=4 sw=4 et: