allpy
changeset 718:49d1ba7072a8
Restructured rewrite of extract_pfam.py
- every small step is a function now
- it extracts pdb residue range information
- it handles negative residue numbers well
author | Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru> |
---|---|
date | Fri, 08 Jul 2011 14:47:23 +0400 |
parents | 0377cd9ac4e6 |
children | 3bb5c96d02b1 |
files | utils/extract_pfam.py |
diffstat | 1 files changed, 91 insertions(+), 40 deletions(-) [+] |
line diff
1.1 --- a/utils/extract_pfam.py Fri Jul 08 12:09:39 2011 +0400 1.2 +++ b/utils/extract_pfam.py Fri Jul 08 14:47:23 2011 +0400 1.3 @@ -1,53 +1,104 @@ 1.4 #!/usr/bin/python 1.5 -import optparse 1.6 import gzip 1.7 -import itertools 1.8 import inspect 1.9 +import os 1.10 +import re 1.11 from Bio import AlignIO, Align 1.12 +from allpy import protein 1.13 1.14 -name_by = "ac-id" 1.15 +class Struct(object): 1.16 + pass 1.17 1.18 -parser = optparse.OptionParser() 1.19 -options, args = parser.parse_args() 1.20 +options = Struct() 1.21 +options.min_depth = 2 1.22 +options.min_width = 20 1.23 +options.infile_name = '/srv/databases/pfam/Pfam-A.full.gz' 1.24 +options.outdir_name = '/srv/databases/pfam/test' 1.25 1.26 -infile_name, = args 1.27 -infile = gzip.open(infile_name) 1.28 +re_range = re.compile(r'(-?\d+)-(-?\d+)') 1.29 +def parse_dbxref(dbxref): 1.30 + db, pdb_chain, range, null = (part.strip() for part in dbxref.split(";")) 1.31 + assert db == "PDB", "DBxref not for PDB" 1.32 + assert null.strip() == "", "Too many fields in GF DB PDB" 1.33 + result = Struct() 1.34 + result.pdb_id, result.chain = pdb_chain.split() 1.35 + range = re_range.match(range) 1.36 + result.begin = int(range.group(1)) 1.37 + result.end = int(range.group(2)) 1.38 + result.len = result.end - result.begin 1.39 + return result 1.40 1.41 -if name_by == "number": 1.42 - numbers = iter(itertools.count(1)) 1.43 - def aln_name(aln): 1.44 - return "pfam_%05d.fasta" % numbers.next() 1.45 -elif name_by == "ac-id": 1.46 - # Hack Biopython so that it contains some info from GF fields of stockholm 1.47 - # Class MultipleSeqAlignment is only created in StockholmIO once: when 1.48 - # parsing alignment file; we know locals() of function it is called from 1.49 - # variable 'gf' there contains dictionary of GF fields in the file 1.50 - class MultipleSeqAlignment(Align.MultipleSeqAlignment): 1.51 - def __init__(self, *args, **kw): 1.52 - f_locals = inspect.stack()[1][0].f_locals 1.53 - self.gf = f_locals['gf'] 1.54 - self.ac, = self.gf['AC'] 1.55 - self.id, = self.gf['ID'] 1.56 - Align.MultipleSeqAlignment.__init__(self, *args, **kw) 1.57 - AlignIO.StockholmIO.MultipleSeqAlignment = MultipleSeqAlignment 1.58 +def parse_pdb_codes(seq): 1.59 + codes = [parse_dbxref(dbxref) 1.60 + for dbxref in seq.dbxrefs if 'PDB' in dbxref] 1.61 + assert codes, "No PDB data" 1.62 + return list(reversed(sorted(codes, key=lambda r: r.len))) 1.63 1.64 - def aln_name(aln): 1.65 - return "pfam_%s_%s.fasta" % (aln.ac, aln.id) 1.66 +def append_seq(aln, seq, pdb_codes): 1.67 + code = pdb_codes[0] 1.68 + name = '%s_%s' % (code.pdb_id, code.chain) 1.69 + description = '%s %s %s' % ( 1.70 + seq.id, 1.71 + seq.description, 1.72 + [(c.pdb_id, c.chain, c.begin, c.end) for c in pdb_codes] 1.73 + ) 1.74 + aln.append_row_from_string( 1.75 + str(seq.seq), name=name, description=description) 1.76 + return aln.sequences[-1] 1.77 1.78 -for aln in AlignIO.parse(infile, "stockholm"): 1.79 - out_aln = Align.MultipleSeqAlignment([], aln._alphabet) 1.80 +def clean_aln(aln): 1.81 + aln.remove_gap_columns() 1.82 + aln.sequences = [ 1.83 + sequence 1.84 + for sequence in aln.sequences 1.85 + if len(sequence) >= options.min_width 1.86 + ] 1.87 + assert len(aln.sequences) >= options.min_depth, "Alignment too thin" 1.88 + 1.89 +def output_aln(aln): 1.90 + filename = "pfam_%s_%s.fasta" % (aln.ac, aln.id) 1.91 + with open(filename, "w") as outfile: 1.92 + aln.to_file(outfile, format="fasta") 1.93 + print "Wrote %s" % filename 1.94 + 1.95 +def process_aln(aln): 1.96 + out_aln = protein.Alignment() 1.97 + out_aln.ac = aln.ac 1.98 + out_aln.id = aln.id 1.99 for seq in aln: 1.100 - pdbcodes = [xref for xref in seq.dbxrefs if 'PDB' in xref] 1.101 - if pdbcodes != []: 1.102 - pdb_id, pdb_chain = xref.split()[1:3] 1.103 - seq.description = "%s %s %s" % (seq.id, seq.description, pdbcodes) 1.104 - seq.id = "%s_%s" % (pdb_id, pdb_chain.rstrip(";")) 1.105 - out_aln.append(seq) 1.106 - if len(out_aln) == 0: 1.107 - print "Skipping empty alignment" 1.108 - continue 1.109 - outfile_name = aln_name(aln) 1.110 - print "Writing %s sequences to %s" % (len(out_aln), outfile_name) 1.111 - AlignIO.write(out_aln, outfile_name, "fasta") 1.112 + try: 1.113 + pdb_codes = parse_pdb_codes(seq) 1.114 + seq = append_seq(out_aln, seq, pdb_codes) 1.115 + except AssertionError: 1.116 + pass 1.117 + clean_aln(out_aln) 1.118 + output_aln(out_aln) 1.119 + 1.120 +def process_pfam(): 1.121 + if not os.path.exists(options.outdir_name): 1.122 + os.mkdir(options.outdir_name) 1.123 + os.chdir(options.outdir_name) 1.124 + pfam = gzip.open(options.infile_name) 1.125 + for aln in AlignIO.parse(pfam, "stockholm"): 1.126 + try: 1.127 + process_aln(aln) 1.128 + except AssertionError, e: 1.129 + print "Skipping %s %s : %s" % (aln.ac, aln.id, e) 1.130 + 1.131 +# Hack Biopython so that it contains some info from GF fields of stockholm 1.132 +# Class MultipleSeqAlignment is only created in StockholmIO once: when 1.133 +# parsing alignment file; we know locals() of function it is called from 1.134 +# variable 'gf' there contains dictionary of GF fields in the file 1.135 +class MultipleSeqAlignment(Align.MultipleSeqAlignment): 1.136 + def __init__(self, *args, **kw): 1.137 + f_locals = inspect.stack()[1][0].f_locals 1.138 + self.gf = f_locals['gf'] 1.139 + self.ac, = self.gf['AC'] 1.140 + self.id, = self.gf['ID'] 1.141 + Align.MultipleSeqAlignment.__init__(self, *args, **kw) 1.142 +AlignIO.StockholmIO.MultipleSeqAlignment = MultipleSeqAlignment 1.143 + 1.144 +if __name__ == "__main__": 1.145 + process_pfam() 1.146 1.147 # vim: set ts=4 sts=4 sw=4 et: