allpy

changeset 531:beae163bfdb9
Clean rewrite of mkcodes.py (helper script for allpy/data)
author: Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date: Mon, 28 Feb 2011 19:11:23 +0300
parents: 4fb490da2736
children: 86a633cfcf3c
files: allpy/data/codes_template.txt allpy/data/mkcodes.py
diffstat: 2 files changed, 122 insertions(+), 100 deletions(-) [+]
[-]

allpy/data/codes_template.txt 19
allpy/data/mkcodes.py 203
     1.1 --- a/allpy/data/codes_template.txt	Mon Feb 28 15:16:27 2011 +0300
     1.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.3 @@ -1,19 +0,0 @@
     1.4 -"""Tables of monomer codes.
     1.5 -
     1.6 -`dna`, `rna`, `protein` are lists of all known codes for monomers of given
     1.7 -type. Each of them is a list of tuples of kind:
     1.8 -
     1.9 -    ( 1-letter code, is-modified?, 3-letter-code, fullname )
    1.10 -
    1.11 -`3-letter-code` is the code used in PDB (it may actually be one or
    1.12 -two letters)
    1.13 -
    1.14 -"""
    1.15 -
    1.16 -protein = %(protein)s
    1.17 -
    1.18 -dna = %(dna)s
    1.19 -
    1.20 -rna = %(rna)s
    1.21 -
    1.22 -# vim: set et ts=4 sts=4 sw=4:

     2.1 --- a/allpy/data/mkcodes.py	Mon Feb 28 15:16:27 2011 +0300
     2.2 +++ b/allpy/data/mkcodes.py	Mon Feb 28 19:11:23 2011 +0300
     2.3 @@ -1,100 +1,141 @@
     2.4  #!/usr/bin/python
     2.5 +"""Parse components.cif for table of 1- and 3-letter monomer codes.
     2.6  
     2.7 +Produce codes.py.
     2.8 +"""
     2.9  import os
    2.10 -import argparse
    2.11 -from pprint import pformat
    2.12  
    2.13 -def rel(*x):
    2.14 -    return os.path.join(os.path.abspath(os.path.dirname(__file__)), *x)
    2.15 +self_path = os.path.abspath(os.path.dirname(__file__))
    2.16 +components_cif_path = os.path.join(self_path, 'components.cif')
    2.17 +codes_py_path = os.path.join(self_path, 'codes.py')
    2.18  
    2.19 -p = argparse.ArgumentParser(
    2.20 -description='Components.cif to codes.py converter',
    2.21 -epilog='',
    2.22 -formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    2.23 +file_template = '''"""Tables of monomer codes.
    2.24 +
    2.25 +`dna`, `rna`, `protein` are lists of all known codes for monomers of given
    2.26 +type. Each of them is a list of tuples of kind:
    2.27 +
    2.28 +    ( 1-letter code, is-modified?, 3-letter-code, fullname )
    2.29 +
    2.30 +`3-letter-code` is the code used in PDB (it may actually be one or
    2.31 +two letters)
    2.32 +
    2.33 +"""
    2.34 +
    2.35 +dna = (
    2.36 +%(dna)s
    2.37  )
    2.38  
    2.39 -r = argparse.FileType('r')
    2.40 -w = argparse.FileType('w')
    2.41 +protein = (
    2.42 +%(protein)s
    2.43 +)
    2.44  
    2.45 -p.add_argument('-v','--version',action='version',version='%(prog)s 1.0')
    2.46 -p.add_argument('-i',help='input components.cif',metavar='FILE',type=r,
    2.47 -    required=True)
    2.48 -p.add_argument('-o',help='output codes.py',metavar='FILE',type=w,
    2.49 -    default=rel('codes.py'))
    2.50 -p.add_argument('-t',help='Template for codes.py',metavar='FILE',type=r,
    2.51 -    default=rel('codes_template.txt'))
    2.52 +rna = (
    2.53 +%(rna)s
    2.54 +)
    2.55 +'''
    2.56  
    2.57 -try:
    2.58 -    args = p.parse_args()
    2.59 -except Exception, t:
    2.60 -    print t
    2.61 -    exit()
    2.62 +entry_template = """('%(code1)s', %(is_modified)s, "%(code3)s", "%(name)s"),"""
    2.63  
    2.64 -AAbank = {'ALA':'A', 'ARG':'R', 'ASN':'N', 'ASP':'D', 'CYS':'C',
    2.65 -        'GLN':'Q', 'GLU':'E', 'GLY':'G', 'HIS':'H', 'ILE':'I',
    2.66 -        'LEU':'L', 'LYS':'K', 'MET':'M', 'PHE':'F', 'PRO':'P',
    2.67 -        'SER':'S', 'THR':'T', 'TRP':'W', 'TYR':'Y', 'VAL':'V',
    2.68 -        'DA' :'A', 'DT' :'T', 'DG' :'G', 'DC' :'C',
    2.69 -        'A':  'A', 'U'  :'U', 'G'  :'G', 'C'  :'C'}
    2.70 +types = {
    2.71 +    'protein': [],
    2.72 +    'dna': [],
    2.73 +    'rna': [],
    2.74 +}
    2.75  
    2.76 -protein = []
    2.77 -dna = []
    2.78 -rna = []
    2.79 +unknown_code1 = {
    2.80 +    'protein': 'X',
    2.81 +    'dna': 'N',
    2.82 +    'rna': 'N',
    2.83 +}
    2.84  
    2.85 -def process_cif_entry(cif_entry):
    2.86 -    monomer_type = cif_entry['_chem_comp.type'].strip()
    2.87 -    if "PEPTIDE" in monomer_type:
    2.88 -        container = protein
    2.89 -    elif "DNA" in monomer_type:
    2.90 -        container = dna
    2.91 -    elif "RNA" in monomer_type:
    2.92 -        container = rna
    2.93 +unmodified_codes3 = set((
    2.94 +    "DA", "DC", "DG", "DT",
    2.95 +    "ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", "LEU",
    2.96 +    "MET", "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL", "TRP", "TYR",
    2.97 +    "A", "C", "G", "U",
    2.98 +))
    2.99 +
   2.100 +def main(options):
   2.101 +    parse_cif(options.input_cif, process_entry)
   2.102 +    add_entry(type='protein', code1='X', is_modified=False,
   2.103 +        code3="", name="Undefined Aminoacid")
   2.104 +    add_entry(type='dna', code1='N', is_modified=False,
   2.105 +        code3="", name="Undefined DNA Nucelotide")
   2.106 +    add_entry(type='rna', code1='N', is_modified=False,
   2.107 +        code3="", name="Undefined RNA Nucelotide")
   2.108 +    write_codes_py(options.output_py)
   2.109 +
   2.110 +def add_entry(**kw):
   2.111 +    types[kw['type']].append(entry_template % kw)
   2.112 +
   2.113 +def write_codes_py(file):
   2.114 +    data = {}
   2.115 +    for type, entries in types.items():
   2.116 +        data[type] = '\n'.join(sorted(entries))
   2.117 +    file.write(file_template % data)
   2.118 +
   2.119 +def process_entry(entry):
   2.120 +    monomer_type = entry['_chem_comp.type'].lower()
   2.121 +    code1 = entry['_chem_comp.one_letter_code'].upper()
   2.122 +    code3 = entry['_chem_comp.three_letter_code'].upper()
   2.123 +    name = entry.get('_chem_comp.name', '').upper().replace('"', "''")
   2.124 +    is_modified = True
   2.125 +
   2.126 +    if 'peptide' in monomer_type:
   2.127 +        type = 'protein'
   2.128 +    elif 'dna' in monomer_type:
   2.129 +        type = 'dna'
   2.130 +    elif 'rna' in monomer_type:
   2.131 +        type = 'rna'
   2.132      else:
   2.133          return
   2.134 -    code1 = cif_entry['_chem_comp.one_letter_code'].strip()
   2.135 -    if code1 == '?':
   2.136 -        parent = cif_entry['_chem_comp.mon_nstd_parent_comp_id']
   2.137 -        parent = parent.strip().upper()
   2.138 -        if parent in AAbank:
   2.139 -            code1 = AAbank[parent].lower()
   2.140 -        else:
   2.141 -            if container == protein:
   2.142 -                code1 = 'x'
   2.143 -            else:
   2.144 -                code1 = 'n'
   2.145 -    code3 = cif_entry['_chem_comp.three_letter_code'].strip().upper()
   2.146 -    name = cif_entry['_chem_comp.name'].strip()
   2.147 -    if name[0] == '"' and name[-1] == '"':
   2.148 -        name = name[1:-1]
   2.149 -    modified = code3 not in AAbank
   2.150 -    code1 = code1.lower() if modified else code1.upper()
   2.151 -    container.append((code1, modified, code3, name))
   2.152  
   2.153 -cif_entry = {}
   2.154 +    if code1 == "?":
   2.155 +        code1 = unknown_code1[type]
   2.156  
   2.157 -for line in args.i:
   2.158 -    line = line.strip()
   2.159 -    if 'data_' in line and cif_entry:
   2.160 -        try:
   2.161 -            process_cif_entry(cif_entry)
   2.162 -        except:
   2.163 -            pass
   2.164 -        cif_entry = {}
   2.165 -    else:
   2.166 -        key_value = line.split(' ', 1)
   2.167 -        if len(key_value) == 2:
   2.168 -            key, value = key_value
   2.169 -            cif_entry[key] = value
   2.170 +    if code3 in unmodified_codes3:
   2.171 +        is_modified = False
   2.172  
   2.173 -protein.append(('X', False, "", "Undefined Aminoacid"))
   2.174 -dna.append(('N', False, "", "Undefined DNA Nucelotide"))
   2.175 -rna.append(('N', False, "", "Undefined RNA Nucelotide"))
   2.176 +    if is_modified:
   2.177 +        code1 = code1.lower()
   2.178  
   2.179 -protein.sort()
   2.180 -dna.sort()
   2.181 -rna.sort()
   2.182 +    add_entry(**locals())
   2.183  
   2.184 -template = args.t.read()
   2.185 -args.o.write(template % {'protein': pformat(protein, width=1024),
   2.186 -    'dna': pformat(dna, width=1024),
   2.187 -    'rna': pformat(rna, width=1024)})
   2.188 +def parse_cif(file, callback):
   2.189 +    entry = None
   2.190 +    key = None
   2.191 +    for line_no, line in enumerate(file, 1):
   2.192 +        line = line.strip()
   2.193 +        if line.startswith("data_"):
   2.194 +            if entry is not None:
   2.195 +                callback(entry)
   2.196 +            entry = {}
   2.197 +        elif line.startswith("_"):
   2.198 +            line_split = line.split(" ", 1)
   2.199 +            key = line_split[0].strip().lower()
   2.200 +            entry[key] = ""
   2.201 +            if " " in line:
   2.202 +                value = line_split[1].strip().strip('\'\"')
   2.203 +                entry[key] = value
   2.204 +        elif not line.startswith("#"):
   2.205 +            entry[key] += line.lstrip(";").strip().strip('\'\"')
   2.206 +    callback(entry)
   2.207 +
   2.208 +if __name__ == "__main__":
   2.209 +    import optparse
   2.210 +    parser = optparse.OptionParser()
   2.211 +    parser.add_option('-i', '--input-cif',
   2.212 +        help="input components.cif or aa-variants.cif",
   2.213 +        default=components_cif_path)
   2.214 +    parser.add_option('-o', '--output-py',
   2.215 +        help="output codes.py file",
   2.216 +        default=codes_py_path)
   2.217 +    options, args = parser.parse_args()
   2.218 +
   2.219 +    try:
   2.220 +        assert args == [], "No positional arguments are accepted"
   2.221 +        options.input_cif = open(options.input_cif)
   2.222 +        options.output_py = open(options.output_py, "w")
   2.223 +        main(options)
   2.224 +    except TypeError, e:
   2.225 +        parser.error(e)