allpy
changeset 531:beae163bfdb9
Clean rewrite of mkcodes.py (helper script for allpy/data)
field | value |
---|---|
author | Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru> |
date | Mon, 28 Feb 2011 19:11:23 +0300 |
parents | 4fb490da2736 |
children | 86a633cfcf3c |
files | allpy/data/codes_template.txt allpy/data/mkcodes.py |
diffstat | 2 files changed, 122 insertions(+), 100 deletions(-) |
line diff
1.1 --- a/allpy/data/codes_template.txt Mon Feb 28 15:16:27 2011 +0300 1.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 1.3 @@ -1,19 +0,0 @@ 1.4 -"""Tables of monomer codes. 1.5 - 1.6 -`dna`, `rna`, `protein` are lists of all known codes for monomers of given 1.7 -type. Each of them is a list of tuples of kind: 1.8 - 1.9 - ( 1-letter code, is-modified?, 3-letter-code, fullname ) 1.10 - 1.11 -`3-letter-code` is the code used in PDB (it may actually be one or 1.12 -two letters) 1.13 - 1.14 -""" 1.15 - 1.16 -protein = %(protein)s 1.17 - 1.18 -dna = %(dna)s 1.19 - 1.20 -rna = %(rna)s 1.21 - 1.22 -# vim: set et ts=4 sts=4 sw=4:
2.1 --- a/allpy/data/mkcodes.py Mon Feb 28 15:16:27 2011 +0300 2.2 +++ b/allpy/data/mkcodes.py Mon Feb 28 19:11:23 2011 +0300 2.3 @@ -1,100 +1,141 @@ 2.4 #!/usr/bin/python 2.5 +"""Parse components.cif for table of 1- and 3-letter monomer codes. 2.6 2.7 +Produce codes.py. 2.8 +""" 2.9 import os 2.10 -import argparse 2.11 -from pprint import pformat 2.12 2.13 -def rel(*x): 2.14 - return os.path.join(os.path.abspath(os.path.dirname(__file__)), *x) 2.15 +self_path = os.path.abspath(os.path.dirname(__file__)) 2.16 +components_cif_path = os.path.join(self_path, 'components.cif') 2.17 +codes_py_path = os.path.join(self_path, 'codes.py') 2.18 2.19 -p = argparse.ArgumentParser( 2.20 -description='Components.cif to codes.py converter', 2.21 -epilog='', 2.22 -formatter_class=argparse.ArgumentDefaultsHelpFormatter, 2.23 +file_template = '''"""Tables of monomer codes. 2.24 + 2.25 +`dna`, `rna`, `protein` are lists of all known codes for monomers of given 2.26 +type. Each of them is a list of tuples of kind: 2.27 + 2.28 + ( 1-letter code, is-modified?, 3-letter-code, fullname ) 2.29 + 2.30 +`3-letter-code` is the code used in PDB (it may actually be one or 2.31 +two letters) 2.32 + 2.33 +""" 2.34 + 2.35 +dna = ( 2.36 +%(dna)s 2.37 ) 2.38 2.39 -r = argparse.FileType('r') 2.40 -w = argparse.FileType('w') 2.41 +protein = ( 2.42 +%(protein)s 2.43 +) 2.44 2.45 -p.add_argument('-v','--version',action='version',version='%(prog)s 1.0') 2.46 -p.add_argument('-i',help='input components.cif',metavar='FILE',type=r, 2.47 - required=True) 2.48 -p.add_argument('-o',help='output codes.py',metavar='FILE',type=w, 2.49 - default=rel('codes.py')) 2.50 -p.add_argument('-t',help='Template for codes.py',metavar='FILE',type=r, 2.51 - default=rel('codes_template.txt')) 2.52 +rna = ( 2.53 +%(rna)s 2.54 +) 2.55 +''' 2.56 2.57 -try: 2.58 - args = p.parse_args() 2.59 -except Exception, t: 2.60 - print t 2.61 - exit() 2.62 +entry_template = """('%(code1)s', %(is_modified)s, "%(code3)s", "%(name)s"),""" 2.63 
2.64 -AAbank = {'ALA':'A', 'ARG':'R', 'ASN':'N', 'ASP':'D', 'CYS':'C', 2.65 - 'GLN':'Q', 'GLU':'E', 'GLY':'G', 'HIS':'H', 'ILE':'I', 2.66 - 'LEU':'L', 'LYS':'K', 'MET':'M', 'PHE':'F', 'PRO':'P', 2.67 - 'SER':'S', 'THR':'T', 'TRP':'W', 'TYR':'Y', 'VAL':'V', 2.68 - 'DA' :'A', 'DT' :'T', 'DG' :'G', 'DC' :'C', 2.69 - 'A': 'A', 'U' :'U', 'G' :'G', 'C' :'C'} 2.70 +types = { 2.71 + 'protein': [], 2.72 + 'dna': [], 2.73 + 'rna': [], 2.74 +} 2.75 2.76 -protein = [] 2.77 -dna = [] 2.78 -rna = [] 2.79 +unknown_code1 = { 2.80 + 'protein': 'X', 2.81 + 'dna': 'N', 2.82 + 'rna': 'N', 2.83 +} 2.84 2.85 -def process_cif_entry(cif_entry): 2.86 - monomer_type = cif_entry['_chem_comp.type'].strip() 2.87 - if "PEPTIDE" in monomer_type: 2.88 - container = protein 2.89 - elif "DNA" in monomer_type: 2.90 - container = dna 2.91 - elif "RNA" in monomer_type: 2.92 - container = rna 2.93 +unmodified_codes3 = set(( 2.94 + "DA", "DC", "DG", "DT", 2.95 + "ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", "LEU", 2.96 + "MET", "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL", "TRP", "TYR", 2.97 + "A", "C", "G", "U", 2.98 +)) 2.99 + 2.100 +def main(options): 2.101 + parse_cif(options.input_cif, process_entry) 2.102 + add_entry(type='protein', code1='X', is_modified=False, 2.103 + code3="", name="Undefined Aminoacid") 2.104 + add_entry(type='dna', code1='N', is_modified=False, 2.105 + code3="", name="Undefined DNA Nucelotide") 2.106 + add_entry(type='rna', code1='N', is_modified=False, 2.107 + code3="", name="Undefined RNA Nucelotide") 2.108 + write_codes_py(options.output_py) 2.109 + 2.110 +def add_entry(**kw): 2.111 + types[kw['type']].append(entry_template % kw) 2.112 + 2.113 +def write_codes_py(file): 2.114 + data = {} 2.115 + for type, entries in types.items(): 2.116 + data[type] = '\n'.join(sorted(entries)) 2.117 + file.write(file_template % data) 2.118 + 2.119 +def process_entry(entry): 2.120 + monomer_type = entry['_chem_comp.type'].lower() 2.121 + code1 = 
entry['_chem_comp.one_letter_code'].upper() 2.122 + code3 = entry['_chem_comp.three_letter_code'].upper() 2.123 + name = entry.get('_chem_comp.name', '').upper().replace('"', "''") 2.124 + is_modified = True 2.125 + 2.126 + if 'peptide' in monomer_type: 2.127 + type = 'protein' 2.128 + elif 'dna' in monomer_type: 2.129 + type = 'dna' 2.130 + elif 'rna' in monomer_type: 2.131 + type = 'rna' 2.132 else: 2.133 return 2.134 - code1 = cif_entry['_chem_comp.one_letter_code'].strip() 2.135 - if code1 == '?': 2.136 - parent = cif_entry['_chem_comp.mon_nstd_parent_comp_id'] 2.137 - parent = parent.strip().upper() 2.138 - if parent in AAbank: 2.139 - code1 = AAbank[parent].lower() 2.140 - else: 2.141 - if container == protein: 2.142 - code1 = 'x' 2.143 - else: 2.144 - code1 = 'n' 2.145 - code3 = cif_entry['_chem_comp.three_letter_code'].strip().upper() 2.146 - name = cif_entry['_chem_comp.name'].strip() 2.147 - if name[0] == '"' and name[-1] == '"': 2.148 - name = name[1:-1] 2.149 - modified = code3 not in AAbank 2.150 - code1 = code1.lower() if modified else code1.upper() 2.151 - container.append((code1, modified, code3, name)) 2.152 2.153 -cif_entry = {} 2.154 + if code1 == "?": 2.155 + code1 = unknown_code1[type] 2.156 2.157 -for line in args.i: 2.158 - line = line.strip() 2.159 - if 'data_' in line and cif_entry: 2.160 - try: 2.161 - process_cif_entry(cif_entry) 2.162 - except: 2.163 - pass 2.164 - cif_entry = {} 2.165 - else: 2.166 - key_value = line.split(' ', 1) 2.167 - if len(key_value) == 2: 2.168 - key, value = key_value 2.169 - cif_entry[key] = value 2.170 + if code3 in unmodified_codes3: 2.171 + is_modified = False 2.172 2.173 -protein.append(('X', False, "", "Undefined Aminoacid")) 2.174 -dna.append(('N', False, "", "Undefined DNA Nucelotide")) 2.175 -rna.append(('N', False, "", "Undefined RNA Nucelotide")) 2.176 + if is_modified: 2.177 + code1 = code1.lower() 2.178 2.179 -protein.sort() 2.180 -dna.sort() 2.181 -rna.sort() 2.182 + add_entry(**locals()) 2.183 
2.184 -template = args.t.read() 2.185 -args.o.write(template % {'protein': pformat(protein, width=1024), 2.186 - 'dna': pformat(dna, width=1024), 2.187 - 'rna': pformat(rna, width=1024)}) 2.188 +def parse_cif(file, callback): 2.189 + entry = None 2.190 + key = None 2.191 + for line_no, line in enumerate(file, 1): 2.192 + line = line.strip() 2.193 + if line.startswith("data_"): 2.194 + if entry is not None: 2.195 + callback(entry) 2.196 + entry = {} 2.197 + elif line.startswith("_"): 2.198 + line_split = line.split(" ", 1) 2.199 + key = line_split[0].strip().lower() 2.200 + entry[key] = "" 2.201 + if " " in line: 2.202 + value = line_split[1].strip().strip('\'\"') 2.203 + entry[key] = value 2.204 + elif not line.startswith("#"): 2.205 + entry[key] += line.lstrip(";").strip().strip('\'\"') 2.206 + callback(entry) 2.207 + 2.208 +if __name__ == "__main__": 2.209 + import optparse 2.210 + parser = optparse.OptionParser() 2.211 + parser.add_option('-i', '--input-cif', 2.212 + help="input components.cif or aa-variants.cif", 2.213 + default=components_cif_path) 2.214 + parser.add_option('-o', '--output-py', 2.215 + help="output codes.py file", 2.216 + default=codes_py_path) 2.217 + options, args = parser.parse_args() 2.218 + 2.219 + try: 2.220 + assert args == [], "No positional arguments are accepted" 2.221 + options.input_cif = open(options.input_cif) 2.222 + options.output_py = open(options.output_py, "w") 2.223 + main(options) 2.224 + except TypeError, e: 2.225 + parser.error(e)