Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/raw-rev/beae163bfdb9
Дата изменения: Unknown
Дата индексирования: Tue Oct 2 07:55:07 2012
Кодировка:

# HG changeset patch
# User Daniil Alexeyevsky
# Date 1298909483 -10800
# Node ID beae163bfdb94a68fd65ba741c72b859863d2349
# Parent 4fb490da27368ab7e37fce065e8ab386e82f15fc
Clean rewrite of mkcodes.py (helper script for allpy/data)

diff -r 4fb490da2736 -r beae163bfdb9 allpy/data/codes_template.txt
--- a/allpy/data/codes_template.txt Mon Feb 28 15:16:27 2011 +0300
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,19 +0,0 @@
-"""Tables of monomer codes.
-
-`dna`, `rna`, `protein` are lists of all known codes for monomers of given
-type. Each of them is a list of tuples of kind:
-
- ( 1-letter code, is-modified?, 3-letter-code, fullname )
-
-`3-letter-code` is the code used in PDB (it may actually be one or
-two letters)
-
-"""
-
-protein = %(protein)s
-
-dna = %(dna)s
-
-rna = %(rna)s
-
-# vim: set et ts=4 sts=4 sw=4:
diff -r 4fb490da2736 -r beae163bfdb9 allpy/data/mkcodes.py
--- a/allpy/data/mkcodes.py Mon Feb 28 15:16:27 2011 +0300
+++ b/allpy/data/mkcodes.py Mon Feb 28 19:11:23 2011 +0300
@@ -1,100 +1,141 @@
#!/usr/bin/python
+"""Parse components.cif for table of 1- and 3-letter monomer codes.

+Produce codes.py.
+"""
import os
-import argparse
-from pprint import pformat

-def rel(*x):
- return os.path.join(os.path.abspath(os.path.dirname(__file__)), *x)
+self_path = os.path.abspath(os.path.dirname(__file__))
+components_cif_path = os.path.join(self_path, 'components.cif')
+codes_py_path = os.path.join(self_path, 'codes.py')

-p = argparse.ArgumentParser(
-description='Components.cif to codes.py converter',
-epilog='',
-formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+file_template = '''"""Tables of monomer codes.
+
+`dna`, `rna`, `protein` are lists of all known codes for monomers of given
+type. Each of them is a list of tuples of kind:
+
+ ( 1-letter code, is-modified?, 3-letter-code, fullname )
+
+`3-letter-code` is the code used in PDB (it may actually be one or
+two letters)
+
+"""
+
+dna = (
+%(dna)s
)

-r = argparse.FileType('r')
-w = argparse.FileType('w')
+protein = (
+%(protein)s
+)

-p.add_argument('-v','--version',action='version',version='%(prog)s 1.0')
-p.add_argument('-i',help='input components.cif',metavar='FILE',type=r,
- required=True)
-p.add_argument('-o',help='output codes.py',metavar='FILE',type=w,
- default=rel('codes.py'))
-p.add_argument('-t',help='Template for codes.py',metavar='FILE',type=r,
- default=rel('codes_template.txt'))
+rna = (
+%(rna)s
+)
+'''

-try:
- args = p.parse_args()
-except Exception, t:
- print t
- exit()
+entry_template = """('%(code1)s', %(is_modified)s, "%(code3)s", "%(name)s"),"""

-AAbank = {'ALA':'A', 'ARG':'R', 'ASN':'N', 'ASP':'D', 'CYS':'C',
- 'GLN':'Q', 'GLU':'E', 'GLY':'G', 'HIS':'H', 'ILE':'I',
- 'LEU':'L', 'LYS':'K', 'MET':'M', 'PHE':'F', 'PRO':'P',
- 'SER':'S', 'THR':'T', 'TRP':'W', 'TYR':'Y', 'VAL':'V',
- 'DA' :'A', 'DT' :'T', 'DG' :'G', 'DC' :'C',
- 'A': 'A', 'U' :'U', 'G' :'G', 'C' :'C'}
+types = {
+ 'protein': [],
+ 'dna': [],
+ 'rna': [],
+}

-protein = []
-dna = []
-rna = []
+unknown_code1 = {
+ 'protein': 'X',
+ 'dna': 'N',
+ 'rna': 'N',
+}

-def process_cif_entry(cif_entry):
- monomer_type = cif_entry['_chem_comp.type'].strip()
- if "PEPTIDE" in monomer_type:
- container = protein
- elif "DNA" in monomer_type:
- container = dna
- elif "RNA" in monomer_type:
- container = rna
+unmodified_codes3 = set((
+ "DA", "DC", "DG", "DT",
+ "ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", "LEU",
+ "MET", "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL", "TRP", "TYR",
+ "A", "C", "G", "U",
+))
+
+def main(options):
+ parse_cif(options.input_cif, process_entry)
+ add_entry(type='protein', code1='X', is_modified=False,
+ code3="", name="Undefined Aminoacid")
+ add_entry(type='dna', code1='N', is_modified=False,
+ code3="", name="Undefined DNA Nucelotide")
+ add_entry(type='rna', code1='N', is_modified=False,
+ code3="", name="Undefined RNA Nucelotide")
+ write_codes_py(options.output_py)
+
+def add_entry(**kw):
+ types[kw['type']].append(entry_template % kw)
+
+def write_codes_py(file):
+ data = {}
+ for type, entries in types.items():
+ data[type] = '\n'.join(sorted(entries))
+ file.write(file_template % data)
+
+def process_entry(entry):
+ monomer_type = entry['_chem_comp.type'].lower()
+ code1 = entry['_chem_comp.one_letter_code'].upper()
+ code3 = entry['_chem_comp.three_letter_code'].upper()
+ name = entry.get('_chem_comp.name', '').upper().replace('"', "''")
+ is_modified = True
+
+ if 'peptide' in monomer_type:
+ type = 'protein'
+ elif 'dna' in monomer_type:
+ type = 'dna'
+ elif 'rna' in monomer_type:
+ type = 'rna'
else:
return
- code1 = cif_entry['_chem_comp.one_letter_code'].strip()
- if code1 == '?':
- parent = cif_entry['_chem_comp.mon_nstd_parent_comp_id']
- parent = parent.strip().upper()
- if parent in AAbank:
- code1 = AAbank[parent].lower()
- else:
- if container == protein:
- code1 = 'x'
- else:
- code1 = 'n'
- code3 = cif_entry['_chem_comp.three_letter_code'].strip().upper()
- name = cif_entry['_chem_comp.name'].strip()
- if name[0] == '"' and name[-1] == '"':
- name = name[1:-1]
- modified = code3 not in AAbank
- code1 = code1.lower() if modified else code1.upper()
- container.append((code1, modified, code3, name))

-cif_entry = {}
+ if code1 == "?":
+ code1 = unknown_code1[type]

-for line in args.i:
- line = line.strip()
- if 'data_' in line and cif_entry:
- try:
- process_cif_entry(cif_entry)
- except:
- pass
- cif_entry = {}
- else:
- key_value = line.split(' ', 1)
- if len(key_value) == 2:
- key, value = key_value
- cif_entry[key] = value
+ if code3 in unmodified_codes3:
+ is_modified = False

-protein.append(('X', False, "", "Undefined Aminoacid"))
-dna.append(('N', False, "", "Undefined DNA Nucelotide"))
-rna.append(('N', False, "", "Undefined RNA Nucelotide"))
+ if is_modified:
+ code1 = code1.lower()

-protein.sort()
-dna.sort()
-rna.sort()
+ add_entry(**locals())

-template = args.t.read()
-args.o.write(template % {'protein': pformat(protein, width=1024),
- 'dna': pformat(dna, width=1024),
- 'rna': pformat(rna, width=1024)})
+def parse_cif(file, callback):
+ entry = None
+ key = None
+ for line_no, line in enumerate(file, 1):
+ line = line.strip()
+ if line.startswith("data_"):
+ if entry is not None:
+ callback(entry)
+ entry = {}
+ elif line.startswith("_"):
+ line_split = line.split(" ", 1)
+ key = line_split[0].strip().lower()
+ entry[key] = ""
+ if " " in line:
+ value = line_split[1].strip().strip('\'\"')
+ entry[key] = value
+ elif not line.startswith("#"):
+ entry[key] += line.lstrip(";").strip().strip('\'\"')
+ callback(entry)
+
+if __name__ == "__main__":
+ import optparse
+ parser = optparse.OptionParser()
+ parser.add_option('-i', '--input-cif',
+ help="input components.cif or aa-variants.cif",
+ default=components_cif_path)
+ parser.add_option('-o', '--output-py',
+ help="output codes.py file",
+ default=codes_py_path)
+ options, args = parser.parse_args()
+
+ try:
+ assert args == [], "No positional arguments are accepted"
+ options.input_cif = open(options.input_cif)
+ options.output_py = open(options.output_py, "w")
+ main(options)
+ except TypeError, e:
+ parser.error(e)