rev |
line source |
bnagaev@147
|
1 """ Functions to get pdb information from fasta id |
bnagaev@138
|
2 and to generate fasta id from pdb information |
bnagaev@138
|
3 |
bnagaev@138
|
4 pdb information: code, chain, model |
bnagaev@138
|
5 |
bnagaev@138
|
6 TODO: same for local pdb files |
bnagaev@138
|
7 """ |
bnagaev@138
|
8 |
bnagaev@248
|
9 import re |
me@308
|
10 import os |
me@308
|
11 import os.path |
me@308
|
12 from tempfile import NamedTemporaryFile |
me@308
|
13 import urllib2 |
me@308
|
14 |
bnagaev@248
|
15 from Bio.PDB import PDBParser |
me@274
|
16 from Bio.PDB import Superimposer, CaPPBuilder, PDBIO |
me@308
|
17 from Bio.PDB.DSSP import make_dssp_dict |
me@308
|
18 |
me@308
|
19 import base |
bnagaev@385
|
20 import processors |
bnagaev@385
|
21 import config |
me@314
|
22 from graph import Graph |
bnagaev@248
|
23 |
bnagaev@248
|
24 |
bnagaev@79
|
25 # for pdb-codes |
bnagaev@151
|
26 re1 = re.compile(r"(^|[^a-z0-9])(?P<code>[0-9][0-9a-z]{3})([^a-z0-9](?P<chain>[0-9a-z ]?)(?P<model>[^a-z0-9]([0-9]{1,3}))?)?", re.I) |
bnagaev@79
|
27 |
bnagaev@138
|
28 def pdb_id_parse(ID): |
bnagaev@139
|
29 match = re1.search(ID) |
bnagaev@138
|
30 if not match: |
bnagaev@138
|
31 return None |
bnagaev@139
|
32 d = match.groupdict() |
bnagaev@139
|
33 if 'chain' not in d or not d['chain']: |
bnagaev@139
|
34 d['chain'] = ' ' |
bnagaev@139
|
35 if 'model' not in d or not d['model']: |
bnagaev@139
|
36 d['model'] = 0 |
bnagaev@139
|
37 return d |
me@270
|
38 |
me@270
|
39 |
bnagaev@156
|
40 def get_structure(file, name): |
bnagaev@157
|
41 return PDBParser().get_structure(name, file) |
me@270
|
42 |
me@345
|
43 class SequenceMixin(base.Sequence): |
me@345
|
44 """Mixin for adding PDB data to a Sequence. |
me@345
|
45 |
me@345
|
46 Please note: since this is a mixin, no objects of this class should be |
me@345
|
47 created. This class is intended to be subclassed together with one of |
me@345
|
48 Sequence classes. |
me@274
|
49 |
me@274
|
50 Attributes: |
me@274
|
51 |
me@274
|
52 * pdb_chain -- Bio.PDB.Chain |
me@274
|
53 * pdb_residues -- {Monomer: Bio.PDB.Residue} |
me@274
|
54 |
me@274
|
55 ?TODO: global pdb_structures |
me@274
|
56 """ |
me@274
|
57 |
bnagaev@385
|
58 def set_pdb_chain(self, pdb_file, pdb_id, pdb_chain='A', pdb_model=0): |
me@274
|
59 """ Reads Pdb chain from file |
me@274
|
60 |
me@274
|
61 and align each Monomer with PDB.Residue (TODO) |
me@274
|
62 """ |
bnagaev@385
|
63 structure = get_structure(pdb_file, self.name) |
me@274
|
64 chain = structure[pdb_model][pdb_chain] |
bnagaev@385
|
65 self.pdb_chain = chain |
bnagaev@385
|
66 self.pdb_residues = {} |
bnagaev@385
|
67 pdb_sequence = self.__class__.from_pdb_chain(chain) |
bnagaev@385
|
68 Alignment = self.types.Alignment |
bnagaev@385
|
69 a = Alignment() |
bnagaev@385
|
70 a.append_sequence(self) |
bnagaev@385
|
71 a.append_sequence(pdb_sequence) |
bnagaev@385
|
72 a.process(processors.Muscle()) |
bnagaev@385
|
73 for monomer, pdb_monomer in a.columns_as_lists(): |
bnagaev@385
|
74 residue = pdb_sequence.pdb_residues[pdb_monomer] |
bnagaev@385
|
75 self.pdb_residues[monomer] = residue |
me@274
|
76 |
me@274
|
77 def pdb_unload(self): |
me@274
|
78 """ Delete all pdb-connected links """ |
bnagaev@385
|
79 del self.pdb_chain |
bnagaev@385
|
80 del self.pdb_residues |
me@274
|
81 |
bnagaev@385
|
82 @classmethod |
bnagaev@385
|
83 def from_pdb_chain(cls, chain): |
me@274
|
84 """ Returns Sequence with Monomers with link to Bio.PDB.Residue |
me@274
|
85 |
me@274
|
86 chain is Bio.PDB.Chain |
me@274
|
87 """ |
me@274
|
88 cappbuilder = CaPPBuilder() |
me@274
|
89 peptides = cappbuilder.build_peptides(chain) |
bnagaev@385
|
90 Sequence = cls |
bnagaev@385
|
91 Monomer = Sequence.types.Monomer |
me@274
|
92 sequence = Sequence() |
bnagaev@385
|
93 sequence.pdb_chain = chain |
bnagaev@385
|
94 sequence.pdb_residues = {} |
me@274
|
95 for peptide in peptides: |
me@274
|
96 for ca_atom in peptide.get_ca_list(): |
me@274
|
97 residue = ca_atom.get_parent() |
bnagaev@385
|
98 monomer = Monomer.from_code3(residue.get_resname()) |
bnagaev@385
|
99 sequence.pdb_residues[monomer] = residue |
bnagaev@385
|
100 sequence.append(monomer) |
me@274
|
101 return sequence |
me@274
|
102 |
bnagaev@385
|
103 def auto_pdb(self, conformity=None, dir='/tmp', mask='%s.pdb'): |
me@274
|
104 """ Adds pdb information to each monomer |
me@274
|
105 |
me@274
|
106 Returns if information has been successfully added |
me@274
|
107 TODO: conformity_file |
me@274
|
108 |
me@274
|
109 id-format lava flow |
me@274
|
110 """ |
bnagaev@385
|
111 match = pdb_id_parse(self.name) |
bnagaev@385
|
112 if not match: |
bnagaev@385
|
113 return False |
bnagaev@385
|
114 code = match['code'] |
bnagaev@385
|
115 chain = match['chain'] |
bnagaev@385
|
116 model = match['model'] |
bnagaev@385
|
117 user_path = os.path.join(dir, mask % self.name) |
bnagaev@385
|
118 path = user_path |
bnagaev@385
|
119 if not os.path.exists(path) or not os.path.getsize(path): |
bnagaev@385
|
120 path = config.pdb_path % code |
bnagaev@385
|
121 if not os.path.exists(path) or not os.path.getsize(path): |
bnagaev@385
|
122 url = config.pdb_url % code |
bnagaev@385
|
123 print "Download %s" % url |
bnagaev@385
|
124 path = user_path |
bnagaev@385
|
125 data = urllib2.urlopen(url).read() |
bnagaev@385
|
126 pdb_file = open(path, 'w') |
bnagaev@385
|
127 pdb_file.write(data) |
bnagaev@385
|
128 pdb_file.close() |
bnagaev@385
|
129 print "Save %s" % path |
bnagaev@385
|
130 self.set_pdb_chain(open(path), code, chain, model) |
bnagaev@385
|
131 return True |
me@274
|
132 |
me@274
|
133 def pdb_save(self, out_filename, pdb_chain): |
me@274
|
134 """ Saves pdb_chain to out_file """ |
bnagaev@385
|
135 class MySelect(Select): |
me@274
|
136 def accept_chain(self, chain): |
me@274
|
137 if chain == pdb_chain: |
me@274
|
138 return 1 |
me@274
|
139 else: |
me@274
|
140 return 0 |
me@274
|
141 io = PDBIO() |
me@274
|
142 structure = chain.get_parent() |
me@274
|
143 io.set_structure(structure) |
bnagaev@385
|
144 io.save(out_filename, MySelect()) |
me@274
|
145 |
bnagaev@385
|
146 def ca_atoms(self, sequence): |
bnagaev@385
|
147 """ Iterates Ca-atom of monomers of this sequence """ |
bnagaev@385
|
148 return (pdb_residues[monomer]['CA'] for monomer in self) |
me@296
|
149 |
me@345
|
150 class AlignmentMixin(base.Alignment): |
me@345
|
151 """Mixin to add 3D properties to alignments. |
me@345
|
152 |
me@345
|
153 Please note: since this is a mixin, no objects of this class should be |
me@345
|
154 created. This class is intended to be subclassed together with one of |
me@345
|
155 Alignment classes. |
me@345
|
156 """ |
bnagaev@385
|
157 pass |
me@296
|
158 |
me@345
|
159 class BlockMixin(base.Block): |
me@345
|
160 """Mixin to add 3D properties to blocks. |
me@345
|
161 |
me@345
|
162 Please note: since this is a mixin, no objects of this class should be |
me@345
|
163 created. This class is intended to be subclassed together with one of |
me@345
|
164 Block classes. |
me@345
|
165 """ |
me@308
|
166 |
me@309
|
167 def geometrical_cores(self, max_delta=config.delta, |
me@309
|
168 timeout=config.timeout, minsize=config.minsize, |
bnagaev@385
|
169 ac_new_atoms=config.ac_new_atoms, ac_count=config.ac_count): |
bnagaev@385
|
170 """ Returns length-sorted list of GCs |
bnagaev@385
|
171 GC is set of columns |
me@309
|
172 |
me@309
|
173 * max_delta -- threshold of distance spreading |
me@309
|
174 * timeout -- Bron-Kerbosh timeout (then fast O(n ln n) algorithm) |
me@309
|
175 * minsize -- min size of each core |
me@309
|
176 * ac_new_atoms -- min part or new atoms in new alternative core |
me@309
|
177 current GC is compared with each of already selected GCs if |
me@309
|
178 difference is less then ac_new_atoms, current GC is skipped |
me@309
|
179 difference = part of new atoms in current core |
me@309
|
180 * ac_count -- max number of cores (including main core) |
me@309
|
181 -1 means infinity |
me@309
|
182 |
me@309
|
183 cost is calculated as 1 / (delta + 1) |
me@309
|
184 delta in [0, +inf) => cost in (0, 1] |
me@309
|
185 """ |
bnagaev@385
|
186 nodes = self.columns |
me@309
|
187 lines = {} |
bnagaev@385
|
188 for i, column1 in enumerate(self.columns): |
bnagaev@385
|
189 for j, column2 in enumerate(self.columns): |
me@309
|
190 if i < j: |
me@309
|
191 distances = [] |
me@309
|
192 for sequence in self.sequences: |
bnagaev@385
|
193 if sequence in column1 and sequence in column2: |
bnagaev@385
|
194 m1 = column1[sequence] |
bnagaev@385
|
195 m2 = column2[sequence] |
bnagaev@385
|
196 r1 = sequence.pdb_residues[m1] |
bnagaev@385
|
197 r2 = sequence.pdb_residues[m2] |
bnagaev@385
|
198 ca1 = r1['CA'] |
bnagaev@385
|
199 ca2 = r2['CA'] |
bnagaev@385
|
200 d = ca1 - ca2 # Bio.PDB feature |
bnagaev@385
|
201 distances.append(d) |
me@309
|
202 if len(distances) >= 2: |
me@309
|
203 delta = max(distances) - min(distances) |
me@309
|
204 if delta <= max_delta: |
bnagaev@385
|
205 line = Graph.line(column1, column2) |
bnagaev@385
|
206 lines[line] = 1.0 / (1.0 + max_delta) |
me@309
|
207 graph = Graph(nodes, lines) |
me@309
|
208 cliques = graph.cliques(timeout=timeout, minsize=minsize) |
me@309
|
209 GCs = [] |
me@309
|
210 for clique in cliques: |
me@309
|
211 for GC in GCs: |
bnagaev@385
|
212 new_nodes = clique - GC |
bnagaev@386
|
213 if len(new_nodes) < ac_new_atoms * len(clique): |
me@309
|
214 break |
bnagaev@385
|
215 else: |
bnagaev@385
|
216 GCs.append(clique) |
bnagaev@385
|
217 return GCs |
me@309
|
218 |
me@308
|
219 |
me@308
|
220 |
me@308
|
221 def superimpose(self): |
me@308
|
222 """ Superimpose all pdb_chains in this block """ |
me@308
|
223 sequences_chains = list(self.sequences_chains()) |
me@308
|
224 if len(sequences_chains) >= 1: |
me@308
|
225 sup = Superimposer() |
me@308
|
226 fixed_sequence, fixed_chain = sequences_chains.pop() |
me@308
|
227 fixed_atoms = self.ca_atoms(fixed_sequence, fixed_chain) |
me@308
|
228 for sequence, chain in sequences_chains: |
me@308
|
229 moving_atoms = self.ca_atoms(sequence, chain) |
me@308
|
230 sup.set_atoms(fixed_atoms, moving_atoms) |
me@308
|
231 # Apply rotation/translation to the moving atoms |
me@308
|
232 sup.apply(moving_atoms) |
me@308
|
233 |
me@308
|
234 def pdb_save(self, out_file): |
me@308
|
235 """ Save all sequences |
me@308
|
236 |
me@308
|
237 Returns {(sequence, chain): CHAIN} |
me@308
|
238 CHAIN is chain letter in new file |
me@308
|
239 """ |
me@308
|
240 tmp_file = NamedTemporaryFile(delete=False) |
me@308
|
241 tmp_file.close() |
me@308
|
242 |
me@308
|
243 for sequence, chain in self.sequences_chains(): |
me@308
|
244 sequence.pdb_save(tmp_file.name, chain) |
me@308
|
245 # TODO: read from tmp_file.name |
me@308
|
246 # change CHAIN |
me@308
|
247 # add to out_file |
me@308
|
248 |
me@308
|
249 os.unlink(NamedTemporaryFile) |
me@308
|
250 |
me@274
|
251 # vim: set ts=4 sts=4 sw=4 et: |