allpy
diff allpy/base.py @ 261:d60628e29b24
Moved contents of block.py to base.py without any changes.
author | Daniil Alexeyevsky <me.dendik@gmail.com> |
---|---|
date | Tue, 14 Dec 2010 21:02:49 +0300 |
parents | aae821828b03 |
children | e361a7b7d9aa |
line diff
1.1 --- a/allpy/base.py Tue Dec 14 20:51:53 2010 +0300 1.2 +++ b/allpy/base.py Tue Dec 14 21:02:49 2010 +0300 1.3 @@ -1,3 +1,10 @@ 1.4 +import sys 1.5 +import os 1.6 +from tempfile import NamedTemporaryFile 1.7 + 1.8 +import config 1.9 +from graph import Graph 1.10 +from Bio.PDB import Superimposer 1.11 from fasta import save_fasta 1.12 import data.codes 1.13 1.14 @@ -338,7 +345,161 @@ 1.15 for m in self.body[sequence]]) 1.16 1.17 class Block(object): 1.18 - """ """ 1.19 - pass 1.20 + """ Block of alignment 1.21 + 1.22 + Mandatory data: 1.23 + * self.alignment -- alignment object, which the block belongs to 1.24 + * self.sequences - set of sequence objects that contain monomers 1.25 + and/or gaps, that constitute the block 1.26 + * self.positions -- list of positions of the alignment.body that 1.27 + are included in the block; position[i+1] is always to the right from position[i] 1.28 + 1.29 + Don't change self.sequences -- it may be a link to other block.sequences 1.30 + 1.31 + How to create a new block: 1.32 + >>> import alignment 1.33 + >>> import block 1.34 + >>> proj = alignment.Alignment(open("test.fasta")) 1.35 + >>> block1 = block.Block(proj) 1.36 + """ 1.37 + 1.38 + def __init__(self, alignment, sequences=None, positions=None): 1.39 + """ Builds new block from alignment 1.40 + 1.41 + if sequences==None, all sequences are used 1.42 + if positions==None, all positions are used 1.43 + """ 1.44 + if sequences == None: 1.45 + sequences = set(alignment.sequences) # copy 1.46 + if positions == None: 1.47 + positions = range(len(alignment)) 1.48 + self.alignment = alignment 1.49 + self.sequences = sequences 1.50 + self.positions = positions 1.51 + 1.52 + def save_fasta(self, out_file, long_line=70, gap='-'): 1.53 + """ Saves alignment to given file in fasta-format 1.54 + 1.55 + No changes in the names, descriptions or order of the sequences 1.56 + are made. 1.57 + """ 1.58 + for sequence in self.sequences: 1.59 + alignment_monomers = self.alignment.body[sequence] 1.60 + block_monomers = [alignment_monomers[i] for i in self.positions] 1.61 + string = ''.join([m.type.code1 if m else '-' for m in block_monomers]) 1.62 + save_fasta(out_file, string, sequence.name, sequence.description, long_line) 1.63 + 1.64 + def geometrical_cores(self, max_delta=config.delta, 1.65 + timeout=config.timeout, minsize=config.minsize, 1.66 + ac_new_atoms=config.ac_new_atoms, 1.67 + ac_count=config.ac_count): 1.68 + """ Returns length-sorted list of blocks, representing GCs 1.69 + 1.70 + max_delta -- threshold of distance spreading 1.71 + timeout -- Bron-Kerbosh timeout (then fast O(n ln n) algorithm) 1.72 + minsize -- min size of each core 1.73 + ac_new_atoms -- min part or new atoms in new alternative core 1.74 + current GC is compared with each of already selected GCs 1.75 + if difference is less then ac_new_atoms, current GC is skipped 1.76 + difference = part of new atoms in current core 1.77 + ac_count -- max number of cores (including main core) 1.78 + -1 means infinity 1.79 + If more than one pdb chain for some sequence provided, consider all of them 1.80 + cost is calculated as 1 / (delta + 1) 1.81 + delta in [0, +inf) => cost in (0, 1] 1.82 + """ 1.83 + nodes = self.positions 1.84 + lines = {} 1.85 + for i in self.positions: 1.86 + for j in self.positions: 1.87 + if i < j: 1.88 + distances = [] 1.89 + for sequence in self.sequences: 1.90 + for chain in sequence.pdb_chains: 1.91 + m1 = self.alignment.body[sequence][i] 1.92 + m2 = self.alignment.body[sequence][j] 1.93 + if m1 and m2: 1.94 + r1 = sequence.pdb_residues[chain][m1] 1.95 + r2 = sequence.pdb_residues[chain][m2] 1.96 + ca1 = r1['CA'] 1.97 + ca2 = r2['CA'] 1.98 + d = ca1 - ca2 # Bio.PDB feature 1.99 + distances.append(d) 1.100 + if len(distances) >= 2: 1.101 + delta = max(distances) - min(distances) 1.102 + if delta <= max_delta: 1.103 + lines[Graph.line(i, j)] = 1.0 / (1.0 + max_delta) 1.104 + graph = Graph(nodes, lines) 1.105 + cliques = graph.cliques(timeout=timeout, minsize=minsize) 1.106 + GCs = [] 1.107 + for clique in cliques: 1.108 + for GC in GCs: 1.109 + if len(clique - set(GC.positions)) < ac_new_atoms * len(clique): 1.110 + break 1.111 + else: 1.112 + GCs.append(Block(self.alignment, self.sequences, clique)) 1.113 + if ac_count != -1 and len(GCs) >= ac_count: 1.114 + break 1.115 + return GCs 1.116 + 1.117 + def xstring(self, x='X', gap='-'): 1.118 + """ Returns string consisting of gap chars and chars x at self.positions 1.119 + 1.120 + Length of returning string = length of alignment 1.121 + """ 1.122 + monomers = [False] * len(self.alignment) 1.123 + for i in self.positions: 1.124 + monomers[i] = True 1.125 + return ''.join([x if m else gap for m in monomers]) 1.126 + 1.127 + def save_xstring(self, out_file, name, description='', x='X', gap='-', long_line=70): 1.128 + """ Save xstring and name in fasta format """ 1.129 + save_fasta(out_file, self.xstring(x=x, gap=gap), name, description, long_line) 1.130 + 1.131 + def monomers(self, sequence): 1.132 + """ Iterates monomers of this sequence from this block """ 1.133 + alignment_sequence = self.alignment.body[sequence] 1.134 + return (alignment_sequence[i] for i in self.positions) 1.135 + 1.136 + def ca_atoms(self, sequence, pdb_chain): 1.137 + """ Iterates Ca-atom of monomers of this sequence from this block """ 1.138 + return (sequence.pdb_residues[pdb_chain][monomer] for monomer in self.monomers()) 1.139 + 1.140 + def sequences_chains(self): 1.141 + """ Iterates pairs (sequence, chain) """ 1.142 + for sequence in self.alignment.sequences: 1.143 + if sequence in self.sequences: 1.144 + for chain in sequence.pdb_chains: 1.145 + yield (sequence, chain) 1.146 + 1.147 + def superimpose(self): 1.148 + """ Superimpose all pdb_chains in this block """ 1.149 + sequences_chains = list(self.sequences_chains()) 1.150 + if len(sequences_chains) >= 1: 1.151 + sup = Superimposer() 1.152 + fixed_sequence, fixed_chain = sequences_chains.pop() 1.153 + fixed_atoms = self.ca_atoms(fixed_sequence, fixed_chain) 1.154 + for sequence, chain in sequences_chains: 1.155 + moving_atoms = self.ca_atoms(sequence, chain) 1.156 + sup.set_atoms(fixed_atoms, moving_atoms) 1.157 + # Apply rotation/translation to the moving atoms 1.158 + sup.apply(moving_atoms) 1.159 + 1.160 + def pdb_save(self, out_file): 1.161 + """ Save all sequences 1.162 + 1.163 + Returns {(sequence, chain): CHAIN} 1.164 + CHAIN is chain letter in new file 1.165 + """ 1.166 + tmp_file = NamedTemporaryFile(delete=False) 1.167 + tmp_file.close() 1.168 + 1.169 + for sequence, chain in self.sequences_chains(): 1.170 + sequence.pdb_save(tmp_file.name, chain) 1.171 + # TODO: read from tmp_file.name 1.172 + # change CHAIN 1.173 + # add to out_file 1.174 + 1.175 + os.unlink(NamedTemporaryFile) 1.176 1.177 # vim: set ts=4 sts=4 sw=4 et: