allpy: 596bdc5897bf allpy/sequence.py

allpy

view allpy/sequence.py @ 195:596bdc5897bf

repeats: function repeat_joiner.full_repeats() seems completed

author	boris <bnagaev@gmail.com>
date	Fri, 19 Nov 2010 00:17:11 +0300
parents	a016cad5f5a0
children	346a6cd4fc1d

line source

1 #!/usr/bin/python

2 # -*- coding: utf-8 -*-

4 from monomer import AminoAcidType

5 from Bio.PDB import CaPPBuilder, PDBIO

6 from Bio.PDB.DSSP import make_dssp_dict

7 from allpy_pdb import std_id, pdb_id_parse, get_structure

8 import alignment

9 import sys

10 import config

11 import os.path

12 import urllib2

13 from tempfile import NamedTemporaryFile

14 import os

17 class Sequence(object):

18 """ Sequence of Monomers

20 Mandatory data:

21 * name -- str with the name of sequence

22 * description -- str with description of the sequence

23 * monomers -- list of monomer objects (aminoacids or nucleotides)

25 Optional (may be empty):

26 * pdb_chains -- list of Bio.PDB.Chain's

27 * pdb_files -- dictionary like {Bio.PDB.Chain: file_obj}

29 * pdb_residues -- dictionary like {Bio.PDB.Chain: {Monomer: Bio.PDB.Residue}}

30 * pdb_secstr -- dictionary like {Bio.PDB.Chain: {Monomer: 'Secondary structure'}}

31 Code Secondary structure

32 H alpha-helix

33 B Isolated beta-bridge residue

34 E Strand

35 G 3-10 helix

36 I pi-helix

37 T Turn

38 S Bend

39 - Other

42 ?TODO: global pdb_structures

43 """

44 def __init__(self, monomers=None, name='', description=""):

45 if not monomers:

46 monomers = []

47 self.name = name

48 self.description = description

49 self.monomers = monomers

50 self.pdb_chains = []

51 self.pdb_files = {}

52 self.pdb_residues = {}

53 self.pdb_secstr = {}

55 def __len__(self):

56 return len(self.monomers)

58 def __str__(self):

59 """ Returns sequence in one-letter code """

60 return ''.join([monomer.type.code1 for monomer in self.monomers])

62 def __eq__(self, other):

63 """ Returns if all corresponding monomers of this sequences are equal

65 If lengths of sequences are not equal, returns False

66 """

67 return len(self) == len(other) and \

68 all([a==b for a, b in zip(self.monomers, other.monomers)])

70 def __ne__(self, other):

71 return not (self == other)

73 def pdb_chain_add(self, pdb_file, pdb_id, pdb_chain, pdb_model=0):

74 """ Reads Pdb chain from file

76 and align each Monomer with PDB.Residue (TODO)

77 """

78 name = std_id(pdb_id, pdb_chain, pdb_model)

79 structure = get_structure(pdb_file, name)

80 chain = structure[pdb_model][pdb_chain]

81 self.pdb_chains.append(chain)

82 self.pdb_residues[chain] = {}

83 self.pdb_secstr[chain] = {}

84 pdb_sequence = Sequence.from_pdb_chain(chain)

85 a = alignment.Alignment.from_sequences(self, pdb_sequence)

86 a.muscle_align()

87 for monomer, pdb_monomer in a.column(sequence=pdb_sequence, original=self):

88 if pdb_sequence.pdb_has(chain, pdb_monomer):

89 residue = pdb_sequence.pdb_residues[chain][pdb_monomer]

90 self.pdb_residues[chain][monomer] = residue

91 self.pdb_files[chain] = pdb_file

93 def pdb_unload(self):

94 """ Delete all pdb-connected links """

95 #~ gc.get_referrers(self.pdb_chains[0])

96 self.pdb_chains = []

97 self.pdb_residues = {}

98 self.pdb_secstr = {} # FIXME

99 self.pdb_files = {} # FIXME

100

101 @staticmethod

102 def from_str(fasta_str, name='', description='', monomer_kind=AminoAcidType):

103 """ Import data from one-letter code

104

105 monomer_kind is class, inherited from MonomerType

106 """

107 monomers = [monomer_kind.from_code1(aa).instance() for aa in fasta_str]

108 return Sequence(monomers, name, description)

109

110 @staticmethod

111 def from_pdb_chain(chain):

112 """ Returns Sequence with Monomers with link to Bio.PDB.Residue

113

114 chain is Bio.PDB.Chain

115 """

116 cappbuilder = CaPPBuilder()

117 peptides = cappbuilder.build_peptides(chain)

118 sequence = Sequence()

119 sequence.pdb_chains = [chain]

120 sequence.pdb_residues[chain] = {}

121 sequence.pdb_secstr[chain] = {}

122 for peptide in peptides:

123 for ca_atom in peptide.get_ca_list():

124 residue = ca_atom.get_parent()

125 monomer = AminoAcidType.from_pdb_residue(residue).instance()

126 sequence.pdb_residues[chain][monomer] = residue

127 sequence.monomers.append(monomer)

128 return sequence

129

130 def pdb_auto_add(self, conformity_info=None, pdb_directory='./tmp'):

131 """ Adds pdb information to each monomer

132

133 Returns if information has been successfully added

134 TODO: conformity_file

135

136 id-format lava flow

137 """

138 if not conformity_info:

139 path = os.path.join(pdb_directory, self.name)

140 if os.path.exists(path) and os.path.getsize(path):

141 match = pdb_id_parse(self.name)

142 self.pdb_chain_add(open(path), match['code'],

143 match['chain'], match['model'])

144 else:

145 match = pdb_id_parse(self.name)

146 if match:

147 code = match['code']

148 pdb_filename = config.pdb_dir % code

149 if not os.path.exists(pdb_filename) or not os.path.getsize(pdb_filename):

150 url = config.pdb_url % code

151 print "Download %s" % url

152 pdb_file = open(pdb_filename, 'w')

153 data = urllib2.urlopen(url).read()

154 pdb_file.write(data)

155 pdb_file.close()

156 print "Save %s" % pdb_filename

157 pdb_file = open(pdb_filename)

158 self.pdb_chain_add(pdb_file, code, match['chain'], match['model'])

159

160 def pdb_save(self, out_filename, pdb_chain):

161 """ Saves pdb_chain to out_file """

162 class GlySelect(Select):

163 def accept_chain(self, chain):

164 if chain == pdb_chain:

165 return 1

166 else:

167 return 0

168 io = PDBIO()

169 structure = chain.get_parent()

170 io.set_structure(structure)

171 io.save(out_filename, GlySelect())

172

173

174 def pdb_add_sec_str(self, pdb_chain):

175 """ Add secondary structure data """

176 tmp_file = NamedTemporaryFile(delete=False)

177 tmp_file.close()

178 pdb_file = self.pdb_files[pdb_chain].name

179 os.system("dsspcmbi %(pdb)s %(tmp)s" % {'pdb': pdb_file, 'tmp': tmp_file.name})

180 dssp, keys = make_dssp_dict(tmp_file.name)

181 for monomer in self.monomers:

182 if self.pdb_has(pdb_chain, monomer):

183 residue = self.pdb_residues[pdb_chain][monomer]

184 try:

185 d = dssp[(pdb_chain.get_id(), residue.get_id())]

186 self.pdb_secstr[pdb_chain][monomer] = d[1]

187 except:

188 print "No dssp information about %s at %s" % (monomer, pdb_chain)

189 os.unlink(tmp_file.name)

190

191 def pdb_has(self, chain, monomer):

192 return chain in self.pdb_residues and monomer in self.pdb_residues[chain]

193

194 def secstr_has(self, chain, monomer):

195 return chain in self.pdb_secstr and monomer in self.pdb_secstr[chain]

196

197 @staticmethod

198 def file_slice(file, n_from, n_to, fasta_name='', name='', description='', monomer_kind=AminoAcidType):

199 """ Build and return sequence, consisting of part of sequence from file

200

201 Does not control gaps

202 """

203 inside = False

204 number_used = 0

205 s = ''

206 for line in file:

207 line = line.split()

208 if not inside:

209 if line.startswith('>%s' % fasta_name):

210 inside = True

211 else:

212 n = len(line)

213 s += line[(n_from - number_user):(n_to - number_user)]

214 return Sequence.from_str(s, name, description, monomer_kind)