allpy: 675b402094be lib/sequence.py

allpy

view lib/sequence.py @ 151:675b402094be

day commit -- a lot of changes fasta.py: universal save_fasta() determine_long_line -- for determine length of fasta sequence string in user input everywhere: standart long_line=60 --> 70 blocK.sequences_chains: returns sequences in order as in project added monomer pdb_secstr to store secondary structure pdb adding: some improvements and fixes fix in from_pdb_chain: use all peptides, not only first Sequence.pdb_files added to store information about pdb file for each chain dssp bindings to get secondary structure /sec_str -- tool to map secondary structure on each sequence of alignment

author	boris (netbook) <bnagaev@gmail.com>
date	Tue, 26 Oct 2010 00:40:36 +0400
parents	f7dead025719
children	0c7f6117481b

line source

1 #!/usr/bin/python

2 from monomer import AminoAcidType

3 from Bio.PDB import PDBParser, CaPPBuilder, PDBIO

4 from Bio.PDB.DSSP import make_dssp_dict

5 from allpy_pdb import std_id, pdb_id_parse

6 import project

7 import sys

8 import config

9 import os.path

10 import urllib2

11 from tempfile import NamedTemporaryFile

12 import os

15 class Sequence(object):

16 """ Sequence of Monomers

18 Mandatory data:

19 * name -- str with the name of sequence

20 * description -- str with description of the sequence

21 * monomers -- list of monomer objects (aminoacids or nucleotides)

22 * pdb_chains -- list of Bio.PDB.Chain's

23 * pdb_files -- dictionary like {Bio.PDB.Chain: file_obj}

25 ?TODO: global pdb_structures

26 """

27 def __init__(self, monomers=None, name='', description=""):

28 if not monomers:

29 monomers = []

30 self.name = name

31 self.description = description

32 self.monomers = monomers

33 self.pdb_chains = []

34 self.pdb_files = {}

36 def __len__(self):

37 return len(self.monomers)

39 def __str__(self):

40 """ Returns sequence in one-letter code """

41 return ''.join([monomer.type.code1 for monomer in self.monomers])

43 def __eq__(self, other):

44 """ Returns if all corresponding monomers of this sequences are equal

46 If lengths of sequences are not equal, returns False

47 """

48 return len(self) == len(other) and \

49 all([a==b for a, b in zip(self.monomers, other.monomers)])

51 def __ne__(self, other):

52 return not (self == other)

54 def pdb_chain_add(self, pdb_file, pdb_id, pdb_chain, pdb_model=0):

55 """ Reads Pdb chain from file

57 and align each Monomer with PDB.Residue (TODO)

58 """

59 name = std_id(pdb_id, pdb_chain, pdb_model)

60 structure = PDBParser().get_structure(name, pdb_file)

61 chain = structure[pdb_model][pdb_chain]

62 self.pdb_chains.append(chain)

63 pdb_sequence = Sequence.from_pdb_chain(chain)

64 alignment = project.Project.from_sequences(self, pdb_sequence)

65 alignment.muscle_align()

66 for monomer, pdb_monomer in alignment.column(sequence=pdb_sequence, original=self):

67 if pdb_monomer and chain in pdb_monomer.pdb_residues:

68 monomer.pdb_residue_add(chain, pdb_monomer.pdb_residues[chain])

69 self.pdb_files[chain] = pdb_file

71 @staticmethod

72 def from_str(fasta_str, name='', description='', monomer_kind=AminoAcidType):

73 """ Import data from one-letter code

75 monomer_kind is class, inherited from MonomerType

76 """

77 monomers = [monomer_kind.from_code1(aa).instance() for aa in fasta_str]

78 return Sequence(monomers, name, description)

80 @staticmethod

81 def from_pdb_chain(chain):

82 """ Returns Sequence with Monomers with link to Bio.PDB.Residue

84 chain is Bio.PDB.Chain

85 """

86 cappbuilder = CaPPBuilder()

87 peptides = cappbuilder.build_peptides(chain)

88 sequence = Sequence()

89 sequence.pdb_chains = [chain]

90 for peptide in peptides:

91 for ca_atom in peptide.get_ca_list():

92 residue = ca_atom.get_parent()

93 monomer = AminoAcidType.from_pdb_residue(residue).instance()

94 monomer.pdb_residue_add(chain, residue)

95 sequence.monomers.append(monomer)

96 return sequence

98 def pdb_auto_add(self, conformity_info=None, pdb_directory='./pdb'):

99 """ Adds pdb information to each monomer

100

101 Returns if information has been successfully added

102 TODO: conformity_file

103

104 id-format lava flow

105 """

106 if not conformity_info:

107 path = os.path.join(pdb_directory, self.name)

108 if os.path.exists(path):

109 match = pdb_id_parse(self.name)

110 self.pdb_chain_add(open(path), match['code'],

111 match['chain'], match['model'])

112 else:

113 match = pdb_id_parse(self.name)

114 if match:

115 code = match['code']

116 pdb_filename = config.pdb_dir % code

117 if not os.path.exists(pdb_filename):

118 url = config.pdb_url % code

119 print "Download %s" % url

120 pdb_file = open(pdb_filename, 'w')

121 data = urllib2.urlopen(url).read()

122 pdb_file.write(data)

123 pdb_file.close()

124 print "Save %s" % pdb_filename

125 pdb_file = open(pdb_filename)

126 self.pdb_chain_add(pdb_file, code, match['chain'], match['model'])

127

128 def pdb_save(self, out_filename, pdb_chain):

129 """ Saves pdb_chain to out_file """

130 class GlySelect(Select):

131 def accept_chain(self, chain):

132 if chain == pdb_chain:

133 return 1

134 else:

135 return 0

136 io = PDBIO()

137 structure = chain.get_parent()

138 io.set_structure(structure)

139 io.save(out_filename, GlySelect())

140

141

142 def pdb_add_sec_str(self, pdb_chain):

143 """ Add secondary structure data """

144 tmp_file = NamedTemporaryFile(delete=False)

145 tmp_file.close()

146 pdb_file = self.pdb_files[pdb_chain].name

147 os.system("dsspcmbi %(pdb)s %(tmp)s" % {'pdb': pdb_file, 'tmp': tmp_file.name})

148 dssp, keys = make_dssp_dict(tmp_file.name)

149 for monomer in self.monomers:

150 if pdb_chain in monomer.pdb_residues:

151 residue = monomer.pdb_residues[pdb_chain]

152 try:

153 d = dssp[(pdb_chain.get_id(), residue.get_id())]

154 monomer.pdb_secstr[pdb_chain] = d[1]

155 except:

156 print "No dssp information about %s at %s" % (monomer, pdb_chain)

157 os.unlink(tmp_file.name)

158