allpy: 3d40d7b9a123 lib/project.py

allpy

view lib/project.py @ 98:3d40d7b9a123

lib::project::muscle_align project::save_fasta sequence::operator== fixes test -- alignment done

author	boris <bnagaev@gmail.com>
date	Wed, 20 Oct 2010 23:33:29 +0400
parents	7dd461d17f96
children	ffc102ed0249

line source

1 #!/usr/bin/python

3 """

4 "I will not use abbrev."

5 "I will always finish what I st"

6 - Bart Simpson

8 """

10 import sequence

11 from monomer import AminoAcidType

12 import allpy_data

13 import os

14 from tempfile import mkstemp

16 class Project(object):

17 """

18 Mandatory data:

19 * sequences -- list of Sequence objects. Sequences don't contain gaps

20 - see sequence.py module

21 * alignment -- dict

22 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}

23 keys are the Sequence objects, values are the lists, which

24 contain monomers of those sequences or None for gaps in the

25 corresponding sequence of

26 alignment

28 """

29 def __init__(self, *args):

30 """overloaded constructor

32 Project() -> new empty Project

33 Project(sequences, alignment) -> new Project with sequences and

34 alignment initialized from arguments

35 Project(fasta_file) -> new Project, read alignment and sequences

36 from fasta file

38 """

39 if len(args)>1:#overloaded constructor

40 self.sequences=args[0]

41 self.alignment=args[1]

42 elif len(args)==0:

43 self.sequences=[]

44 self.alignment={}

45 else:

46 self.sequences,self.alignment=Project.from_fasta(args[0])

48 def __len__(self):

49 return max([len(line) for line in self.alignment.values()])

51 def thickness(self):

52 """ The number of sequences in alignment (it's thickness).

53 """

54 return len(self.alignment)

56 def calc_identity(self):

57 """ Calculate the identity of alignment positions for colouring.

59 For every (row, column) in alignment the percentage of the exactly

60 same residue in the same column in the alignment is calculated.

61 The data structure is just like the Project.alignment, but istead of

62 monomers it contains float percentages.

63 """

64 # Oh, God, that's awful! Absolutely not understandable.

65 # First, calculate percentages of amino acids in every column

66 contribution = 1.0 / len(self.sequences)

67 all_columns = []

68 for position in range(len(self)):

69 column_percentage = {}

70 for seq in self.alignment:

71 if self.alignment[seq][position] is not None:

72 aa = self.alignment[seq][position].code

73 else:

74 aa = None

75 if aa in allpy_data.amino_acids:

76 if aa in column_percentage.keys():

77 column_percentage[aa] += contribution

78 else:

79 column_percentage[aa] = contribution

80 all_columns.append(column_percentage)

81 # Second, map these percentages onto the alignment

82 self.identity_percentages = {}

83 for seq in self.sequences:

84 self.identity_percentages[seq] = []

85 for seq in self.identity_percentages:

86 line = self.identity_percentages[seq]

87 for position in range(len(self)):

88 if self.alignment[seq][position] is not None:

89 aa = self.alignment[seq][position].code

90 else:

91 aa = None

92 line.append(all_columns[position].get(aa))

93 return self.identity_percentages

95 @staticmethod

96 def from_fasta(file, monomer_type=AminoAcidType):

97 """

98 >>> import project

99 >>> sequences,alignment=project.Project.from_fasta(open("test.fasta"))

100 """

101 import re

102

103 sequences=[]

104 alignment={}

105

106 content=file.read()

107 raw_sequences=content.split(">")[1:]#ignore everything before the first >

108 for raw in raw_sequences:

109 parsed_raw_sequence = raw.split("\n")

110 for counter,piece in enumerate(parsed_raw_sequence):

111 parsed_raw_sequence[counter]=piece.strip()#cut \r or whitespaces

112 name_and_description = parsed_raw_sequence[0]

113 if len(name_and_description.split(" ",1))==2:

114 name, description = name_and_description.split(" ",1)

115 elif len(name_and_description.split(" ", 1)) == 1:#if there is description

116 name = name_and_description

117 else:

118 raise "Wrong name of sequence in fasta file"

119 string=""

120 for piece in parsed_raw_sequence[1:]:

121 piece_without_whitespace_chars=re.sub("\s","",piece)

122 string+=piece_without_whitespace_chars

123 monomers=[]#convert into Monomer objects

124 alignment_list=[]#create the respective list in alignment dict

125 for current_monomer in string:

126 if current_monomer!="-" and current_monomer!="." and current_monomer!="~":

127 monomers.append(monomer_type.from_code1(current_monomer).instance())

128 alignment_list.append(monomers[-1])

129 else:

130 alignment_list.append(None)

131 if "description" in vars():#if there's no description

132 sequences.append(sequence.Sequence(name,monomers,description))

133 else:

134 sequences.append(sequence.Sequence(name,monomers))

135 alignment[sequences[-1]]=alignment_list

136 return sequences,alignment

137

138

139 @staticmethod

140 def from_sequences(*sequences):

141 """

142 Constructs new alignment from sequences

143 Add gaps to right end to make equal lengthes of alignment sequences

144 """

145 project = Project()

146 project.sequences = sequences

147 max_length = max(len(sequence) for sequence in sequences)

148 for sequence in sequences:

149 gaps_count = max_length - len(sequence)

150 project.alignment[sequence] = sequence.monomers + [None] * gaps_count

151 return project

152

153 def save_fasta(self, out_file, long_line=60):

154 """

155 Saves alignment to given file

156 Splits long lines to substrings of length=long_line

157 To prevent this, set long_line=None

158 """

159 for sequence in self.sequences:

160 out_file.write(">%(name)s %(description)s \n" % sequence.__dict__)

161 string = str(sequence)

162 if long_line:

163 for i in range(0, len(string) // long_line + 1):

164 out_file.write("%s \n" % string[i*long_line : i*long_line + long_line])

165 else:

166 out_file.write("%s \n" % string)

167

168 def muscle_align(self):

169 """

170 Simple align ths alignment using sequences (muscle)

171 uses old Monomers and Sequences objects

172 """

173 tmp_file, tmp_filename = mkstemp()

174 os.close(tmp_file) # this is file descriptor, not normal file object. security issue )

175 tmp_file = open(tmp_filename, 'w')

176 self.save_fasta(tmp_file)

177 tmp_file.close()

178 os.system("muscle -in %(tmp)s -out %(tmp)s" % {'tmp': tmp_filename})

179 sequences, alignment = Project.from_fasta(open(tmp_filename))

180 for sequence in self.sequences:

181 try:

182 new_sequence = [i for i in sequences if sequence==i][0]

183 except:

184 raise Exception("Align: Cann't find sequence %s in muscle output" % sequence.name)

185 old_monomers = sequence.monomers.__iter__()

186 self.alignment[sequence] = []

187 for monomer in alignment[new_sequence]:

188 if not monomer:

189 self.alignment[sequence].append(monomer)

190 else:

191 self.alignment[sequence].append(old_monomers.next())

192