allpy: c0689204093e lib/project.py

allpy

view lib/project.py @ 91:c0689204093e

from_fasta with new monomer class and monomer_type

author	boris <bnagaev@gmail.com>
date	Wed, 20 Oct 2010 10:05:43 +0400
parents	ea3113da42ca
children	581c1f71c4ea

line source

1 #!/usr/bin/python

3 """

4 "I will not use abbrev."

5 "I will always finish what I st"

6 - Bart Simpson

8 """

10 import sequence

11 import monomer

12 from monomer import MonomerType

13 import allpy_data

15 class Project(object):

16 """

17 Mandatory data:

18 * sequences -- list of Sequence objects. Sequences don't contain gaps

19 - see sequence.py module

20 * alignment -- dict

21 {<Sequence object>:[<Monomer object>,None,<Monomer object>]}

22 keys are the Sequence objects, values are the lists, which

23 contain monomers of those sequences or None for gaps in the

24 corresponding sequence of

25 alignment

27 """

28 def __init__(self, *args):

29 """overloaded constructor

31 Project() -> new empty Project

32 Project(sequences, alignment) -> new Project with sequences and

33 alignment initialized from arguments

34 Project(fasta_file) -> new Project, read alignment and sequences

35 from fasta file

37 """

38 if len(args)>1:#overloaded constructor

39 self.sequences=args[0]

40 self.alignment=args[1]

41 elif len(args)==0:

42 self.sequences=[]

43 self.alignment={}

44 else:

45 self.sequences,self.alignment=Project.from_fasta(args[0])

47 def __len__(self):

48 return max([len(line) for line in self.alignment.values()])

50 def thickness(self):

51 """ The number of sequences in alignment (it's thickness).

52 """

53 return len(self.alignment)

55 def calc_identity(self):

56 """ Calculate the identity of alignment positions for colouring.

58 For every (row, column) in alignment the percentage of the exactly

59 same residue in the same column in the alignment is calculated.

60 The data structure is just like the Project.alignment, but istead of

61 monomers it contains float percentages.

62 """

63 # Oh, God, that's awful! Absolutely not understandable.

64 # First, calculate percentages of amino acids in every column

65 contribution = 1.0 / len(self.sequences)

66 all_columns = []

67 for position in range(len(self)):

68 column_percentage = {}

69 for seq in self.alignment:

70 if self.alignment[seq][position] is not None:

71 aa = self.alignment[seq][position].code

72 else:

73 aa = None

74 if aa in allpy_data.amino_acids:

75 if aa in column_percentage.keys():

76 column_percentage[aa] += contribution

77 else:

78 column_percentage[aa] = contribution

79 all_columns.append(column_percentage)

80 # Second, map these percentages onto the alignment

81 self.identity_percentages = {}

82 for seq in self.sequences:

83 self.identity_percentages[seq] = []

84 for seq in self.identity_percentages:

85 line = self.identity_percentages[seq]

86 for position in range(len(self)):

87 if self.alignment[seq][position] is not None:

88 aa = self.alignment[seq][position].code

89 else:

90 aa = None

91 line.append(all_columns[position].get(aa))

92 return self.identity_percentages

94 @staticmethod

95 def from_fasta(file):

96 """

97 >>> import project

98 >>> sequences,alignment=project.Project.from_fasta(open("test.fasta"))

99 """

100 import re

101

102 sequences=[]

103 alignment={}

104

105 content=file.read()

106 raw_sequences=content.split(">")[1:]#ignore everything before the first >

107 for raw in raw_sequences:

108 parsed_raw_sequence = raw.split("\n")

109 for counter,piece in enumerate(parsed_raw_sequence):

110 parsed_raw_sequence[counter]=piece.strip()#cut \r or whitespaces

111 name_and_description = parsed_raw_sequence[0]

112 if len(name_and_description.split(" ",1))==2:

113 name,description=name_and_description.split(" ",1)

114 elif len(name_and_description.split(" ",1))==1:#if there is description

115 name=name_and_description

116 else:

117 raise "Wrong name of sequence in fasta file"

118 string=""

119 for piece in parsed_raw_sequence[1:]:

120 piece_without_whitespace_chars=re.sub("\s","",piece)

121 string+=piece_without_whitespace_chars

122 monomers=[]#convert into Monomer objects

123 alignment_list=[]#create the respective list in alignment dict

124 for current_monomer in string:

125 if current_monomer!="-" and current_monomer!="." and current_monomer!="~":

126 monomer_type = MonomerType.from_code1(current_monomer)

127 monomers.append(monomer.Monomer(monomer_type))

128 alignment_list.append(monomers[-1])

129 else:

130 alignment_list.append(None)

131 if "description" in vars():#if there's no description

132 sequences.append(sequence.Sequence(name,monomers,description))

133 else:

134 sequences.append(sequence.Sequence(name,monomers))

135 alignment[sequences[-1]]=alignment_list

136 return sequences,alignment