allpy: 76816df24fe2 lib/project.py

allpy

view lib/project.py @ 5:76816df24fe2

dummy_pytale.py: file added Can do: display an alignment, colour by identity. Features: Both sequence names and sequences are TextCtrl. Extremely slow.

author	grishin@gorilla
date	Wed, 09 Jun 2010 14:46:02 +0400
parents	4ed6440f4f70
children	a185e7e255b3

line source

1 #!/usr/bin/python

3 """

4 "I will not use abbrev."

5 "I will always finish what I st"

6 - Bart Simpson

8 """

10 import sequence

11 import monomer

12 import allpy_data

class Project(object):
    """
    A set of sequences together with their (gapped) alignment.

    Mandatory data:

    * sequences -- list of Sequence objects. Sequences don't contain gaps
      - see sequence.py module
    * alignment -- dict
      {<Sequence object>:[<Monomer object>,None,<Monomer object>]}
      keys are the Sequence objects, values are the lists, which
      contain monomers of those sequences or None for gaps in the
      corresponding sequence of alignment

    Optional data:

    * identity_percentages -- dict shaped like alignment, set by
      calc_identity()
    """

    # Characters that get_from_fasta() reads as gaps (stored as None).
    _GAP_SYMBOLS = ("-", ".", "~")

    def __init__(self, *args):
        """overloaded constructor

        Project() -> new empty Project
        Project(sequences, alignment) -> new Project with sequences and
            alignment initialized from arguments
        Project(fasta_file) -> new Project, read alignment and sequences
            from fasta file

        Arguments after the second one are ignored.
        """
        if len(args) > 1:
            self.sequences = args[0]
            self.alignment = args[1]
        elif len(args) == 0:
            self.sequences = []
            self.alignment = {}
        else:
            self.sequences, self.alignment = Project.get_from_fasta(args[0])

    def __len__(self):
        """Return the alignment length, i.e. the length of its longest row.

        An alignment without rows has length 0, so len() works on the
        empty Project created by Project().
        """
        row_lengths = [len(line) for line in self.alignment.values()]
        if not row_lengths:
            # max() of an empty list raises ValueError.
            return 0
        return max(row_lengths)

    def calc_identity(self):
        """ Calculate the identity of alignment positions for colouring.

        For every (row, column) in alignment the percentage of the exactly
        same residue in the same column in the alignment is calculated.
        The data structure is just like the Project.alignment, but instead
        of monomers it contains float percentages (fractions of
        len(self.sequences)). Gaps, and codes that are not listed in
        allpy_data.amino_acids, are mapped to None.

        The result is stored in self.identity_percentages and returned.
        A Project without sequences yields an empty dict. Rows shorter
        than the longest row simply produce shorter result rows.

        Raises KeyError if a sequence has no row in self.alignment.
        """
        self.identity_percentages = {}
        if not self.sequences:
            # Nothing to colour; also avoids dividing by zero below.
            return self.identity_percentages

        # First, calculate percentages of amino acids in every column.
        # Every occurrence of a residue adds the same share to its column.
        contribution = 1.0 / len(self.sequences)
        all_columns = [{} for _ in range(len(self))]
        for row in self.alignment.values():
            for position, current_monomer in enumerate(row):
                aa = Project._monomer_code(current_monomer)
                if aa in allpy_data.amino_acids:
                    column_percentage = all_columns[position]
                    column_percentage[aa] = (
                        column_percentage.get(aa, 0.0) + contribution)

        # Second, map these percentages onto the alignment, row by row.
        for seq in self.sequences:
            self.identity_percentages[seq] = [
                all_columns[position].get(
                    Project._monomer_code(current_monomer))
                for position, current_monomer
                in enumerate(self.alignment[seq])]
        return self.identity_percentages

    @staticmethod
    def _monomer_code(current_monomer):
        """Return the code of an alignment cell, or None for a gap."""
        if current_monomer is None:
            return None
        return current_monomer.code

    @staticmethod
    def get_from_fasta(file):
        """Read sequences and their alignment from an open fasta file.

        file -- object with a read() method returning the whole fasta text

        Returns the tuple (sequences, alignment) in the format described
        in the class docstring. Everything before the first ">" is
        ignored. The header line of a record is split at the first space
        into name and description; the description is None when the
        header contains no space. The characters "-", "." and "~" are
        gaps: they become None in the alignment row and are left out of
        the sequence itself.

        >>> import project
        >>> sequences,alignment=project.Project.get_from_fasta(open("test.fasta"))
        """
        import re

        sequences = []
        alignment = {}

        content = file.read()
        # ignore everything before the first >
        raw_sequences = content.split(">")[1:]
        for raw in raw_sequences:
            # strip() cuts \r or whitespaces around every line
            lines = [piece.strip() for piece in raw.split("\n")]

            # The description is reset for every record, so that a record
            # without one cannot inherit it from the previous record.
            header = lines[0].split(" ", 1)
            name = header[0]
            if len(header) == 2:
                description = header[1]
            else:
                description = None

            residues = re.sub(r"\s", "", "".join(lines[1:]))

            monomers = []  # convert into Monomer objects
            alignment_list = []  # the respective list in alignment dict
            for symbol in residues:
                if symbol in Project._GAP_SYMBOLS:
                    alignment_list.append(None)
                else:
                    # The same Monomer object is shared by the sequence
                    # and by its alignment row.
                    current_monomer = monomer.Monomer(symbol)
                    monomers.append(current_monomer)
                    alignment_list.append(current_monomer)

            new_sequence = sequence.Sequence(name, description, monomers)
            sequences.append(new_sequence)
            alignment[new_sequence] = alignment_list
        return sequences, alignment