allpy
changeset 143:004c2f6c45ac
project.from_fasta improvements
author | boris <bnagaev@gmail.com> |
---|---|
date | Sun, 24 Oct 2010 19:05:31 +0400 |
parents | a0ff92b78b6a |
children | ea55c27e165f |
files | lib/project.py |
diffstat | 1 files changed, 31 insertions(+), 23 deletions(-) [+] |
line diff
1.1 --- a/lib/project.py Sun Oct 24 18:40:39 2010 +0400
1.2 +++ b/lib/project.py Sun Oct 24 19:05:31 2010 +0400
1.3 @@ -8,6 +8,7 @@
1.4 """
1.5
1.6 import sequence
1.7 +Sequence = sequence.Sequence
1.8 from monomer import AminoAcidType
1.9 import allpy_data
1.10 import os
1.11 @@ -103,40 +104,47 @@
1.12 """
1.13 import re
1.14
1.15 - sequences=[]
1.16 - alignment={}
1.17 + sequences = []
1.18 + alignment = {}
1.19
1.20 - content=file.read()
1.21 - raw_sequences=content.split(">")[1:]#ignore everything before the first >
1.22 + raw_sequences = file.read().split(">")
1.23 + if len(raw_sequences) <= 1:
1.24 + raise "Wrong format of fasta-file %s" % file.name
1.25 +
1.26 + raw_sequences = raw_sequences[1:] #ignore everything before the first >
1.27 for raw in raw_sequences:
1.28 parsed_raw_sequence = raw.split("\n")
1.29 - for counter,piece in enumerate(parsed_raw_sequence):
1.30 - parsed_raw_sequence[counter]=piece.strip()#cut \r or whitespaces
1.31 - name_and_description = parsed_raw_sequence[0]
1.32 - if len(name_and_description.split(" ",1))==2:
1.33 - name, description = name_and_description.split(" ",1)
1.34 - elif len(name_and_description.split(" ", 1)) == 1:#if there is description
1.35 - name = name_and_description
1.36 + parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]
1.37 + name_and_description = parsed_raw_sequence[0]
1.38 + name_and_description = name_and_description.split(" ",1)
1.39 + if len(name_and_description) == 2:
1.40 + name, description = name_and_description
1.41 + elif len(name_and_description) == 1:
1.42 + #if there is description
1.43 + name = name_and_description[0]
1.44 + description = ''
1.45 else:
1.46 raise "Wrong name of sequence in fasta file"
1.47 - string=""
1.48 +
1.49 + if len(parsed_raw_sequence) <= 1:
1.50 + raise "Wrong format of sequence %(name)$ fasta-file %(file)s" % \
1.51 + {'name': name, 'file': file.name}
1.52 + string = ""
1.53 for piece in parsed_raw_sequence[1:]:
1.54 - piece_without_whitespace_chars=re.sub("\s","",piece)
1.55 - string+=piece_without_whitespace_chars
1.56 - monomers=[]#convert into Monomer objects
1.57 - alignment_list=[]#create the respective list in alignment dict
1.58 + piece_without_whitespace_chars = re.sub("\s", "", piece)
1.59 + string += piece_without_whitespace_chars
1.60 + monomers = [] #convert into Monomer objects
1.61 + alignment_list = [] #create the respective list in alignment dict
1.62 for current_monomer in string:
1.63 - if current_monomer!="-" and current_monomer!="." and current_monomer!="~":
1.64 + if current_monomer not in ["-", ".", "~"]:
1.65 monomers.append(monomer_kind.from_code1(current_monomer).instance())
1.66 alignment_list.append(monomers[-1])
1.67 else:
1.68 alignment_list.append(None)
1.69 - if "description" in vars():#if there's no description
1.70 - sequences.append(sequence.Sequence(monomers,name,description))
1.71 - else:
1.72 - sequences.append(sequence.Sequence(monomers,name))
1.73 - alignment[sequences[-1]]=alignment_list
1.74 - return sequences,alignment
1.75 + sequence = Sequence(monomers, name, description)
1.76 + sequences.append(sequence)
1.77 + alignment[sequence] = alignment_list
1.78 + return sequences, alignment
1.79
1.80
1.81 @staticmethod