allpy

changeset 143:004c2f6c45ac
project.from_fasta improvements
author: boris <bnagaev@gmail.com>
date: Sun, 24 Oct 2010 19:05:31 +0400
parents: a0ff92b78b6a
children: ea55c27e165f
files: lib/project.py
diffstat: 1 files changed, 31 insertions(+), 23 deletions(-) [+]
[-]

lib/project.py 54 lib/project.py 54
lib/project.py 54
     1.1 --- a/lib/project.py	Sun Oct 24 18:40:39 2010 +0400
     1.2 +++ b/lib/project.py	Sun Oct 24 19:05:31 2010 +0400
     1.3 @@ -8,6 +8,7 @@
     1.4  """
     1.5  
     1.6  import sequence
     1.7 +Sequence = sequence.Sequence
     1.8  from monomer import AminoAcidType
     1.9  import allpy_data
    1.10  import os
    1.11 @@ -103,40 +104,47 @@
    1.12          """
    1.13          import re
    1.14  
    1.15 -        sequences=[]
    1.16 -        alignment={}
    1.17 +        sequences = []
    1.18 +        alignment = {}
    1.19  
    1.20 -        content=file.read()
    1.21 -        raw_sequences=content.split(">")[1:]#ignore everything before the first >
    1.22 +        raw_sequences = file.read().split(">")
    1.23 +        if len(raw_sequences) <= 1:
    1.24 +            raise "Wrong format of fasta-file %s" % file.name
    1.25 +        
    1.26 +        raw_sequences = raw_sequences[1:] #ignore everything before the first >
    1.27          for raw in raw_sequences:
    1.28              parsed_raw_sequence = raw.split("\n")
    1.29 -            for counter,piece in enumerate(parsed_raw_sequence):
    1.30 -                parsed_raw_sequence[counter]=piece.strip()#cut \r or whitespaces
    1.31 -            name_and_description = parsed_raw_sequence[0] 
    1.32 -            if len(name_and_description.split(" ",1))==2:
    1.33 -                name, description = name_and_description.split(" ",1)
    1.34 -            elif len(name_and_description.split(" ", 1)) == 1:#if there is description
    1.35 -                name = name_and_description
    1.36 +            parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]
    1.37 +            name_and_description = parsed_raw_sequence[0]
    1.38 +            name_and_description = name_and_description.split(" ",1)
    1.39 +            if len(name_and_description) == 2:
    1.40 +                name, description = name_and_description
    1.41 +            elif len(name_and_description) == 1: 
    1.42 +                #if there is no description
    1.43 +                name = name_and_description[0]
    1.44 +                description = ''
    1.45              else:
    1.46                  raise "Wrong name of sequence in fasta file"
    1.47 -            string=""
    1.48 +            
    1.49 +            if len(parsed_raw_sequence) <= 1:
    1.50 +                raise "Wrong format of sequence %(name)$ fasta-file %(file)s" % \
    1.51 +                {'name': name, 'file': file.name}
    1.52 +            string = ""
    1.53              for piece in parsed_raw_sequence[1:]:
    1.54 -                piece_without_whitespace_chars=re.sub("\s","",piece)
    1.55 -                string+=piece_without_whitespace_chars
    1.56 -            monomers=[]#convert into Monomer objects
    1.57 -            alignment_list=[]#create the respective list in alignment dict
    1.58 +                piece_without_whitespace_chars = re.sub("\s", "", piece)
    1.59 +                string += piece_without_whitespace_chars
    1.60 +            monomers = [] #convert into Monomer objects
    1.61 +            alignment_list = [] #create the respective list in alignment dict
    1.62              for current_monomer in string:
    1.63 -                if current_monomer!="-" and current_monomer!="." and current_monomer!="~":
    1.64 +                if current_monomer not in ["-", ".", "~"]:
    1.65                      monomers.append(monomer_kind.from_code1(current_monomer).instance())
    1.66                      alignment_list.append(monomers[-1])
    1.67                  else:
    1.68                      alignment_list.append(None)
    1.69 -            if "description" in vars():#if there's no description
    1.70 -                sequences.append(sequence.Sequence(monomers,name,description))
    1.71 -            else:
    1.72 -                sequences.append(sequence.Sequence(monomers,name))
    1.73 -            alignment[sequences[-1]]=alignment_list
    1.74 -        return sequences,alignment
    1.75 +            sequence = Sequence(monomers, name, description)
    1.76 +            sequences.append(sequence)
    1.77 +            alignment[sequence] = alignment_list
    1.78 +        return sequences, alignment
    1.79      
    1.80      
    1.81      @staticmethod