allpy

changeset 287:f8bd7c469fcf
Clean reimplementation of allpy.base.Alignment.from_fasta. Also changed the interface of fasta.parse_file. Added allpy.util, which currently has one function, unzip -- the reverse of the zip builtin.
author: Daniil Alexeyevsky <me.dendik@gmail.com>
date: Thu, 16 Dec 2010 01:12:06 +0300
parents: cf6cdc3b7ec5
children: 9c68d8eab8f5
files: allpy/base.py allpy/fasta.py allpy/util.py
diffstat: 3 files changed, 45 insertions(+), 77 deletions(-) [+]
[-]

allpy/base.py 104
allpy/fasta.py 7
allpy/util.py 11
     1.1 --- a/allpy/base.py	Wed Dec 15 23:45:11 2010 +0300
     1.2 +++ b/allpy/base.py	Thu Dec 16 01:12:06 2010 +0300
     1.3 @@ -181,41 +181,47 @@
     1.4          """
     1.5          sequences = fasta.parse_file(file)
     1.6          assert len(sequences) == 1
     1.7 -        header = sequences.keys()[0]
     1.8 -        name, _, description = header.partition(" ")
     1.9 +        name, description = sequences.keys()[0]
    1.10          return cls(sequences[header], name, description, file.name)
    1.11  
    1.12 -class Alignment(dict):
    1.13 +class Alignment(list):
    1.14      """Alignment.
    1.15  
    1.16      Behaves like a list of Columns.
    1.17      """
    1.18 -    # _sequences -- list of Sequence objects. Sequences don't contain gaps
    1.19 -    #  - see sequence.py module
    1.20  
    1.21 -    def __init__(self, *args):
    1.22 -        """overloaded constructor
    1.23 +    sequence_type = Sequence
    1.24 +    """Type of sequences to create in alignment.
    1.25 +    
    1.26 +    SHOULD be redefined when subclassing"""
    1.27  
    1.28 -        Alignment()
    1.29 -            new empty Alignment
    1.30 +    def __init__(self):
    1.31 +        """Initialize empty alignment."""
    1.32 +        super(Alignment, self).__init__()
    1.33  
    1.34 -        Alignment(sequences, body)
    1.35 -            new Alignment with sequences and body initialized from arguments
    1.36 +        self.sequences = []
    1.37 +        """Ordered list of sequences in alignment."""
    1.38  
    1.39 -        Alignment(fasta_file)
    1.40 -            new Alignment, read body and sequences from fasta file
    1.41 -        """
    1.42 -        if len(args)>1:#overloaded constructor
    1.43 -            self.sequences=args[0]
    1.44 -            self.body=args[1]
    1.45 -        elif len(args)==0:
    1.46 -            self.sequences=[]
    1.47 -            self.body={}
    1.48 -        else:
    1.49 -            self.sequences, self.body = Alignment.from_fasta(args[0])
    1.50 +    def add_gapped_line(self, line, name='', description='', source=''):
    1.51 +        """Add row from a line of one-letter codes and gaps."""
    1.52 +        Sequence = cls.sequence_type
    1.53 +        not_gap = lambda (i, char): char != "-"
    1.54 +        no_gaps = line.replace("-", "")
    1.55 +        sequence = Sequence(no_gaps, name, description, source)
    1.56 +        for i, (j, char) in enumerate(filter(not_gap, enumerate(line))):
    1.57 +            self[j][seq] = sequence[i]
    1.58 +        self.sequences.append(sequence)
    1.59 +
    1.60 +    @classmethod
    1.61 +    def from_fasta(cls, file):
    1.62 +        """Create new alignment from FASTA file."""
    1.63 +        self = cls()
    1.64 +        for ((name, description), body) in fasta.parse_file(file):
    1.65 +            self.add_gapped_line(body, name, description)
    1.66 +        return self
    1.67  
    1.68      def length(self):
    1.69 -        """ Returns width, ie length of each sequence with gaps """
    1.70 +        """Return width, ie length of each sequence with gaps."""
    1.71          return max([len(line) for line in self.body.values()])
    1.72  
    1.73      def height(self):
    1.74 @@ -261,58 +267,6 @@
    1.75                  line.append(all_columns[position].get(aa))
    1.76          return self.identity_percentages
    1.77  
    1.78 -    @classmethod
    1.79 -    def from_fasta(file):
    1.80 -        """ Import data from fasta file
    1.81 -
    1.82 -        >>> import alignment
    1.83 -        >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta"))
    1.84 -        """
    1.85 -        import re
    1.86 -
    1.87 -        sequences = []
    1.88 -        body = {}
    1.89 -
    1.90 -        raw_sequences = file.read().split(">")
    1.91 -        if len(raw_sequences) <= 1:
    1.92 -            raise Exception("Wrong format of fasta-file %s" % file.name)
    1.93 -
    1.94 -        raw_sequences = raw_sequences[1:] #ignore everything before the first >
    1.95 -        for raw in raw_sequences:
    1.96 -            parsed_raw_sequence = raw.split("\n")
    1.97 -            parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]
    1.98 -            name_and_description = parsed_raw_sequence[0]
    1.99 -            name_and_description = name_and_description.split(" ",1)
   1.100 -            if len(name_and_description) == 2:
   1.101 -                name, description = name_and_description
   1.102 -            elif len(name_and_description) == 1:
   1.103 -                #if there is description
   1.104 -                name = name_and_description[0]
   1.105 -                description = ''
   1.106 -            else:
   1.107 -                raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \
   1.108 -                {'name': name, 'file': file.name})
   1.109 -
   1.110 -            if len(parsed_raw_sequence) <= 1:
   1.111 -                raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \
   1.112 -                {'name': name, 'file': file.name})
   1.113 -            string = ""
   1.114 -            for piece in parsed_raw_sequence[1:]:
   1.115 -                piece_without_whitespace_chars = re.sub("\s", "", piece)
   1.116 -                string += piece_without_whitespace_chars
   1.117 -            monomers = [] #convert into Monomer objects
   1.118 -            body_list = [] #create the respective list in body dict
   1.119 -            for current_monomer in string:
   1.120 -                if current_monomer not in ["-", ".", "~"]:
   1.121 -                    monomers.append(cls.monomer_type.from_code1(current_monomer))
   1.122 -                    body_list.append(monomers[-1])
   1.123 -                else:
   1.124 -                    body_list.append(None)
   1.125 -            s = sequence.Sequence(monomers, name, description)
   1.126 -            sequences.append(s)
   1.127 -            body[s] = body_list
   1.128 -        return sequences, body
   1.129 -
   1.130      @staticmethod
   1.131      def from_sequences(*sequences):
   1.132          """  Constructs new alignment from sequences

     2.1 --- a/allpy/fasta.py	Wed Dec 15 23:45:11 2010 +0300
     2.2 +++ b/allpy/fasta.py	Thu Dec 16 01:12:06 2010 +0300
     2.3 @@ -1,14 +1,17 @@
     2.4  def parse_fasta(file):
     2.5      """Parse fasta file, remove spaces and newlines from sequence bodies.
     2.6  
     2.7 -    Return a dict of { sequence header: sequence body }.
     2.8 +    Return a dict of { (name, description) : sequence_body }.
     2.9      """
    2.10      sequences = {}
    2.11      for part in file.read().split(">"):
    2.12          header, _, body = part.partition("\n")
    2.13          header = header.lstrip(">").strip()
    2.14 +        name, _, description = header.partition(" ")
    2.15 +        name = name.strip()
    2.16 +        description = description.strip()
    2.17          body = body.replace(" ", "").replace("\n", "")
    2.18 -        sequences[header] = body
    2.19 +        sequences[name, description] = body
    2.20      return sequences
    2.21  
    2.22  def save_fasta(out_file, string, name, description='', long_line=70):

     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/allpy/util.py	Thu Dec 16 01:12:06 2010 +0300
     3.3 @@ -0,0 +1,11 @@
     3.4 +"""Miscellanous utilities.
     3.5 +"""
     3.6 +
     3.7 +def unzip(seq):
     3.8 +    a, b = [], []
     3.9 +    for x, y in seq:
    3.10 +        a.append(x)
    3.11 +        b.append(y)
    3.12 +    return a, b
    3.13 +
    3.14 +# vim: set et ts=4 sts=4 sw=4: