allpy

diff allpy/fileio.py @ 873:691afd37a4b7
Manual merge between Burkov and central
author: Boris Burkov <BurkovBA@gmail.com>
date: Fri, 29 Jul 2011 19:09:22 +0400
parents: 6cc007e68af6
     1.1 --- a/allpy/fileio.py	Fri Jul 29 19:08:21 2011 +0400
     1.2 +++ b/allpy/fileio.py	Fri Jul 29 19:09:22 2011 +0400
     1.3 @@ -3,6 +3,13 @@
     1.4  from tempfile import NamedTemporaryFile
     1.5  import util
     1.6  
     1.7 +bio_python = False
     1.8 +try:
     1.9 +    from Bio import Seq, SeqRecord, Align, SeqIO, AlignIO, Alphabet
    1.10 +    bio_python = True
    1.11 +except ImportError:
    1.12 +    pass
    1.13 +
    1.14  def get_markups_class(classname):
    1.15      """This ugly helper is to avoid bad untimely import loops."""
    1.16      import markups
    1.17 @@ -18,6 +25,8 @@
    1.18          elif format.startswith('markup:'):
    1.19              subformat = format.split(':',1)[1]
    1.20              return MarkupFile(file, format=subformat, **kw)
    1.21 +        elif bio_python and BioPythonFile.supports(format):
    1.22 +            return BioPythonFile(file, format, **kw)
    1.23          else:
    1.24              return EmbossFile(file, format, **kw)
    1.25  
    1.26 @@ -40,8 +49,9 @@
    1.27      def read_alignment(self, alignment):
    1.28          """Read alignment from the file."""
    1.29          append_row = alignment.append_row_from_string
    1.30 +        source = getattr(self.file, 'name', '')
    1.31          for name, description, body in self.read_strings():
    1.32 -            append_row(body, name, description, file.name, self.gaps)
    1.33 +            append_row(body, name, description, source, self.gaps)
    1.34  
    1.35  class FastaFile(AlignmentFile):
    1.36      """Fasta parser & writer."""
    1.37 @@ -67,8 +77,24 @@
    1.38          for string, name, description in sequences:
    1.39              self.write_string(string, name, description)
    1.40  
    1.41 +    def read_parts(self):
    1.42 +        """Read parts beginning with > in FASTA file.
    1.43 +
    1.44 +        This is a drop-in replacement for self.file.read().split("\n>")
    1.45 +        It is required for markup format, which combines parts read with
    1.46 +        different parsers. Python prohibits combining iterators and file.read
    1.47 +        methods on the same file.
    1.48 +        """
    1.49 +        part = None
    1.50 +        for line in self.file:
    1.51 +            if line.startswith(">"):
    1.52 +                if part: yield part
    1.53 +                part = ""
    1.54 +            part += line
    1.55 +        if part: yield part
    1.56 +
    1.57      def read_strings(self):
    1.58 -        for part in self.file.read().split("\n>"):
    1.59 +        for part in self.read_parts():
    1.60              header, _, body = part.partition("\n")
    1.61              header = header.lstrip(">")
    1.62              name, _, description = header.partition(" ")
    1.63 @@ -86,7 +112,8 @@
    1.64      record type. Header is a sequence of lines, each in format `key: value`.
    1.65      Content, if present, is separated from header with an empty line.
    1.66  
    1.67 -    Type names and header key names are case-insensitive.
    1.68 +    Type names and header key names are case-insensitive and '-' and '_' in
    1.69 +    them are equivalent.
    1.70  
    1.71      Known record types now are:
    1.72  
    1.73 @@ -96,14 +123,14 @@
    1.74  
    1.75      Example::
    1.76  
    1.77 -        sequence_markup
    1.78 -        sequence_name: cyb5_mouse
    1.79 -        sequence_description:
    1.80 +        sequence-markup
    1.81 +        sequence-name: cyb5_mouse
    1.82 +        sequence-description:
    1.83          name: pdb_residue_number
    1.84          type: SequencePDBResidueNumberMarkup
    1.85          markup: -,12,121,122,123,124,13,14,15,-,-,16
    1.86  
    1.87 -        alignment_markup
    1.88 +        alignment-markup
    1.89          name: geometrical_core
    1.90          type: AlignmentGeometricalCoreMarkup
    1.91          markup: -,-,-,-,+,+,+,-,-,-,+,+,-,-,-,-
    1.92 @@ -130,11 +157,13 @@
    1.93          record = {'type': 'alignment', 'format': self.format}
    1.94          self.write_record(record)
    1.95          self.write_empty_line()
    1.96 -        alignment.to_file(self.file)
    1.97 +        alignment.to_file(self.file, format=self.format, gap=self.gaps)
    1.98  
    1.99      def write_markups(self, markups, type, pre_record={}):
   1.100          """Write a dictionary of markups as series of records."""
   1.101          for name, markup in markups.items():
   1.102 +            if not markup.save:
   1.103 +                continue
   1.104              record = markup.to_record()
   1.105              record.update(pre_record)
   1.106              record['type'] = type
   1.107 @@ -145,9 +174,10 @@
   1.108      def write_record(self, record):
   1.109          """Write record to file. Add a new line before every record but the first."""
   1.110          self.write_empty_line()
   1.111 -        self.file.write('%s\n' % record['type'])
   1.112 +        self.file.write('%s\n' % self.normalize('write', record['type']))
   1.113          del record['type']
   1.114          for key, value in record.items():
   1.115 +            key = self.normalize('write', key)
   1.116              self.file.write('%s: %s\n' % (key, value))
   1.117  
   1.118      def write_empty_line(self):
   1.119 @@ -191,14 +221,14 @@
   1.120  
   1.121      def read_record(self, alignment, type):
   1.122          """Read record headers and record payload."""
   1.123 -        type = type.strip().lower()
   1.124 +        type = self.normalize('read', type)
   1.125          record = {'type': type}
   1.126          for line in self.file:
   1.127              if line.strip() == "":
   1.128                  self.read_payload(alignment, record, type)
   1.129                  return record
   1.130              key, value = line.split(':', 1)
   1.131 -            key = key.strip().lower()
   1.132 +            key = self.normalize('read', key)
   1.133              value = value.strip()
   1.134              record[key] = value
   1.135          return record
   1.136 @@ -206,9 +236,44 @@
   1.137      def read_payload(self, alignment, record, type):
   1.138          """Read record payload, if necessary."""
   1.139          if type == 'alignment':
   1.140 -            io = File(self.file, record.get('format', 'fasta'))
   1.141 +            io = File(self.file, record.get('format', 'fasta'), gaps=self.gaps)
   1.142              io.read_alignment(alignment)
   1.143  
   1.144 +    @staticmethod
   1.145 +    def normalize(for_what, string):
   1.146 +        if for_what == 'read':
   1.147 +            return string.strip().replace('-', '_').lower()
   1.148 +        if for_what == 'write':
   1.149 +            return string.strip().replace('_', '-').capitalize()
   1.150 +
   1.151 +class BioPythonFile(AlignmentFile):
   1.152 +    """Parser & writer for file formats supported by Bio python."""
   1.153 +
   1.154 +    @staticmethod
   1.155 +    def supports(format):
   1.156 +        """Tell whether the given format is supported by this class."""
   1.157 +        return (
   1.158 +            format in AlignIO._FormatToWriter
   1.159 +            or format in SeqIO._FormatToWriter
   1.160 +        )
   1.161 +
   1.162 +    def write_strings(self, sequences):
   1.163 +        """Write sequences to file."""
   1.164 +        aln = Align.MultipleSeqAlignment([
   1.165 +            SeqRecord.SeqRecord(
   1.166 +                Seq.Seq(body, Alphabet.single_letter_alphabet),
   1.167 +                id=name,
   1.168 +                description=description
   1.169 +            )
   1.170 +            for body, name, description in sequences
   1.171 +        ])
   1.172 +        AlignIO.write(aln, self.file, self.format)
   1.173 +
   1.174 +    def read_strings(self):
   1.175 +        """Read sequences from file."""
   1.176 +        for seq in AlignIO.read(self.file, self.format):
   1.177 +            yield seq.id, seq.description, str(seq.seq)
   1.178 +
   1.179  class EmbossFile(AlignmentFile):
   1.180      """Parser & writer for file formats supported by EMBOSS."""
   1.181
author	Boris Burkov <BurkovBA@gmail.com>
date	Fri, 29 Jul 2011 19:09:22 +0400
parents	6cc007e68af6
children