allpy

diff allpy/fileio.py @ 711:21cfc7897a8f
Implemented markup fileIO (closes #56). This is done by adding file format 'markup' or 'markup:formatname', where 'formatname' is an otherwise known alignment format. The file format is described briefly in the fileio.MarkupFile docstring. This commit also contains an example of defining a Markup saving mixin, markups.IntMarkupMixin, and a test for it.
author: Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date: Thu, 07 Jul 2011 22:32:21 +0400
parents: 80043822a41e
children: d16e8559b6dd
     1.1 --- a/allpy/fileio.py	Thu Jul 07 22:27:14 2011 +0400
     1.2 +++ b/allpy/fileio.py	Thu Jul 07 22:32:21 2011 +0400
     1.3 @@ -3,11 +3,21 @@
     1.4  from tempfile import NamedTemporaryFile
     1.5  import util
     1.6  
     1.7 +def get_markups_class(classname):
     1.8 +    """This ugly helper is to avoid bad untimely import loops."""
     1.9 +    import markups
    1.10 +    return getattr(markups, classname)
    1.11 +
    1.12  class File(object):
    1.13      """Automatical file IO."""
    def __new__(cls, file, format="fasta", **kw):
        """Create the file IO object that matches `format`.

        - "fasta" gives a FastaFile;
        - "markup" gives a MarkupFile with its default alignment format;
        - "markup:<subformat>" gives a MarkupFile whose alignment part is
          stored in <subformat>;
        - anything else is handed over to EmbossFile.

        Extra keyword arguments are passed on to the chosen class unchanged.
        """
        if format == "fasta":
            return FastaFile(file, **kw)
        if format == 'markup':
            return MarkupFile(file, **kw)
        # Everything after the first colon is the name of the nested format.
        prefix, colon, subformat = format.partition(':')
        if prefix == 'markup' and colon:
            return MarkupFile(file, format=subformat, **kw)
        return EmbossFile(file, format, **kw)
    1.24  
    1.25 @@ -67,6 +77,138 @@
    1.26              body = util.remove_each(body, " \n\r\t\v")
    1.27              yield (name, description, body)
    1.28  
    1.29 +class MarkupFile(AlignmentFile):
    1.30 +    """Parser & writer for our own marked alignment file format.
    1.31 +
    1.32 +    Marked alignment file consists of a list of records, separated with one or
    1.33 +    more empty lines. Each record consists of type name, header and optional
    1.34 +    contents. Type name is a line, containing just one word, describing the
    1.35 +    record type. Header is a sequence of lines, each in format `key: value`.
    1.36 +    Content, if present, is separated from header with an empty line.
    1.37 +
    1.38 +    Type names and header key names are case-insensitive.
    1.39 +
    1.40 +    Known record types now are:
    1.41 +
    1.42 +    - `alignment` -- this must be the last record in file for now
    1.43 +    - `sequence_markup`
    1.44 +    - `alignment_markup`
    1.45 +
    1.46 +    Example::
    1.47 +
    1.48 +        sequence_markup
    1.49 +        sequence_name: cyb5_mouse
    1.50 +        sequence_description:
    1.51 +        name: pdb_residue_number
    1.52 +        type: SequencePDBResidueNumberMarkup
    1.53 +        markup: -,12,121,122,123,124,13,14,15,-,-,16
    1.54 +
    1.55 +        alignment_markup
    1.56 +        name: geometrical_core
    1.57 +        type: AlignmentGeometricalCoreMarkup
    1.58 +        markup: -,-,-,-,+,+,+,-,-,-,+,+,-,-,-,-
    1.59 +
    1.60 +        alignment
    1.61 +        format: fasta
    1.62 +
    1.63 +        > cyb5_mouse
    1.64 +        seqvencemouse
    1.65 +    """
    1.66 +
    1.67 +    _empty_line = ''
    1.68 +    """Helper attribute for write_empty_line."""
    1.69 +
    1.70 +    def write_alignment(self, alignment):
    1.71 +        """Write alignment to file."""
    1.72 +        self.write_markups(alignment.markups, 'alignment_markup')
    1.73 +        for sequence in alignment.sequences:
    1.74 +            record = {
    1.75 +                'sequence_name': sequence.name,
    1.76 +                'sequence_description': sequence.description,
    1.77 +            }
    1.78 +            self.write_markups(sequence.markups, 'sequence_markup', record)
    1.79 +        record = {'type': 'alignment', 'format': self.format}
    1.80 +        self.write_record(record)
    1.81 +        self.write_empty_line()
    1.82 +        alignment.to_file(self.file)
    1.83 +
    1.84 +    def write_markups(self, markups, type, pre_record={}):
    1.85 +        """Write a dictionary of markups as series of records."""
    1.86 +        for name, markup in markups.items():
    1.87 +            record = markup.to_record()
    1.88 +            record.update(pre_record)
    1.89 +            record['type'] = type
    1.90 +            record['name'] = name
    1.91 +            record['class'] = markup.__class__.__name__
    1.92 +            self.write_record(record)
    1.93 +
    1.94 +    def write_record(self, record):
    1.95 +        """Write record to file. Add new line before every but first record."""
    1.96 +        self.write_empty_line()
    1.97 +        self.file.write('%s\n' % record['type'])
    1.98 +        del record['type']
    1.99 +        for key, value in record.items():
   1.100 +            self.file.write('%s: %s\n' % (key, value))
   1.101 +
   1.102 +    def write_empty_line(self):
   1.103 +        """Add empty line every time except the first call."""
   1.104 +        self.file.write(self._empty_line)
   1.105 +        self._empty_line = '\n'
   1.106 +
   1.107 +    def read_alignment(self, alignment):
   1.108 +        """Read alignment from file."""
   1.109 +        for record in list(self.read_records(alignment)):
   1.110 +            handler = getattr(self, 'add_%s' % record['type'])
   1.111 +            handler(alignment, record)
   1.112 +
   1.113 +    def add_sequence_markup(self, alignment, record):
   1.114 +        """Found sequence markup record in file. Do something about it."""
   1.115 +        for sequence in alignment.sequences:
   1.116 +            if sequence.name == record['sequence_name']:
   1.117 +                description = record.get('sequence_description')
   1.118 +                if description:
   1.119 +                    assert sequence.description == description
   1.120 +                cls = get_markups_class(record['class'])
   1.121 +                cls.from_record(sequence, record, name=record.get('name'))
   1.122 +                return
   1.123 +        raise AssertionError("Could not find sequence in alignment")
   1.124 +
   1.125 +    def add_alignment_markup(self, alignment, record):
   1.126 +        """Found alignment markup record in file. Do something about it."""
   1.127 +        cls = get_markups_class(record['class'])
   1.128 +        cls.from_record(alignment, record, name=record.get('name'))
   1.129 +
   1.130 +    def add_alignment(self, alignment, record):
   1.131 +        """Found alignment record. It has been handled in read_payload."""
   1.132 +        pass
   1.133 +
   1.134 +    def read_records(self, alignment):
   1.135 +        """Read records and return them as a list of dicts."""
   1.136 +        for line in self.file:
   1.137 +            if line.strip() == "":
   1.138 +                continue
   1.139 +            yield self.read_record(alignment, line)
   1.140 +
   1.141 +    def read_record(self, alignment, type):
   1.142 +        """Read record headers and record payload."""
   1.143 +        type = type.strip().lower()
   1.144 +        record = {'type': type}
   1.145 +        for line in self.file:
   1.146 +            if line.strip() == "":
   1.147 +                self.read_payload(alignment, record, type)
   1.148 +                return record
   1.149 +            key, value = line.split(':', 1)
   1.150 +            key = key.strip().lower()
   1.151 +            value = value.strip()
   1.152 +            record[key] = value
   1.153 +        return record
   1.154 +
   1.155 +    def read_payload(self, alignment, record, type):
   1.156 +        """Read record payload, if necessary."""
   1.157 +        if type == 'alignment':
   1.158 +            io = File(self.file, record.get('format', 'fasta'))
   1.159 +            io.read_alignment(alignment)
   1.160 +
   1.161  class EmbossFile(AlignmentFile):
   1.162      """Parser & writer for file formats supported by EMBOSS."""
   1.163
author	Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date	Thu, 07 Jul 2011 22:32:21 +0400
parents	80043822a41e
children	d16e8559b6dd