allpy
diff allpy/fileio.py @ 724:fd531580b9af
Merge between Burkov and others.
author | Boris Burkov <BurkovBA@gmail.com> |
---|---|
date | Fri, 08 Jul 2011 16:20:20 +0400 |
parents | 80043822a41e |
children | d16e8559b6dd |
line diff
1.1 --- a/allpy/fileio.py Fri Jul 08 16:06:59 2011 +0400 1.2 +++ b/allpy/fileio.py Fri Jul 08 16:20:20 2011 +0400 1.3 @@ -3,21 +3,49 @@ 1.4 from tempfile import NamedTemporaryFile 1.5 import util 1.6 1.7 +def get_markups_class(classname): 1.8 + """This ugly helper is to avoid bad untimely import loops.""" 1.9 + import markups 1.10 + return getattr(markups, classname) 1.11 + 1.12 class File(object): 1.13 """Automatical file IO.""" 1.14 - def __new__(cls, file, format="fasta"): 1.15 + def __new__(cls, file, format="fasta", **kw): 1.16 if format == "fasta": 1.17 - return FastaFile(file) 1.18 + return FastaFile(file, **kw) 1.19 + elif format == 'markup': 1.20 + return MarkupFile(file, **kw) 1.21 + elif format.startswith('markup:'): 1.22 + subformat = format.split(':',1)[1] 1.23 + return MarkupFile(file, format=subformat, **kw) 1.24 else: 1.25 - return EmbossFile(file, format) 1.26 + return EmbossFile(file, format, **kw) 1.27 1.28 -class FastaFile(object): 1.29 +class AlignmentFile(object): 1.30 + """Some helpers.""" 1.31 + 1.32 + def __init__(self, file, format='fasta', gaps='-', wrap_column=70): 1.33 + self.file = file 1.34 + self.format = format 1.35 + self.gaps = gaps 1.36 + self.wrap_column = wrap_column 1.37 + 1.38 + def write_alignment(self, alignment): 1.39 + """Append alignment to the file.""" 1.40 + self.write_strings( 1.41 + (row, row.sequence.name, row.sequence.description) 1.42 + for row in alignment.rows_as_strings(self.gaps) 1.43 + ) 1.44 + 1.45 + def read_alignment(self, alignment): 1.46 + """Read alignment from the file.""" 1.47 + append_row = alignment.append_row_from_string 1.48 + for name, description, body in self.read_strings(): 1.49 + append_row(body, name, description, file.name, self.gaps) 1.50 + 1.51 +class FastaFile(AlignmentFile): 1.52 """Fasta parser & writer.""" 1.53 1.54 - def __init__(self, file, wrap_column=70): 1.55 - self.file = file 1.56 - self.wrap_column = wrap_column 1.57 - 1.58 def write_string(self, string, name, description=''): 1.59 """Append one sequence to file.""" 1.60 if description: 1.61 @@ -49,13 +77,141 @@ 1.62 body = util.remove_each(body, " \n\r\t\v") 1.63 yield (name, description, body) 1.64 1.65 -class EmbossFile(object): 1.66 +class MarkupFile(AlignmentFile): 1.67 + """Parser & writer for our own marked alignment file format. 1.68 + 1.69 + Marked alignment file consists of a list of records, separated with one or 1.70 + more empty lines. Each record consists of type name, header and optional 1.71 + contents. Type name is a line, containing just one word, describing the 1.72 + record type. Header is a sequence of lines, each in format `key: value`. 1.73 + Content, if present, is separated from header with an empty line. 1.74 + 1.75 + Type names and header key names are case-insensitive. 1.76 + 1.77 + Known record types now are: 1.78 + 1.79 + - `alignment` -- this must be the last record in file for now 1.80 + - `sequence_markup` 1.81 + - `alignment_markup` 1.82 + 1.83 + Example:: 1.84 + 1.85 + sequence_markup 1.86 + sequence_name: cyb5_mouse 1.87 + sequence_description: 1.88 + name: pdb_residue_number 1.89 + type: SequencePDBResidueNumberMarkup 1.90 + markup: -,12,121,122,123,124,13,14,15,-,-,16 1.91 + 1.92 + alignment_markup 1.93 + name: geometrical_core 1.94 + type: AlignmentGeometricalCoreMarkup 1.95 + markup: -,-,-,-,+,+,+,-,-,-,+,+,-,-,-,- 1.96 + 1.97 + alignment 1.98 + format: fasta 1.99 + 1.100 + > cyb5_mouse 1.101 + seqvencemouse 1.102 + """ 1.103 + 1.104 + _empty_line = '' 1.105 + """Helper attribute for write_empty_line.""" 1.106 + 1.107 + def write_alignment(self, alignment): 1.108 + """Write alignment to file.""" 1.109 + self.write_markups(alignment.markups, 'alignment_markup') 1.110 + for sequence in alignment.sequences: 1.111 + record = { 1.112 + 'sequence_name': sequence.name, 1.113 + 'sequence_description': sequence.description, 1.114 + } 1.115 + self.write_markups(sequence.markups, 'sequence_markup', record) 1.116 + record = {'type': 'alignment', 'format': self.format} 1.117 + self.write_record(record) 1.118 + self.write_empty_line() 1.119 + alignment.to_file(self.file) 1.120 + 1.121 + def write_markups(self, markups, type, pre_record={}): 1.122 + """Write a dictionary of markups as series of records.""" 1.123 + for name, markup in markups.items(): 1.124 + record = markup.to_record() 1.125 + record.update(pre_record) 1.126 + record['type'] = type 1.127 + record['name'] = name 1.128 + record['class'] = markup.__class__.__name__ 1.129 + self.write_record(record) 1.130 + 1.131 + def write_record(self, record): 1.132 + """Write record to file. Add new line before every but first record.""" 1.133 + self.write_empty_line() 1.134 + self.file.write('%s\n' % record['type']) 1.135 + del record['type'] 1.136 + for key, value in record.items(): 1.137 + self.file.write('%s: %s\n' % (key, value)) 1.138 + 1.139 + def write_empty_line(self): 1.140 + """Add empty line every time except the first call.""" 1.141 + self.file.write(self._empty_line) 1.142 + self._empty_line = '\n' 1.143 + 1.144 + def read_alignment(self, alignment): 1.145 + """Read alignment from file.""" 1.146 + for record in list(self.read_records(alignment)): 1.147 + handler = getattr(self, 'add_%s' % record['type']) 1.148 + handler(alignment, record) 1.149 + 1.150 + def add_sequence_markup(self, alignment, record): 1.151 + """Found sequence markup record in file. Do something about it.""" 1.152 + for sequence in alignment.sequences: 1.153 + if sequence.name == record['sequence_name']: 1.154 + description = record.get('sequence_description') 1.155 + if description: 1.156 + assert sequence.description == description 1.157 + cls = get_markups_class(record['class']) 1.158 + cls.from_record(sequence, record, name=record.get('name')) 1.159 + return 1.160 + raise AssertionError("Could not find sequence in alignment") 1.161 + 1.162 + def add_alignment_markup(self, alignment, record): 1.163 + """Found alignment markup record in file. Do something about it.""" 1.164 + cls = get_markups_class(record['class']) 1.165 + cls.from_record(alignment, record, name=record.get('name')) 1.166 + 1.167 + def add_alignment(self, alignment, record): 1.168 + """Found alignment record. It has been handled in read_payload.""" 1.169 + pass 1.170 + 1.171 + def read_records(self, alignment): 1.172 + """Read records and return them as a list of dicts.""" 1.173 + for line in self.file: 1.174 + if line.strip() == "": 1.175 + continue 1.176 + yield self.read_record(alignment, line) 1.177 + 1.178 + def read_record(self, alignment, type): 1.179 + """Read record headers and record payload.""" 1.180 + type = type.strip().lower() 1.181 + record = {'type': type} 1.182 + for line in self.file: 1.183 + if line.strip() == "": 1.184 + self.read_payload(alignment, record, type) 1.185 + return record 1.186 + key, value = line.split(':', 1) 1.187 + key = key.strip().lower() 1.188 + value = value.strip() 1.189 + record[key] = value 1.190 + return record 1.191 + 1.192 + def read_payload(self, alignment, record, type): 1.193 + """Read record payload, if necessary.""" 1.194 + if type == 'alignment': 1.195 + io = File(self.file, record.get('format', 'fasta')) 1.196 + io.read_alignment(alignment) 1.197 + 1.198 +class EmbossFile(AlignmentFile): 1.199 """Parser & writer for file formats supported by EMBOSS.""" 1.200 1.201 - def __init__(self, file, format): 1.202 - self.file = file 1.203 - self.format = format 1.204 - 1.205 def write_strings(self, sequences): 1.206 """Write sequences to file.""" 1.207 pipe = Popen(['seqret', 'stdin', '%s::stdout' % self.format],