allpy
diff allpy/fileio.py @ 873:691afd37a4b7
Manual merge between Burkov and central
author | Boris Burkov <BurkovBA@gmail.com> |
---|---|
date | Fri, 29 Jul 2011 19:09:22 +0400 |
parents | 6cc007e68af6 |
children |
line diff
1.1 --- a/allpy/fileio.py Fri Jul 29 19:08:21 2011 +0400 1.2 +++ b/allpy/fileio.py Fri Jul 29 19:09:22 2011 +0400 1.3 @@ -3,6 +3,13 @@ 1.4 from tempfile import NamedTemporaryFile 1.5 import util 1.6 1.7 +bio_python = False 1.8 +try: 1.9 + from Bio import Seq, SeqRecord, Align, SeqIO, AlignIO, Alphabet 1.10 + bio_python = True 1.11 +except ImportError: 1.12 + pass 1.13 + 1.14 def get_markups_class(classname): 1.15 """This ugly helper is to avoid bad untimely import loops.""" 1.16 import markups 1.17 @@ -18,6 +25,8 @@ 1.18 elif format.startswith('markup:'): 1.19 subformat = format.split(':',1)[1] 1.20 return MarkupFile(file, format=subformat, **kw) 1.21 + elif bio_python and BioPythonFile.supports(format): 1.22 + return BioPythonFile(file, format, **kw) 1.23 else: 1.24 return EmbossFile(file, format, **kw) 1.25 1.26 @@ -40,8 +49,9 @@ 1.27 def read_alignment(self, alignment): 1.28 """Read alignment from the file.""" 1.29 append_row = alignment.append_row_from_string 1.30 + source = getattr(self.file, 'name', '') 1.31 for name, description, body in self.read_strings(): 1.32 - append_row(body, name, description, file.name, self.gaps) 1.33 + append_row(body, name, description, source, self.gaps) 1.34 1.35 class FastaFile(AlignmentFile): 1.36 """Fasta parser & writer.""" 1.37 @@ -67,8 +77,24 @@ 1.38 for string, name, description in sequences: 1.39 self.write_string(string, name, description) 1.40 1.41 + def read_parts(self): 1.42 + """Read parts beginning with > in FASTA file. 1.43 + 1.44 + This is a drop-in replacement for self.file.read().split("\n>") 1.45 + It is required for markup format, which combines parts read with 1.46 + different parsers. Python prohibits combining iterators and file.read 1.47 + methods on the same file. 1.48 + """ 1.49 + part = None 1.50 + for line in self.file: 1.51 + if line.startswith(">"): 1.52 + if part: yield part 1.53 + part = "" 1.54 + part += line 1.55 + if part: yield part 1.56 + 1.57 def read_strings(self): 1.58 - for part in self.file.read().split("\n>"): 1.59 + for part in self.read_parts(): 1.60 header, _, body = part.partition("\n") 1.61 header = header.lstrip(">") 1.62 name, _, description = header.partition(" ") 1.63 @@ -86,7 +112,8 @@ 1.64 record type. Header is a sequence of lines, each in format `key: value`. 1.65 Content, if present, is separated from header with an empty line. 1.66 1.67 - Type names and header key names are case-insensitive. 1.68 + Type names and header key names are case-insensitive and '-' and '_' in 1.69 + them are equivalent. 1.70 1.71 Known record types now are: 1.72 1.73 @@ -96,14 +123,14 @@ 1.74 1.75 Example:: 1.76 1.77 - sequence_markup 1.78 - sequence_name: cyb5_mouse 1.79 - sequence_description: 1.80 + sequence-markup 1.81 + sequence-name: cyb5_mouse 1.82 + sequence-description: 1.83 name: pdb_residue_number 1.84 type: SequencePDBResidueNumberMarkup 1.85 markup: -,12,121,122,123,124,13,14,15,-,-,16 1.86 1.87 - alignment_markup 1.88 + alignment-markup 1.89 name: geometrical_core 1.90 type: AlignmentGeometricalCoreMarkup 1.91 markup: -,-,-,-,+,+,+,-,-,-,+,+,-,-,-,- 1.92 @@ -130,11 +157,13 @@ 1.93 record = {'type': 'alignment', 'format': self.format} 1.94 self.write_record(record) 1.95 self.write_empty_line() 1.96 - alignment.to_file(self.file) 1.97 + alignment.to_file(self.file, format=self.format, gap=self.gaps) 1.98 1.99 def write_markups(self, markups, type, pre_record={}): 1.100 """Write a dictionary of markups as series of records.""" 1.101 for name, markup in markups.items(): 1.102 + if not markup.save: 1.103 + continue 1.104 record = markup.to_record() 1.105 record.update(pre_record) 1.106 record['type'] = type 1.107 @@ -145,9 +174,10 @@ 1.108 def write_record(self, record): 1.109 """Write record to file. Add new line before every but first record.""" 1.110 self.write_empty_line() 1.111 - self.file.write('%s\n' % record['type']) 1.112 + self.file.write('%s\n' % self.normalize('write', record['type'])) 1.113 del record['type'] 1.114 for key, value in record.items(): 1.115 + key = self.normalize('write', key) 1.116 self.file.write('%s: %s\n' % (key, value)) 1.117 1.118 def write_empty_line(self): 1.119 @@ -191,14 +221,14 @@ 1.120 1.121 def read_record(self, alignment, type): 1.122 """Read record headers and record payload.""" 1.123 - type = type.strip().lower() 1.124 + type = self.normalize('read', type) 1.125 record = {'type': type} 1.126 for line in self.file: 1.127 if line.strip() == "": 1.128 self.read_payload(alignment, record, type) 1.129 return record 1.130 key, value = line.split(':', 1) 1.131 - key = key.strip().lower() 1.132 + key = self.normalize('read', key) 1.133 value = value.strip() 1.134 record[key] = value 1.135 return record 1.136 @@ -206,9 +236,44 @@ 1.137 def read_payload(self, alignment, record, type): 1.138 """Read record payload, if necessary.""" 1.139 if type == 'alignment': 1.140 - io = File(self.file, record.get('format', 'fasta')) 1.141 + io = File(self.file, record.get('format', 'fasta'), gaps=self.gaps) 1.142 io.read_alignment(alignment) 1.143 1.144 + @staticmethod 1.145 + def normalize(for_what, string): 1.146 + if for_what == 'read': 1.147 + return string.strip().replace('-', '_').lower() 1.148 + if for_what == 'write': 1.149 + return string.strip().replace('_', '-').capitalize() 1.150 + 1.151 +class BioPythonFile(AlignmentFile): 1.152 + """Parser & writer for file formats supporte by Bio python.""" 1.153 + 1.154 + @staticmethod 1.155 + def supports(format): 1.156 + """Tell what formats this method supports.""" 1.157 + return ( 1.158 + format in AlignIO._FormatToWriter 1.159 + or format in SeqIO._FormatToWriter 1.160 + ) 1.161 + 1.162 + def write_strings(self, sequences): 1.163 + """Write sequences to file.""" 1.164 + aln = Align.MultipleSeqAlignment([ 1.165 + SeqRecord.SeqRecord( 1.166 + Seq.Seq(body, Alphabet.single_letter_alphabet), 1.167 + id=name, 1.168 + description=description 1.169 + ) 1.170 + for body, name, description in sequences 1.171 + ]) 1.172 + AlignIO.write(aln, self.file, self.format) 1.173 + 1.174 + def read_strings(self): 1.175 + """Read sequences from file.""" 1.176 + for seq in AlignIO.read(self.file, self.format): 1.177 + yield seq.id, seq.description, str(seq.seq) 1.178 + 1.179 class EmbossFile(AlignmentFile): 1.180 """Parser & writer for file formats supported by EMBOSS.""" 1.181