allpy
changeset 770:c3532d119351
Automated merge with ssh://kodomo/allpy
author | Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru> |
---|---|
date | Tue, 12 Jul 2011 17:12:05 +0400 |
parents | 5f0c15a3cbca a90a3c659e69 |
children | 4ed26f8afeac 0a4572dd44a4 |
files | |
diffstat | 4 files changed, 59 insertions(+), 20 deletions(-) [+] |
line diff
1.1 --- a/allpy/fileio.py Tue Jul 12 16:05:28 2011 +0400 1.2 +++ b/allpy/fileio.py Tue Jul 12 17:12:05 2011 +0400 1.3 @@ -67,8 +67,24 @@ 1.4 for string, name, description in sequences: 1.5 self.write_string(string, name, description) 1.6 1.7 + def read_parts(self): 1.8 + """Read parts beginning with > in FASTA file. 1.9 + 1.10 + This is a drop-in replacement for self.file.read().split("\n>") 1.11 + It is required for markup format, which combines parts read with 1.12 + different parsers. Python prohibits combining iterators and file.read 1.13 + methods on the same file. 1.14 + """ 1.15 + part = None 1.16 + for line in self.file: 1.17 + if line.startswith(">"): 1.18 + if part: yield part 1.19 + part = "" 1.20 + part += line 1.21 + if part: yield part 1.22 + 1.23 def read_strings(self): 1.24 - for part in self.file.read().split("\n>"): 1.25 + for part in self.read_parts(): 1.26 header, _, body = part.partition("\n") 1.27 header = header.lstrip(">") 1.28 name, _, description = header.partition(" ") 1.29 @@ -130,7 +146,7 @@ 1.30 record = {'type': 'alignment', 'format': self.format} 1.31 self.write_record(record) 1.32 self.write_empty_line() 1.33 - alignment.to_file(self.file) 1.34 + alignment.to_file(self.file, format=self.format, gap=self.gaps) 1.35 1.36 def write_markups(self, markups, type, pre_record={}): 1.37 """Write a dictionary of markups as series of records.""" 1.38 @@ -206,7 +222,7 @@ 1.39 def read_payload(self, alignment, record, type): 1.40 """Read record payload, if necessary.""" 1.41 if type == 'alignment': 1.42 - io = File(self.file, record.get('format', 'fasta')) 1.43 + io = File(self.file, record.get('format', 'fasta'), gaps=self.gaps) 1.44 io.read_alignment(alignment) 1.45 1.46 class EmbossFile(AlignmentFile):
2.1 --- a/allpy/processors.py Tue Jul 12 16:05:28 2011 +0400 2.2 +++ b/allpy/processors.py Tue Jul 12 17:12:05 2011 +0400 2.3 @@ -36,14 +36,17 @@ 2.4 # 2.5 2.6 class _Muscle(ExternalCommand): 2.7 - """Realign block with muscle.""" 2.8 + """Realign block with muscle. 2.9 + 2.10 + Arguments: 2.11 + 2.12 + - remove_gaps -- drop all gaps from alignment before realigning 2.13 + """ 2.14 2.15 def __init__(self, remove_gaps=False): 2.16 - """ 2.17 - remove_gaps -- drop all gaps from alignment before treating it with muscle 2.18 - """ 2.19 self.remove_gaps = remove_gaps 2.20 - ExternalCommand.__init__(self, 'muscle -stable -in %(infile)s -out %(outfile)s') 2.21 + cmd = 'muscle -stable -in %(infile)s -out %(outfile)s >/dev/null 2>&1' 2.22 + ExternalCommand.__init__(self, cmd) 2.23 2.24 def __call__(self, block): 2.25 if self.remove_gaps:
3.1 --- a/test/test_markups.py Tue Jul 12 16:05:28 2011 +0400 3.2 +++ b/test/test_markups.py Tue Jul 12 17:12:05 2011 +0400 3.3 @@ -69,15 +69,21 @@ 3.4 3.5 file = StringIO() 3.6 aln.to_file(file, format='markup') 3.7 - import sys 3.8 + 3.9 file.seek(0) 3.10 - 3.11 out = protein.Alignment().append_file(file, format='markup') 3.12 s = out.sequences[0] 3.13 c = out.columns 3.14 - print out.markups 3.15 - print s.markups 3.16 - print out.sequences[1].markups 3.17 + assert s[5].m1 == 5 and s[3].m1 == 3 3.18 + assert out.markups['m2'][c[5]] == 5 and out.markups['m2'][c[6]] == 6 3.19 + 3.20 + file.seek(0) 3.21 + file.write(file.getvalue().replace('Q-V', 'Q!V').replace('E-Q', 'E+Q')) 3.22 + assert '!' in file.getvalue() and '+' in file.getvalue() 3.23 + file.seek(0) 3.24 + out = protein.Alignment().append_file(file, format='markup', gaps='-+!') 3.25 + s = out.sequences[0] 3.26 + c = out.columns 3.27 assert s[5].m1 == 5 and s[3].m1 == 3 3.28 assert out.markups['m2'][c[5]] == 5 and out.markups['m2'][c[6]] == 6 3.29
4.1 --- a/utils/extract_pfam.py Tue Jul 12 16:05:28 2011 +0400 4.2 +++ b/utils/extract_pfam.py Tue Jul 12 17:12:05 2011 +0400 4.3 @@ -13,7 +13,9 @@ 4.4 options.min_depth = 2 4.5 options.min_width = 20 4.6 options.infile_name = '/srv/databases/pfam/Pfam-A.full.gz' 4.7 -options.outdir_name = '/srv/databases/pfam/test' 4.8 +options.outdir_name = '/srv/databases/pfam/Pfam-with-PDB' 4.9 + 4.10 +download_pdb = structure.cached_download_pdb 4.11 4.12 types = Struct() 4.13 class Monomer(protein.Monomer): 4.14 @@ -43,12 +45,12 @@ 4.15 def add_pdb(self): 4.16 match = structure.pdb_id_parse(self.sequence.name) 4.17 code, model , chain = match['code'], match['model'], match['chain'] 4.18 - pdb_file = structure.download_pdb(code) 4.19 + pdb_file = download_pdb(code) 4.20 pdb_structure = structure.get_structure(pdb_file, self.sequence.name) 4.21 - self.sequence.pdb_chain = pdb_chain = pdb_structure[0][pdb_chain] 4.22 + self.sequence.pdb_chain = pdb_chain = pdb_structure[0][chain] 4.23 for monomer in self.sequence: 4.24 if monomer in self: 4.25 - monomer.pdb_residue = pdb_chain[' ', self.pdb_resi, ' '] 4.26 + monomer.pdb_residue = pdb_chain[' ', monomer.pdb_resi, ' '] 4.27 4.28 re_range = re.compile(r'(-?\d+)-(-?\d+)') 4.29 def parse_dbxref(dbxref): 4.30 @@ -56,6 +58,7 @@ 4.31 assert db == "PDB", "DBxref not for PDB" 4.32 assert null.strip() == "", "Too many fields in GF DB PDB" 4.33 result = Struct() 4.34 + assert len(pdb_chain.split()) == 2 4.35 result.pdb_id, result.chain = pdb_chain.split() 4.36 range = re_range.match(range) 4.37 result.begin = int(range.group(1)) 4.38 @@ -83,8 +86,7 @@ 4.39 seq.__class__ = Sequence 4.40 begin, end = (' ', code.begin, ' '), (' ', code.end, ' ') 4.41 try: 4.42 - seq.auto_pdb(resi_begin=begin, resi_end=end, 4.43 - pdb_getter=structure.cached_download_pdb) 4.44 + seq.auto_pdb(resi_begin=begin, resi_end=end, pdb_getter=download_pdb) 4.45 except AssertionError: 4.46 raise 4.47 except Exception: 4.48 @@ -111,9 +113,21 @@ 4.49 for seq in aln.sequences: 4.50 remove_trailing_monomers(seq, aln.columns) 4.51 remove_trailing_monomers(seq, reversed(aln.columns)) 4.52 - for monomer in seq: 4.53 + 4.54 + # XXX [ This is extremely nasty hack ] XXX 4.55 + # We replace monomer's class with Gap, so that it is written as . 4.56 + # We remove the monomer from the sequence, BUT NOT FROM ALIGNMENT! 4.57 + # And we add some junk to monomers (SequenceIndexMarkup), 4.58 + # which we keep laying around there, but remove from markups, so 4.59 + # that is does not go to file 4.60 + # In the end it works like we want, but 4.61 + # XXX [ KIDS, DON'T TRY TO DO THIS AT HOME! ] XXX 4.62 + markups.SequenceIndexMarkup(seq) 4.63 + for monomer in reversed(seq): 4.64 if not hasattr(monomer, 'pdb_residue'): 4.65 monomer.__class__ = Gap 4.66 + del seq[monomer.index] 4.67 + del seq.markups['index'] 4.68 4.69 def clean_aln(aln): 4.70 aln.sequences = [