allpy

changeset 770:c3532d119351
Automated merge with ssh://kodomo/allpy
author: Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date: Tue, 12 Jul 2011 17:12:05 +0400
parents: 5f0c15a3cbca a90a3c659e69
children: 4ed26f8afeac 0a4572dd44a4
diffstat: 4 files changed, 59 insertions(+), 20 deletions(-) [+]
[-]

allpy/fileio.py 22

allpy/processors.py 13

test/test_markups.py 16

utils/extract_pfam.py 28 allpy/fileio.py 22 allpy/processors.py 13 test/test_markups.py 16 utils/extract_pfam.py 28
allpy/fileio.py 22
allpy/processors.py 13
test/test_markups.py 16
     1.1 --- a/allpy/fileio.py	Tue Jul 12 16:05:28 2011 +0400
     1.2 +++ b/allpy/fileio.py	Tue Jul 12 17:12:05 2011 +0400
     1.3 @@ -67,8 +67,24 @@
     1.4          for string, name, description in sequences:
     1.5              self.write_string(string, name, description)
     1.6  
     1.7 +    def read_parts(self):
     1.8 +        """Read parts beginning with > in FASTA file.
     1.9 +
    1.10 +        This is a drop-in replacement for self.file.read().split("\n>")
    1.11 +        It is required for markup format, which combines parts read with
    1.12 +        different parsers. Python prohibits combining iterators and file.read
    1.13 +        methods on the same file.
    1.14 +        """
    1.15 +        part = None
    1.16 +        for line in self.file:
    1.17 +            if line.startswith(">"):
    1.18 +                if part: yield part
    1.19 +                part = ""
    1.20 +            part += line
    1.21 +        if part: yield part
    1.22 +
    1.23      def read_strings(self):
    1.24 -        for part in self.file.read().split("\n>"):
    1.25 +        for part in self.read_parts():
    1.26              header, _, body = part.partition("\n")
    1.27              header = header.lstrip(">")
    1.28              name, _, description = header.partition(" ")
    1.29 @@ -130,7 +146,7 @@
    1.30          record = {'type': 'alignment', 'format': self.format}
    1.31          self.write_record(record)
    1.32          self.write_empty_line()
    1.33 -        alignment.to_file(self.file)
    1.34 +        alignment.to_file(self.file, format=self.format, gap=self.gaps)
    1.35  
    1.36      def write_markups(self, markups, type, pre_record={}):
    1.37          """Write a dictionary of markups as series of records."""
    1.38 @@ -206,7 +222,7 @@
    1.39      def read_payload(self, alignment, record, type):
    1.40          """Read record payload, if necessary."""
    1.41          if type == 'alignment':
    1.42 -            io = File(self.file, record.get('format', 'fasta'))
    1.43 +            io = File(self.file, record.get('format', 'fasta'), gaps=self.gaps)
    1.44              io.read_alignment(alignment)
    1.45  
    1.46  class EmbossFile(AlignmentFile):

     2.1 --- a/allpy/processors.py	Tue Jul 12 16:05:28 2011 +0400
     2.2 +++ b/allpy/processors.py	Tue Jul 12 17:12:05 2011 +0400
     2.3 @@ -36,14 +36,17 @@
     2.4  #
     2.5  
     2.6  class _Muscle(ExternalCommand):
     2.7 -    """Realign block with muscle."""
     2.8 +    """Realign block with muscle.
     2.9 +
    2.10 +    Arguments:
    2.11 +
    2.12 +    - remove_gaps -- drop all gaps from alignment before realigning
    2.13 +    """
    2.14  
    2.15      def __init__(self, remove_gaps=False):
    2.16 -        """
    2.17 -        remove_gaps -- drop all gaps from alignment before treating it with muscle
    2.18 -        """
    2.19          self.remove_gaps = remove_gaps
    2.20 -        ExternalCommand.__init__(self, 'muscle -stable -in %(infile)s -out %(outfile)s')
    2.21 +        cmd = 'muscle -stable -in %(infile)s -out %(outfile)s >/dev/null 2>&1'
    2.22 +        ExternalCommand.__init__(self, cmd)
    2.23  
    2.24      def __call__(self, block):
    2.25          if self.remove_gaps:

     3.1 --- a/test/test_markups.py	Tue Jul 12 16:05:28 2011 +0400
     3.2 +++ b/test/test_markups.py	Tue Jul 12 17:12:05 2011 +0400
     3.3 @@ -69,15 +69,21 @@
     3.4  
     3.5      file = StringIO()
     3.6      aln.to_file(file, format='markup')
     3.7 -    import sys
     3.8 +
     3.9      file.seek(0)
    3.10 -
    3.11      out = protein.Alignment().append_file(file, format='markup')
    3.12      s = out.sequences[0]
    3.13      c = out.columns
    3.14 -    print out.markups
    3.15 -    print s.markups
    3.16 -    print out.sequences[1].markups
    3.17 +    assert s[5].m1 == 5 and s[3].m1 == 3
    3.18 +    assert out.markups['m2'][c[5]] == 5 and out.markups['m2'][c[6]] == 6
    3.19 +
    3.20 +    file.seek(0)
    3.21 +    file.write(file.getvalue().replace('Q-V', 'Q!V').replace('E-Q', 'E+Q'))
    3.22 +    assert '!' in file.getvalue() and '+' in file.getvalue()
    3.23 +    file.seek(0)
    3.24 +    out = protein.Alignment().append_file(file, format='markup', gaps='-+!')
    3.25 +    s = out.sequences[0]
    3.26 +    c = out.columns
    3.27      assert s[5].m1 == 5 and s[3].m1 == 3
    3.28      assert out.markups['m2'][c[5]] == 5 and out.markups['m2'][c[6]] == 6
    3.29  

     4.1 --- a/utils/extract_pfam.py	Tue Jul 12 16:05:28 2011 +0400
     4.2 +++ b/utils/extract_pfam.py	Tue Jul 12 17:12:05 2011 +0400
     4.3 @@ -13,7 +13,9 @@
     4.4  options.min_depth = 2
     4.5  options.min_width = 20
     4.6  options.infile_name = '/srv/databases/pfam/Pfam-A.full.gz'
     4.7 -options.outdir_name = '/srv/databases/pfam/test'
     4.8 +options.outdir_name = '/srv/databases/pfam/Pfam-with-PDB'
     4.9 +
    4.10 +download_pdb = structure.cached_download_pdb
    4.11  
    4.12  types = Struct()
    4.13  class Monomer(protein.Monomer):
    4.14 @@ -43,12 +45,12 @@
    4.15      def add_pdb(self):
    4.16          match = structure.pdb_id_parse(self.sequence.name)
    4.17          code, model , chain = match['code'], match['model'], match['chain']
    4.18 -        pdb_file = structure.download_pdb(code)
    4.19 +        pdb_file = download_pdb(code)
    4.20          pdb_structure = structure.get_structure(pdb_file, self.sequence.name)
    4.21 -        self.sequence.pdb_chain = pdb_chain = pdb_structure[0][pdb_chain]
    4.22 +        self.sequence.pdb_chain = pdb_chain = pdb_structure[0][chain]
    4.23          for monomer in self.sequence:
    4.24              if monomer in self:
    4.25 -                monomer.pdb_residue = pdb_chain[' ', self.pdb_resi, ' ']
    4.26 +                monomer.pdb_residue = pdb_chain[' ', monomer.pdb_resi, ' ']
    4.27  
    4.28  re_range = re.compile(r'(-?\d+)-(-?\d+)')
    4.29  def parse_dbxref(dbxref):
    4.30 @@ -56,6 +58,7 @@
    4.31      assert db == "PDB", "DBxref not for PDB"
    4.32      assert null.strip() == "", "Too many fields in GF DB PDB"
    4.33      result = Struct()
    4.34 +    assert len(pdb_chain.split()) == 2
    4.35      result.pdb_id, result.chain = pdb_chain.split()
    4.36      range = re_range.match(range)
    4.37      result.begin = int(range.group(1))
    4.38 @@ -83,8 +86,7 @@
    4.39      seq.__class__ = Sequence
    4.40      begin, end = (' ', code.begin, ' '), (' ', code.end, ' ')
    4.41      try:
    4.42 -        seq.auto_pdb(resi_begin=begin, resi_end=end,
    4.43 -            pdb_getter=structure.cached_download_pdb)
    4.44 +        seq.auto_pdb(resi_begin=begin, resi_end=end, pdb_getter=download_pdb)
    4.45      except AssertionError:
    4.46          raise
    4.47      except Exception:
    4.48 @@ -111,9 +113,21 @@
    4.49      for seq in aln.sequences:
    4.50          remove_trailing_monomers(seq, aln.columns)
    4.51          remove_trailing_monomers(seq, reversed(aln.columns))
    4.52 -        for monomer in seq:
    4.53 +
    4.54 +        # XXX [ This is extremely nasty hack ] XXX
    4.55 +        # We replace monomer's class with Gap, so that it is written as .
    4.56 +        # We remove the monomer from the sequence, BUT NOT FROM ALIGNMENT!
    4.57 +        # And we add some junk to monomers (SequenceIndexMarkup),
    4.58 +        # which we keep laying around there, but remove from markups, so
    4.59 +        # that is does not go to file
    4.60 +        # In the end it works like we want, but
    4.61 +        # XXX [ KIDS, DON'T TRY TO DO THIS AT HOME! ] XXX
    4.62 +        markups.SequenceIndexMarkup(seq)
    4.63 +        for monomer in reversed(seq):
    4.64              if not hasattr(monomer, 'pdb_residue'):
    4.65                  monomer.__class__ = Gap
    4.66 +                del seq[monomer.index]
    4.67 +        del seq.markups['index']
    4.68  
    4.69  def clean_aln(aln):
    4.70      aln.sequences = [