allpy

changeset 1100:c4c772e3ce86
Added method `translated` to dna.Sequence to translate it into a protein sequence using a given genetic code
author: Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru>
date: Sat, 09 Jun 2012 19:15:44 +0400
parents: d9872ba42f15
children: 41a167bbf150
files: allpy/data/genetic_code.py allpy/dna.py test/test_dna.py test/test_realign.py
diffstat: 4 files changed, 92 insertions(+), 1 deletions(-) [+]
[-]

allpy/data/genetic_code.py 40

allpy/dna.py 41

test/test_dna.py 11

test/test_realign.py 1 allpy/data/genetic_code.py 40 allpy/dna.py 41 test/test_dna.py 11 test/test_realign.py 1
allpy/data/genetic_code.py 40
allpy/dna.py 41
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/allpy/data/genetic_code.py	Sat Jun 09 19:15:44 2012 +0400
     1.3 @@ -0,0 +1,40 @@
     1.4 +"""Standard genetic code. (DNA/RNA -> protein translation tables).
     1.5 +
     1.6 +Genetic code is presented as dictionary mapping three one-letter nucleotide
     1.7 +codes to amino-acid name.
     1.8 +"""
     1.9 +
    1.10 +_genetic_code = {
    1.11 +    'Stop':          ('TAA', 'TAG', 'TGA'),
    1.12 +    'Alanine':       ('GCA', 'GCC', 'GCG', 'GCT'),
    1.13 +    'Cysteine':      ('TGC', 'TGT'),
    1.14 +    'Aspartic acid': ('GAC', 'GAT'),
    1.15 +    'Glutamic acid': ('GAA', 'GAG'),
    1.16 +    'Phenylalanine': ('TTC', 'TTT'),
    1.17 +    'Glycine':       ('GGA', 'GGC', 'GGG', 'GGT'),
    1.18 +    'Histidine':     ('CAC', 'CAT'),
    1.19 +    'Isoleucine':    ('ATA', 'ATC', 'ATT'),
    1.20 +    'Lysine':        ('AAA', 'AAG'),
    1.21 +    'Leucine':       ('CTA', 'CTC', 'CTG', 'CTT', 'TTA', 'TTG'),
    1.22 +    'Methionine':    ('ATG',),
    1.23 +    'Asparagine':    ('AAC', 'AAT'),
    1.24 +    'Proline':       ('CCA', 'CCC', 'CCG', 'CCT'),
    1.25 +    'Glutamine':     ('CAA', 'CAG'),
    1.26 +    'Arginine':      ('AGA', 'AGG', 'CGA', 'CGC', 'CGG', 'CGT'),
    1.27 +    'Serine':        ('AGC', 'AGT', 'TCA', 'TCC', 'TCG', 'TCT'),
    1.28 +    'Threonine':     ('ACA', 'ACC', 'ACG', 'ACT'),
    1.29 +    'Valine':        ('GTA', 'GTC', 'GTG', 'GTT'),
    1.30 +    'Tryptophan':    ('TGG',),
    1.31 +    'Tyrosine':      ('TAC', 'TAT'),
    1.32 +}
    1.33 +
    1.34 +standard_dna_code = dict((triplet, aa)
    1.35 +  for aa, triplets in _genetic_code.items()
    1.36 +  for triplet in triplets
    1.37 +)
    1.38 +"""Genetic code table for direct DNA -> protein translation."""
    1.39 +
    1.40 +standard_rna_code = dict((triplet.replace("T", "U"), aa)
    1.41 +  for triplet, aa in standard_dna_code.items()
    1.42 +)
    1.43 +"""Genetic code table for direct RNA -> protein translation."""

     2.1 --- a/allpy/dna.py	Sun Jun 03 23:55:00 2012 +0400
     2.2 +++ b/allpy/dna.py	Sat Jun 09 19:15:44 2012 +0400
     2.3 @@ -1,7 +1,9 @@
     2.4  import base
     2.5  import data.codes
     2.6 +from data.genetic_code import standard_dna_code
     2.7  
     2.8  import dna
     2.9 +import protein
    2.10  
    2.11  class Monomer(base.Monomer):
    2.12      """DNA monomers: nucleotides."""
    2.13 @@ -30,6 +32,45 @@
    2.14              result.append_monomer(complement.get(monomer.code1, 'N'))
    2.15          return result
    2.16  
    2.17 +    def translated(self, code=None, name=None, description=None, source=None):
    2.18 +        """Return a new protein sequence translated from self.
    2.19 +
    2.20 +        `code` is a dict mapping triplets of DNA `code1`s to amino acid `name`s.
    2.21 +
    2.22 +        If `code` is not specified, the standard genetic code is used.
    2.23 +
    2.24 +        If `code` is specified, it may contain only the changed codons.
    2.25 +
    2.26 +        Class of proteins to use is `self.types.protein.Sequence`, you
    2.27 +        are free to replace it at will.
    2.28 +
    2.29 +        Return new protein sequence where:
    2.30 +
    2.31 +            * `name` is self.name with "_tr" appended
    2.32 +            * `description` is self.description with " translated" appended
    2.33 +            * `source` is the same as self.source
    2.34 +        """
    2.35 +        if code:
    2.36 +            code, modification = dict(standard_dna_code), code
    2.37 +            code.update(modification)
    2.38 +        else:
    2.39 +            code = standard_dna_code
    2.40 +        result = self.types.protein.Sequence([],
    2.41 +            name=name or self.name + "_tr",
    2.42 +            description=description or self.description + " translated",
    2.43 +            source=source or self.source
    2.44 +        )
    2.45 +        seen_stop = False
    2.46 +        for a, b, c in zip(self[::3], self[1::3], self[2::3]):
    2.47 +            assert not seen_stop, "Stop-codon must be the last one"
    2.48 +            triplet = a.code1 + b.code1 + c.code1
    2.49 +            aa_name = code[triplet]
    2.50 +            if aa_name == "Stop":
    2.51 +                seen_stop = True
    2.52 +                continue
    2.53 +            result.append_monomer(name=aa_name)
    2.54 +        return result
    2.55 +
    2.56  class Column(base.Column):
    2.57      types = dna
    2.58  

     3.1 --- a/test/test_dna.py	Sun Jun 03 23:55:00 2012 +0400
     3.2 +++ b/test/test_dna.py	Sat Jun 09 19:15:44 2012 +0400
     3.3 @@ -1,3 +1,4 @@
     3.4 +from nose.tools import raises
     3.5  from allpy import dna
     3.6  
     3.7  def test_dna():
     3.8 @@ -5,3 +6,13 @@
     3.9  	s1 = s.reverse_complemented()
    3.10  	assert s1.name == "seq1'"
    3.11  	assert str(s1) == "GCTCTTCCGATCT"
    3.12 +	assert str(s.translated()) == "RSEE"
    3.13 +
    3.14 +	del s[12:]
    3.15 +	s += dna.Sequence.from_string("TGA")
    3.16 +	assert str(s.translated()) == "RSEE"
    3.17 +
    3.18 +@raises(AssertionError)
    3.19 +def test_translation_stop():
    3.20 +	s = dna.Sequence.from_string("TGATCGGAAGAGC")
    3.21 +	print s.translated()

     4.1 --- a/test/test_realign.py	Sun Jun 03 23:55:00 2012 +0400
     4.2 +++ b/test/test_realign.py	Sat Jun 09 19:15:44 2012 +0400
     4.3 @@ -1,4 +1,3 @@
     4.4 -from nose.tools import raises
     4.5  from allpy import protein, processors
     4.6  
     4.7  example1 = (protein.Alignment().