allpy
changeset 1100:c4c772e3ce86
Added method `translated` to dna.Sequence to translate it into a protein sequence using a given genetic code
author | Daniil Alexeyevsky <dendik@kodomo.fbb.msu.ru> |
---|---|
date | Sat, 09 Jun 2012 19:15:44 +0400 |
parents | d9872ba42f15 |
children | 41a167bbf150 |
files | allpy/data/genetic_code.py allpy/dna.py test/test_dna.py test/test_realign.py |
diffstat | 4 files changed, 92 insertions(+), 1 deletions(-) [+] |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/allpy/data/genetic_code.py Sat Jun 09 19:15:44 2012 +0400 1.3 @@ -0,0 +1,40 @@ 1.4 +"""Standard genetic code. (DNA/RNA -> protein translation tables). 1.5 + 1.6 +Genetic code is presented as a dictionary mapping three one-letter nucleotide 1.7 +codes to an amino-acid name. 1.8 +""" 1.9 + 1.10 +_genetic_code = { 1.11 + 'Stop': ('TAA', 'TAG', 'TGA'), 1.12 + 'Alanine': ('GCA', 'GCC', 'GCG', 'GCT'), 1.13 + 'Cysteine': ('TGC', 'TGT'), 1.14 + 'Aspartic acid': ('GAC', 'GAT'), 1.15 + 'Glutamic acid': ('GAA', 'GAG'), 1.16 + 'Phenylalanine': ('TTC', 'TTT'), 1.17 + 'Glycine': ('GGA', 'GGC', 'GGG', 'GGT'), 1.18 + 'Histidine': ('CAC', 'CAT'), 1.19 + 'Isoleucine': ('ATA', 'ATC', 'ATT'), 1.20 + 'Lysine': ('AAA', 'AAG'), 1.21 + 'Leucine': ('CTA', 'CTC', 'CTG', 'CTT', 'TTA', 'TTG'), 1.22 + 'Methionine': ('ATG',), 1.23 + 'Asparagine': ('AAC', 'AAT'), 1.24 + 'Proline': ('CCA', 'CCC', 'CCG', 'CCT'), 1.25 + 'Glutamine': ('CAA', 'CAG'), 1.26 + 'Arginine': ('AGA', 'AGG', 'CGA', 'CGC', 'CGG', 'CGT'), 1.27 + 'Serine': ('AGC', 'AGT', 'TCA', 'TCC', 'TCG', 'TCT'), 1.28 + 'Threonine': ('ACA', 'ACC', 'ACG', 'ACT'), 1.29 + 'Valine': ('GTA', 'GTC', 'GTG', 'GTT'), 1.30 + 'Tryptophan': ('TGG',), 1.31 + 'Tyrosine': ('TAC', 'TAT'), 1.32 +} 1.33 + 1.34 +standard_dna_code = dict((triplet, aa) 1.35 + for aa, triplets in _genetic_code.items() 1.36 + for triplet in triplets 1.37 +) 1.38 +"""Genetic code table for direct DNA -> protein translation.""" 1.39 + 1.40 +standard_rna_code = dict((triplet.replace("T", "U"), aa) 1.41 + for triplet, aa in standard_dna_code.items() 1.42 +) 1.43 +"""Standard genetic code table."""
2.1 --- a/allpy/dna.py Sun Jun 03 23:55:00 2012 +0400 2.2 +++ b/allpy/dna.py Sat Jun 09 19:15:44 2012 +0400 2.3 @@ -1,7 +1,9 @@ 2.4 import base 2.5 import data.codes 2.6 +from data.genetic_code import standard_dna_code 2.7 2.8 import dna 2.9 +import protein 2.10 2.11 class Monomer(base.Monomer): 2.12 """DNA monomers: nucleotides.""" 2.13 @@ -30,6 +32,45 @@ 2.14 result.append_monomer(complement.get(monomer.code1, 'N')) 2.15 return result 2.16 2.17 + def translated(self, code=None, name=None, description=None, source=None): 2.18 + """Return a new protein sequence translated from self. 2.19 + 2.20 + `code` is a dict of triplet of dna `code1`s -> aminoacid `name`. 2.21 + 2.22 + If `code` is not specified, the standard genetic code is used. 2.23 + 2.24 + If `code` is specified, it may contain only the changed codons. 2.25 + 2.26 + Class of proteins to use is `self.types.protein.Sequence`, you 2.27 + are free to replace it at will. 2.28 + 2.29 + Return new protein sequence where: 2.30 + 2.31 + * `name` is self.name with "_tr" appended 2.32 + * `description` is self.description with " translated" appended 2.33 + * `source` is the same as self.source 2.34 + """ 2.35 + if code: 2.36 + code, modification = dict(standard_dna_code), code 2.37 + code.update(modification) 2.38 + else: 2.39 + code = standard_dna_code 2.40 + result = self.types.protein.Sequence([], 2.41 + name=name or self.name + "_tr", 2.42 + description=description or self.description + " translated", 2.43 + source=source or self.source 2.44 + ) 2.45 + seen_stop = False 2.46 + for a, b, c in zip(self[::3], self[1::3], self[2::3]): 2.47 + assert not seen_stop, "Stop-codon must be the last one" 2.48 + triplet = a.code1 + b.code1 + c.code1 2.49 + aa_name = code[triplet] 2.50 + if aa_name == "Stop": 2.51 + seen_stop = True 2.52 + continue 2.53 + result.append_monomer(name=aa_name) 2.54 + return result 2.55 + 2.56 class Column(base.Column): 2.57 types = dna 2.58
3.1 --- a/test/test_dna.py Sun Jun 03 23:55:00 2012 +0400 3.2 +++ b/test/test_dna.py Sat Jun 09 19:15:44 2012 +0400 3.3 @@ -1,3 +1,4 @@ 3.4 +from nose.tools import raises 3.5 from allpy import dna 3.6 3.7 def test_dna(): 3.8 @@ -5,3 +6,13 @@ 3.9 s1 = s.reverse_complemented() 3.10 assert s1.name == "seq1'" 3.11 assert str(s1) == "GCTCTTCCGATCT" 3.12 + assert str(s.translated()) == "RSEE" 3.13 + 3.14 + del s[12:] 3.15 + s += dna.Sequence.from_string("TGA") 3.16 + assert str(s.translated()) == "RSEE" 3.17 + 3.18 +@raises(AssertionError) 3.19 +def test_translation_stop(): 3.20 + s = dna.Sequence.from_string("TGATCGGAAGAGC") 3.21 + print s.translated()