Документ взят из кэша поисковой машины. Адрес оригинального документа : http://kodomo.fbb.msu.ru/hg/allpy/raw-rev/f8bd7c469fcf
Дата изменения: Unknown
Дата индексирования: Tue Oct 2 07:35:39 2012
Кодировка:

# HG changeset patch
# User Daniil Alexeyevsky
# Date 1292451126 -10800
# Node ID f8bd7c469fcfce0a58798099e06cbfef3ff028b2
# Parent cf6cdc3b7ec508a918e2ecf50ae3de4d2d1bc2b2
Clean reimplementation of allpy.base.Alignment.from_fasta

Also changed interface of fasta.parse_file

Added allpy.util, which currently has one function, unzip -- the reverse of
the zip builtin

diff -r cf6cdc3b7ec5 -r f8bd7c469fcf allpy/base.py
--- a/allpy/base.py Wed Dec 15 23:45:11 2010 +0300
+++ b/allpy/base.py Thu Dec 16 01:12:06 2010 +0300
@@ -181,41 +181,47 @@
"""
sequences = fasta.parse_file(file)
assert len(sequences) == 1
- header = sequences.keys()[0]
- name, _, description = header.partition(" ")
+ name, description = sequences.keys()[0]
return cls(sequences[header], name, description, file.name)

-class Alignment(dict):
+class Alignment(list):
"""Alignment.

Behaves like a list of Columns.
"""
- # _sequences -- list of Sequence objects. Sequences don't contain gaps
- # - see sequence.py module

- def __init__(self, *args):
- """overloaded constructor
+ sequence_type = Sequence
+ """Type of sequences to create in alignment.
+
+ SHOULD be redefined when subclassing"""

- Alignment()
- new empty Alignment
+ def __init__(self):
+ """Initialize empty alignment."""
+ super(Alignment, self).__init__()

- Alignment(sequences, body)
- new Alignment with sequences and body initialized from arguments
+ self.sequences = []
+ """Ordered list of sequences in alignment."""

- Alignment(fasta_file)
- new Alignment, read body and sequences from fasta file
- """
- if len(args)>1:#overloaded constructor
- self.sequences=args[0]
- self.body=args[1]
- elif len(args)==0:
- self.sequences=[]
- self.body={}
- else:
- self.sequences, self.body = Alignment.from_fasta(args[0])
+ def add_gapped_line(self, line, name='', description='', source=''):
+ """Add row from a line of one-letter codes and gaps."""
+ Sequence = cls.sequence_type
+ not_gap = lambda (i, char): char != "-"
+ no_gaps = line.replace("-", "")
+ sequence = Sequence(no_gaps, name, description, source)
+ for i, (j, char) in enumerate(filter(not_gap, enumerate(line))):
+ self[j][seq] = sequence[i]
+ self.sequences.append(sequence)
+
+ @classmethod
+ def from_fasta(cls, file):
+ """Create new alignment from FASTA file."""
+ self = cls()
+ for ((name, description), body) in fasta.parse_file(file):
+ self.add_gapped_line(body, name, description)
+ return self

def length(self):
- """ Returns width, ie length of each sequence with gaps """
+ """Return width, ie length of each sequence with gaps."""
return max([len(line) for line in self.body.values()])

def height(self):
@@ -261,58 +267,6 @@
line.append(all_columns[position].get(aa))
return self.identity_percentages

- @classmethod
- def from_fasta(file):
- """ Import data from fasta file
-
- >>> import alignment
- >>> sequences,body=alignment.Alignment.from_fasta(open("test.fasta"))
- """
- import re
-
- sequences = []
- body = {}
-
- raw_sequences = file.read().split(">")
- if len(raw_sequences) <= 1:
- raise Exception("Wrong format of fasta-file %s" % file.name)
-
- raw_sequences = raw_sequences[1:] #ignore everything before the first >
- for raw in raw_sequences:
- parsed_raw_sequence = raw.split("\n")
- parsed_raw_sequence = [s.strip() for s in parsed_raw_sequence]
- name_and_description = parsed_raw_sequence[0]
- name_and_description = name_and_description.split(" ",1)
- if len(name_and_description) == 2:
- name, description = name_and_description
- elif len(name_and_description) == 1:
- #if there is description
- name = name_and_description[0]
- description = ''
- else:
- raise Exception("Wrong name of sequence %(name)$ fasta-file %(file)s" % \
- {'name': name, 'file': file.name})
-
- if len(parsed_raw_sequence) <= 1:
- raise Exception("Wrong format of sequence %(name)$ fasta-file %(file)s" % \
- {'name': name, 'file': file.name})
- string = ""
- for piece in parsed_raw_sequence[1:]:
- piece_without_whitespace_chars = re.sub("\s", "", piece)
- string += piece_without_whitespace_chars
- monomers = [] #convert into Monomer objects
- body_list = [] #create the respective list in body dict
- for current_monomer in string:
- if current_monomer not in ["-", ".", "~"]:
- monomers.append(cls.monomer_type.from_code1(current_monomer))
- body_list.append(monomers[-1])
- else:
- body_list.append(None)
- s = sequence.Sequence(monomers, name, description)
- sequences.append(s)
- body[s] = body_list
- return sequences, body
-
@staticmethod
def from_sequences(*sequences):
""" Constructs new alignment from sequences
diff -r cf6cdc3b7ec5 -r f8bd7c469fcf allpy/fasta.py
--- a/allpy/fasta.py Wed Dec 15 23:45:11 2010 +0300
+++ b/allpy/fasta.py Thu Dec 16 01:12:06 2010 +0300
@@ -1,14 +1,17 @@
def parse_fasta(file):
"""Parse fasta file, remove spaces and newlines from sequence bodies.

- Return a dict of { sequence header: sequence body }.
+ Return a dict of { (name, description) : sequence_body }.
"""
sequences = {}
for part in file.read().split(">"):
header, _, body = part.partition("\n")
header = header.lstrip(">").strip()
+ name, _, description = header.partition(" ")
+ name = name.strip()
+ description = description.strip()
body = body.replace(" ", "").replace("\n", "")
- sequences[header] = body
+ sequences[name, description] = body
return sequences

def save_fasta(out_file, string, name, description='', long_line=70):
diff -r cf6cdc3b7ec5 -r f8bd7c469fcf allpy/util.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/allpy/util.py Thu Dec 16 01:12:06 2010 +0300
@@ -0,0 +1,11 @@
+"""Miscellaneous utilities.
+"""
+
+def unzip(seq):
+ a, b = [], []
+ for x, y in seq:
+ a.append(x)
+ b.append(y)
+ return a, b
+
+# vim: set et ts=4 sts=4 sw=4: