Sunday, August 10, 2008

BioPython for GenBank

I was up in the dusty attic of my computer yesterday, looking through old code, and I found an example using BioPython to get the CDS entries from a GenBank record. It uses a Cocoa API to do the save, but that's all. It's probably preferable to the hand-rolled code from the last post.

from Bio import GenBank
from Foundation import *
from string import maketrans, upper


FH = open('sequences.gb', 'r')
parser = GenBank.FeatureParser()
it = GenBank.Iterator(FH, parser)
results = list()

def processGene(f):
D = dict()
D['start'] = int(f.location._start.position)
D['end'] = int(f.location._end.position)
D['strand'] = int(f.strand)
D['bid'] = f.qualifiers['locus_tag'][0]
D['name'] = f.qualifiers['gene'][0]
return D

record = it.next()
L = record.features
L = [f for f in L if f.type == 'CDS']
L = [processGene(f) for f in L]
sequence = record.seq.tostring()
print len(L)

def reversesequence(s):
t = maketrans('agctAGCT','tcgaTCGA')
s2 = s.translate(t)
L = list(s2)
L.reverse()
return ''.join(L)


def getsequence(D):
global sequence
seq = sequence[D['start']:D['end']]
if D['strand'] == -1:
seq = reversesequence(seq)
D['sequence'] = seq
return D

L = [getsequence(D) for D in L]

filename = NSHomeDirectory()
filename += '/Desktop/ECgenes.plist'
ma = NSArray.arrayWithArray_(L)
ma.writeToFile_atomically_(filename, True)