Tuesday, June 16, 2009

Motif Discovery 3

I realize that reading other people's code is [fill in the blank with your favorite expletive] impossible, and mind-numbing as well. It's part of the reason I find it difficult to enjoy books like Jones & Pevzner and Kinser's book, Python for Bioinformatics.

But we need to have some code to examine if we want to talk about algorithms. So, if you would like to follow the discussion that's coming later, please paste the code below into a textfile on your desktop (my copy is named gibbs.py but you can do whatever you want). You will also need the code from here, saved as motif.py, and the code from here, saved as scoring.py. You can run it in the usual way by doing "python gibbs.py."

Here is the code, and output for the particular settings. Discussion coming soon.

# Lawrence et al. Gibbs Sampler algorithm
# version 0.1
# the alignment is stored as a list of
# the motif's start point in each sequence

import random,sys,time
from motif import getTestSet
from scoring import hammingScore
#----------------------------------------------
def showStartSeqs(sL):
for i,m,seq in sL:
print str(i).rjust(5),m,' ',
print seq[:50],
if len(seq) > 50: print '...'
else: print
#----------------------------------------------
# aL holds positions of the motifs in seqs in L
# assign elements in aL randomly to start
def init_aL(D):
N = D['N']
SZ = D['SZ']
M = D['M']
aL = list()
for i in range(N):
aL.append(random.choice(range(SZ-M)))
return aL
#----------------------------------------------
# utility method
def showMotifAlignment(L,D,aL):
M = D['M']
retL = list()
for i in range(len(aL)):
j = aL[i]
retL.append(L[i][j:j+M])
return '\n'.join(retL)
#----------------------------------------------
# given a list of sequences,
# alignment start points
# index of one sequence to leave out
# and motif length

def getMotifSeqs(L,M,aL,currSeq):
retL = list()
for i,seq in enumerate(L):
if i == currSeq: continue
j = aL[i]
retL.append(seq[j:j+M])
return retL
#----------------------------------------------
# the sampler:
# consider one seq at a time
# slide to find the best score v. current motif
# repeat R times
def slideSeqs(L,aL,D,v=False):
N = D['N']
M = D['M']
SZ = D['SZ']
R = D['R']
currSeq = 0
for j in range(R):
if v: print aL
sL = getMotifSeqs(L,M,aL,currSeq)

# for currSeq find best match to alignment
maxI = 0
maxS = 0
for i in range(SZ-M):
seq = L[currSeq][i:i+M]
score = hammingScore(sL + [seq])
if score > maxS:
maxS = score
maxI = i
aL[currSeq] = maxI

if v: print maxS
#for e in sL: print e
if v: print 'currSeq', currSeq,
if v: print 'best', maxI
if v: print 'aL:', aL
#print L[currSeq][maxI:maxI+M]

currSeq = random.choice(range(N))
if v: print '-' * 30
return currSeq,aL,maxS
#----------------------------------------------
def main(D):
print 'numSeq ='.rjust(15),
print str(D['N']).rjust(4)
print 'motif length ='.rjust(15),
print str(D['M']).rjust(4)
print 'seq length ='.rjust(15),
print str(D['SZ']).rjust(4)
print 'slide cycles ='.rjust(15),
print str(D['R']).rjust(4)
print '# runs ='.rjust(15),
print str(D['T']).rjust(4)

# returns i,m,seq
# N = D['N']
startL = getTestSet(D)
# save the actual sequences separately in L
L = [t[2] for t in startL]

# r for results, L for list
rL = list()
# run sliding test from independent starts

print '\nruns:'
for i in range(T):
# display something to show progress
if not i: print '.',
if i:
if not (i+1)%10:
print str(i+1).rjust(3)
else: print '.',
aL = init_aL(D)
# do the work here
rL.append(slideSeqs(L,aL,D))
print '\n\n'

# extract all slider end scores to a new list
sL = [t[2] for t in rL]
# scan for final result with best score
maxI = -1
maxS = 0
for i,score in enumerate(sL):
if i and not i % 8: print
print score,
if score > maxS:
maxS = score
maxI = i

currSeq,aL,score = rL[maxI]
print '\nbest score =',
print score
n = sL.count(score)
print 'recovered',n,
if n > 1: print 'times\n\n'
else: print 'time\n\n'

print 'motif found:'
print aL
print showMotifAlignment(L,D,aL)

# display original motif
print '\n\noriginal:'
print hammingScore([t[1] for t in startL])
showStartSeqs(startL)
print

if __name__ == "__main__":
random.seed(157)
start = time.time()
N = 10 # number of sequences
M = 10 # motif length
SZ = 50 # sequence length
R = 25 # cycles of basic sliding
T = 100 # number of independent starts
D = {'N':N,'M':M,'SZ':SZ,'R':R,'T':T }
main(D)
t = time.time() - start
print 'elapsed time =', round(t,2), 'sec'

Output:


       numSeq =   10
motif length = 10
seq length = 50
slide cycles = 25
# runs = 100

runs:
. . . . . . . . . 10
. . . . . . . . . 20
. . . . . . . . . 30
. . . . . . . . . 40
. . . . . . . . . 50
. . . . . . . . . 60
. . . . . . . . . 70
. . . . . . . . . 80
. . . . . . . . . 90
. . . . . . . . . 100



235 227 212 257 254 229 212 220
206 226 228 231 191 208 212 236
198 201 242 211 202 210 171 216
218 209 237 206 201 180 209 206
212 256 239 257 207 207 242 258
202 252 210 192 203 249 199 197
192 211 238 218 221 195 259 222
225 265 250 222 214 214 218 191
215 208 215 249 200 227 211 239
233 207 204 218 196 215 218 196
228 205 221 221 213 207 201 208
239 232 252 245 206 252 214 213
192 213 204 210
best score = 265
recovered 1 time


motif found:
[35, 17, 24, 32, 18, 5, 33, 24, 18, 36]
GCACATAGCA
GAATTTGAAA
GAAAACACCT
GAACATAAGA
GAAAATAAAA
GATCATAATA
GGACATGCCA
GAAAATTCCA
GAAGTTAACT
GAATATAAGA


original:
252
35 GCACATAGCA TCGAAGACAAACTGCCGGCCTGGGTCCCCGAACGAGCACATAGCAATCAT
17 GAATTTGAAA TGCGACTGGGTGTTTCCGAATTTGAAATTCTCCACGCTATCGACTCTCAT
19 GTGCCGAAAA AATTGGCTGAGTGAATCGAGTGCCGAAAACACCTAGAATTATAGGGTACA
32 GAACATAAGA GATGGGCTCAACGTGCCGGGGATGATGGCACGGAACATAAGAGAGCGGAG
18 GAAAATAAAA ACGCAACGTACAATCACTGAAAATAAAACAATCCCGCTAGACAACTTCCG
5 GATCATAATA TGTATGATCATAATAAACGTACTCGCTCAACTAACGGTTACTGCTTAAAG
33 GGACATGCCA TAGTAAGCATAATTGTTACCTGGTGAAATCTGCGGACATGCCAGCGCTGA
39 GACCTTAAAC AGCAGGACACTCAGGCGAATATTCGAAAATTCCAAACCGGACCTTAAACT
18 GAAGTTAACT ATTTGATGTACTGGTGTAGAAGTTAACTGAAAGACGGTATAGAATGTATC
36 GAATATAAGA CTGACCCCATTTGTACTGCCCCCAAGGTACCAGTAGGAATATAAGAGTCC

elapsed time = 17.05 sec