Saturday, November 13, 2010

New plotter for phylogenetic trees: parsing

In July of last year, I had a series of posts about drawing phylogenetic trees (here, here, here, here and here). Some very early stuff is here and here.

I want to revisit this topic to try to get a tree plotter in python that is really flexible. The important thing to remember here is that we do not want to "roll our own" phylogenetics software. We should let the experts do that for us (and debug it!). Now that we know about PyCogent (web page, links in the sidebar), I say let them do the heavy lifting.

So I wrote a module today that starts with a Newick format phylogenetic tree (the one we used before here), and gets PyCogent to figure out all its connections. You can see how versatile the PyCogent code for dealing with trees is in the docs.

This is the tree:

((Stenotrophomonas_maltophilia:0.07574,
Kingella_oralis:0.08026)
:0.00827,
Pseudomonas_aeruginosa:0.05950,
((Salmonella_typhi:0.01297,
Escherichia_coli:0.01491)
:0.03356,
Haemophilus_parainfluenzae:0.06113)
:0.03863);


Anyway, here is the output from the module. It's not very well tested yet. So I hope you find a bug and let me know. But I'm not Donald Knuth so I'm not going to pay you :)

[UPDATE: I replaced the code with a later version]

output:

$ python tree_utils.py
making default tree

finding children
edge.0
['Stenotrophomonas_maltophilia', 'Kingella_oralis']
edge.1
['Salmonella_typhi', 'Escherichia_coli']
edge.2
['edge.1', 'Haemophilus_parainfluenzae']
root
['edge.0', 'Pseudomonas_aeruginosa', 'edge.2']

Stenotrophomonas_maltophilia
node 'Stenotrophomonas_maltophilia':0.07574;
ancestors ['edge.0', 'root']
name Stenotrophomonas_maltophilia
is_root False
y 0.0
x 0.08401
dist_to_parent 0.07574
is_external True

Kingella_oralis
node 'Kingella_oralis':0.08026;
ancestors ['edge.0', 'root']
name Kingella_oralis
is_root False
y 0.019952
x 0.08853
dist_to_parent 0.08026
is_external True

Pseudomonas_aeruginosa
node 'Pseudomonas_aeruginosa':0.0595;
ancestors ['root']
name Pseudomonas_aeruginosa
is_root False
y 0.039904
x 0.0595
dist_to_parent 0.0595
is_external True

Salmonella_typhi
node 'Salmonella_typhi':0.01297;
ancestors ['edge.1', 'edge.2', 'root']
name Salmonella_typhi
is_root False
y 0.059856
x 0.08516
dist_to_parent 0.01297
is_external True

Escherichia_coli
node 'Escherichia_coli':0.01491;
ancestors ['edge.1', 'edge.2', 'root']
name Escherichia_coli
is_root False
y 0.079808
x 0.0871
dist_to_parent 0.01491
is_external True

Haemophilus_parainfluenzae
node 'Haemophilus_parainfluenzae':0.06113;
ancestors ['edge.2', 'root']
name Haemophilus_parainfluenzae
is_root False
y 0.09976
x 0.09976
dist_to_parent 0.06113
is_external True

root
y_top 0.084796
ancestors []
name root
is_root True
immediate_children ['edge.0', 'Pseudomonas_aeruginosa', 'edge.2']
y_bott 0.009976
y 0.039904
x 0
dist_to_parent 0
is_external False

edge.0
y_top 0.019952
ancestors ['root']
name edge.0
is_root False
immediate_children ['Stenotrophomonas_maltophilia', 'Kingella_oralis']
y_bott 0.0
y 0.009976
x 0.00827
dist_to_parent 0.00827
is_external False

edge.2
y_top 0.09976
ancestors ['root']
name edge.2
is_root False
immediate_children ['edge.1', 'Haemophilus_parainfluenzae']
y_bott 0.069832
y 0.084796
x 0.03863
dist_to_parent 0.03863
is_external False

edge.1
y_top 0.079808
ancestors ['edge.2', 'root']
name edge.1
is_root False
immediate_children ['Salmonella_typhi', 'Escherichia_coli']
y_bott 0.059856
y 0.069832
x 0.07219
dist_to_parent 0.03356
is_external False

code listing for tree_utils.py:

import sys
from cogent import LoadTree

s = '''
((Stenotrophomonas_maltophilia:0.07574,
Kingella_oralis:0.08026)
:0.00827,
Pseudomonas_aeruginosa:0.05950,
((Salmonella_typhi:0.01297,
Escherichia_coli:0.01491)
:0.03356,
Haemophilus_parainfluenzae:0.06113)
:0.03863);'''

def load_data(fn):
FH = open(fn,'r')
data = FH.read().strip()
FH.close()
return data

# for flexibility, we can this function with
# a tree_string
# a filename
# or a PyCogent tree
def make_tree_dict(ts=None,fn=None,tr=None,debug=False):
if tr:
pass
elif fn:
tr = LoadTree(filename=fn)
else:
tr = LoadTree(treestring=ts)
nodes = tr.getNodesDict()

# and their names
all_node_names = tr.getNodeNames()
# PyCogent organizes the external node names
# in the correct order, so the rest is easy

# rearrange the data my way
D = dict()
D['meta'] = { 'all_node_names':all_node_names }
D['meta']['tree'] = tr
e_node_names = list()
i_node_names = list()

for name in all_node_names:
node = nodes[name]
external = node.isTip()
if external: e_node_names.append(name)
else: i_node_names.append(name)

# in order up the tree
aL = [a.Name for a in node.ancestors()]
dist_to_root = node.distance(nodes['root'])

nD = {'name':name,
'node':node,
'is_external':external,
'is_root': name == 'root',
'x':dist_to_root,
'ancestors':aL }

if not nD['is_root']:
dist = node.distance(nodes[aL[0]])
nD['dist_to_parent'] = dist
else:
nD['dist_to_parent'] = 0
D[name] = nD

D['meta']['e_node_names'] = e_node_names
D['meta']['i_node_names'] = i_node_names
D = compute_y_pos(D,debug=debug)
if debug: show(D)
return D

def compute_y_pos(D,debug=False):
all_node_names = D['meta']['all_node_names']
e_node_names = D['meta']['e_node_names']
i_node_names = D['meta']['i_node_names']

# actual node dictionaries
e_node_dicts = [D[n] for n in e_node_names]
i_node_dicts = [D[n] for n in i_node_names]

# just make a square plot
xmax = max([nD['x'] for nD in e_node_dicts])
ymax = xmax
D['meta']['max_xy'] = xmax, ymax

# compute y for external nodes
N = 1.0*(len(e_node_names)-1)
for nD in e_node_dicts:
i = e_node_names.index(nD['name'])
f = i/N
nD['y'] = f*ymax

def mean(L): return sum(L)*1.0/len(L)

def get_child_node_names(parent):
if debug: print parent
cL = list()
for name in all_node_names:
if name == 'root': continue
nD = D[name]
if parent == nD['ancestors'][0]:
cL.append(name)
if debug: print cL
return cL

# compute y for internal nodes
# must fill these from the bottom up
L = i_node_names[:]
#L.sort() # sorts 0,1,2 etc. then root
def f(name): # sort by x-value !
nD = D[name]
return nD['x']
L = sorted(L,key=f,reverse=True)

if debug: print 'finding children'
for name in L:
nD = D[name]
cL = get_child_node_names(parent=name)
nD['immediate_children'] = cL
yL = [D[child]['y'] for child in cL]
y0 = min(yL)
y1 = max(yL)
if len(yL) == 3:
y = sorted(yL)[1]
else:
y = mean([y0,y1])
nD = D[name]
nD['y'] = y
nD['y_top'] = y1
nD['y_bott'] = y0
if debug: print

# sanity check for ancestor lists
for name in all_node_names:
if name == 'root': continue
assert D[name]['ancestors'][-1] == 'root'
return D

def show(D):
e_node_names = D['meta']['e_node_names']
i_node_names = D['meta']['i_node_names']
for name in e_node_names + i_node_names:
print name
nD = D[name]
for k in nD:
if k == 'node' and not nD['is_external']:
continue
print k, nD[k]
print

if __name__ == '__main__':
tree_string = None
if len(sys.argv) > 1:
fn = sys.argv[1]
try:
tree_string = load_data(fn)
except IOError:
print 'could not open file:', fn
sys.exit()
if not tree_string:
print 'making default tree', '\n'
tree_string = s.strip()
D = make_tree_dict(ts=tree_string,debug=True)