Chapter 9
References
http://www.pyrosequencing.com/DynPage.aspx?id=7454 - Concise explanation of pyrosequencing, including an animation.
Sequencing of James D. Watson's genome
Margulies et al. (2005) - Description of a highly parallel method for whole genome sequencing based on pyrosequencing
http://www.454.com/products-solutions/how-it-works/index.asp - Concise description of 454's sequencing technology
FIGURES
Figures 9.1 - 9.3 were done in Powerpoint (TM), so no "R" scripts available for these
Figures 9.4 and 9.5
You
will first need some scripts (mostly adapted from the "R" package
"ape"), and you will need to download the "R" package "ape" (that's for
"Analyses of Phylogenetics and Evolution").
plot.tree = function (phy,mut.rate=0,x.offset=0,muts=T)
{
N <- length(phy$tip.label)
x.pos = c(1:N,rep(NA,N-2))+x.offset
y.pos = rep(0,2*N-1)
for(i in seq(2*N-1,N+1,-1)){
parent = which(phy$edge[,1]==i)
point=phy$edge[parent,2]
x.pos[i]=mean(x.pos[point])
parent = parent[1]
child=phy$edge[parent,2]
height=phy$edge.length[parent]
y.pos[i]=y.pos[i]+height
while(child>N){
parent = which(phy$edge[,1]==child)[1]
child=phy$edge[parent,2]
height=phy$edge.length[parent]
y.pos[i]=y.pos[i]+height
}
}
nodes=cbind(x.pos,y.pos)
max.depth=max(nodes[,2])*1.2
mut.rate=15/max.depth
top=phy$edge[1,1]
lines(rep(nodes[top,1],2),c(10^10,nodes[top,2]))
sto.muts=0
for(i in 1:(2*N-2)){
top=phy$edge[i,1]
bot=phy$edge[i,2]
lines(c(nodes[top,1],nodes[bot,1],nodes[bot,1]),
c(nodes[top,2],nodes[top,2],nodes[bot,2]))
if(muts==T){
n.mut=rpois(1,phy$edge.length[i]*mut.rate)
sto.muts[i]=n.mut
if(n.mut>0){
y.top=nodes[top,2]
y.bot=nodes[bot,2]
nudge=(y.top-y.bot)/7
y=runif(n.mut,y.bot+nudge,y.top-nudge)
points(rep(nodes[bot,1],n.mut),y,pch=20)
}
}
}
if(muts==F) return(NULL)
ancestors=mrca(phy)[1:36,1:36]
freqs=rep(0,60)
for(i in 1:35){
for(j in (i+1):36){
common=ancestors[i,j]
diffs=0
diffs = diffs + sto.muts[which(phy$edge[,2]==i)]
parent.L = phy$edge[which(phy$edge[,2]==i),1]
while(parent.L!=common) {
diffs = diffs + sto.muts[which(phy$edge[,2]==parent.L)]
parent.L = phy$edge[which(phy$edge[,2]==parent.L),1]
}
diffs = diffs + sto.muts[which(phy$edge[,2]==j)]
parent.R = phy$edge[which(phy$edge[,2]==j),1]
while(parent.R!=common) {
diffs = diffs + sto.muts[which(phy$edge[,2]==parent.R)]
parent.R = phy$edge[which(phy$edge[,2]==parent.R),1]
}
freqs[diffs+1]=freqs[diffs+1]+1
}
}
windows()
dev.next()
tot=sum(freqs)
running=cumsum(freqs)
top=min(which(running==tot))
y=freqs[1:top]/sum(freqs)*100
my.hist(y)
}
my.rcoal2 = function (n=20, curr.popsize=1000,old.popsize=1,when=10)
{
tip.label = NULL
nbr <- 2 * n - 2
edge <- matrix(NA, nbr, 2)
x=0
for (i in seq(n,2,-1)){
x[(n+1)-i]=-2*curr.popsize*log(1-runif(1))/(i*(i-1))
}
rescale=which(cumsum(x)>when)
x[rescale]=x[rescale]*old.popsize/curr.popsize
edge.length <- numeric(nbr)
h <- numeric(2 * n - 1)
node.height <- cumsum(x)
pool <- 1:n
nextnode <- 2L * n - 1L
for (i in 1:(n - 1)) {
y <- sample(pool, size = 2)
ind <- (i - 1) * 2 + 1:2
edge[ind, 2] <- y
edge[ind, 1] <- nextnode
edge.length[ind] <- node.height[i] - h[y]
h[nextnode] <- node.height[i]
pool <- c(pool[!pool %in% y], nextnode)
nextnode <- nextnode - 1L
}
phy <- list(edge = edge, edge.length = edge.length)
if (is.null(tip.label))
tip.label <- paste("t", 1:n, sep = "")
phy$tip.label <- sample(tip.label)
phy$Nnode <- n - 1L
class(phy) <- "phylo"
phy <- reorder(phy)
phy$edge[phy$edge[, 2] <= n, 2] <- 1:n
phy
}
get.tree.depth = function (tree)
{
depth=0
N=NROW(tree$edge)
for(i in 1:N){
ichild=tree$edge[i,2]
depth=depth+tree$edge.length[i]
if(ichild<=20) return(depth)
}
return(depth)
}
Fig9.4and5=function (popsize=100)
{
library(ape)
tree=my.rcoal2(n=20,popsize,popsize,0)
depth = get.tree.depth(tree)
plot(c(0,20,0,20),c(depth,0,depth,0),type='n',axes=F,xlab='',ylab='')
plot.tree(tree,muts=F)
axis(2)
}
Figure 9.4 is drawn using the default value of 100 for the
female population size, while Figure 9.5 is drawn with a female
population size of 1,000.
Figure 9.6 - In the textbook,
this figure shows the simulations from Fig. 9.4 and 9.5 on the same
scale. If you want to do something like this then use the
"set.seed" command so you get the same simulations in Figure 9.6 (just
un-comment the "set.seed" statements below). Be sure to set the
seeds to these values before running the Fig9.4and5 script.
Fig9.6 = function (popsize1=100,popsize2=1000)
{
library(ape)
# set.seed(1234)
tree1=my.rcoal2(n=20,popsize1,popsize1,0)
depth1 = get.tree.depth(tree1)
# set.seed(5678)
tree2=my.rcoal2(n=20,popsize2,popsize2,0)
depth2 = get.tree.depth(tree2)
max.depth=max(c(depth1,depth2))
plot(c(0,46,0,46),c(max.depth,0,max.depth,0),type='n',axes=F,xlab='',ylab='')
plot.tree(tree1,muts=F)
plot.tree(tree2,x.offset=25,muts=F)
axis(2)
}
Figure 9.7 - This one adds
mutation and (assuming the infinite sites model - i.e., each mutation
is unique) draws the mismatch distribution (as in Figure 9.8).
my.hist = function (y)
{
n.rects=length(y)
plot(c(-.5,n.rects),c(0,max(y)),type='n',axes=F,xlab='No. of Differences',ylab='Percent')
for(i in 1:n.rects){
rect(i-1.5,0,i-.5,y[i],col='gray')
}
axis(1,at=0:100,pos=0)
axis(2)
}
Fig9.7 = function ()
{
library(ape)
# set.seed(4321)
tree=my.rcoal2(n=36,100,100,0)
depth=get.tree.depth(tree)
plot(c(0,37,0,37),c(depth,0,depth,0),type='n',axes=F,xlab='',ylab='')
plot.tree(tree,muts=T)
}
Figures 9.9 and 9.10 - This
script draws the Haida mismatch and Bella Coola mismatch
distributions. It also draws a model distribution under
population expansion. The textbook used Arlequin to draw the
model distribution, whicl the "R" script is using an earlier method due
to Alan Rogers. To draw these figures you will first need to get
the data into "R," by copying the below text into your "R" "Gui".
Tab_9.5 = structure(list(Seq = 1:17, P1 = c(4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), P2 = c(4L, 4L, 4L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), P3 = c(4L,
4L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 4L, 4L, 4L, 4L, 2L, 4L
), P4 = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L), P5 = c(1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 1L, 3L, 3L, 3L, 3L, 3L), P6 = c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 4L, 2L, 2L, 2L, 2L), P7 = c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 2L, 2L, 2L, 2L, 2L), P8 = c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 2L, 2L, 2L, 2L, 2L
), P9 = c(2L, 2L, 2L, 2L, 2L, 2L, 4L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), P10 = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 2L, 4L, 4L, 2L, 4L, 2L), P11 = c(4L, 4L, 4L, 4L, 4L,
4L, 4L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L), P12 = c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L
), P13 = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L, 4L,
4L, 4L, 4L, 2L), P14 = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), P15 = c(4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L, 4L, 4L, 4L), P16 = c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L, 4L, 4L
), P17 = c(4L, 4L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 4L, 4L,
4L, 4L, 2L, 2L), P18 = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), P19 = c(2L, 2L, 2L, 2L, 2L,
4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), P20 = c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L, 4L, 4L, 4L, 4L, 4L
), P21 = c(1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L,
1L, 1L, 3L, 3L), P22 = c(4L, 4L, 2L, 2L, 4L, 4L, 4L, 4L, 4L,
2L, 4L, 4L, 4L, 4L, 4L, 2L, 4L), P23 = c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), P24 = c(2L,
2L, 2L, 2L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 2L, 2L
), P25 = c(2L, 2L, 2L, 4L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 2L, 2L,
2L, 2L, 2L, 4L), Haida = c(1L, 20L, 2L, 0L, 10L, 1L, 1L, 1L,
1L, 3L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), BellaCoola = c(8L, 3L, 5L,
2L, 0L, 0L, 0L, 0L, 0L, 3L, 0L, 5L, 2L, 1L, 6L, 3L, 2L)), .Names = c("Seq",
"P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "P10",
"P11", "P12", "P13", "P14", "P15", "P16", "P17", "P18", "P19",
"P20", "P21", "P22", "P23", "P24", "P25", "Haida", "BellaCoola"
), class = "data.frame", row.names = c(NA, -17L))
Fig9.9and10 = function (which=1)
{
# NOTE: the default value of "which=1" will draw the Haida distribution. "which=2" draws the Bella Coola.
seq=as.matrix(Tab_9.5[,-c(1,27,28)],nc=25)
count=Tab_9.5[,which+26]
mis.match=function(seq1,seq2){
diff=0
for(i in 1:25) if(seq1[i]!=seq2[i]) diff=diff+1
return(diff)
}
n=NROW(seq)
count.difs=rep(0,26)
for(i in 1:(n-1)){
for(j in (i+1):n){
sto=mis.match(seq[i,],seq[j,])
count.difs[sto+1]=count.difs[sto+1]+count[i]*count[j]
}
}
for(i in 1:n){
if(count[i]>0) count.difs[1]=count.difs[1]+count[i]*(count[i]-1)/2
}
obs=count.difs/sum(count.difs)
plot(0:14,100*obs[1:15],type='l',xlab='Number of Base Differences',
ylab='Percent')
points(0:14,100*obs[1:15],pch=19)
difs=0:25
m=as.numeric(count.difs%*%difs/sum(count.difs))
v=as.numeric(count.difs%*%(difs-m)^2/sum(count.difs))
Theta0=sqrt(v-m)
tau=m-Theta0
expect=rep(0,14)
for(i in 0:15){
expect[i+1]=Theta0^i/((1+Theta0)^(i+1))
acum=0
for(j in 0:i){
acum=acum+((1+Theta0)/Theta0)^j*(tau^j*exp(-tau))/factorial(j)
}
expect[i+1]=expect[i+1]*acum
}
lines(0:14,100*expect[1:15],lty=2)
}
Figure 9.12 - This can be used to simulate a number of different scenarios.
Fig9.12 = function (old.popsize=2,when=8)
{
library(ape)
# set.seed(4321)
tree<-my.rcoal2(n=36,1000,old.popsize,when)
depth=get.tree.depth(tree)
plot(c(0,37,0,37),c(depth,0,depth,0),type='n',axes=F,xlab='',ylab='',
main=paste('Old size = ',old.popsize, 'Expansion at ',when))
mut.rate=(old.popsize-2)*(0.01-.25)/998+.25
plot.tree(tree)
}