Chapter 9

References

http://www.pyrosequencing.com/DynPage.aspx?id=7454 - Concise explanation of pyrosequencing, including an animation.

Margulies et al. (2005) - Description of a highly parallel method for whole genome sequencing based on pyrosequencing

http://www.454.com/products-solutions/how-it-works/index.asp - Concise description of 454's sequencing technology

FIGURES

Figures 9.1 - 9.3 were done in Powerpoint (TM), so no "R" scripts available for these

Figures 9.4 and 9.5

You will first need some scripts (mostly adapted from the "R" package "ape"), and you will need to download the "R" package "ape" (that's for "Analyses of Phylogenetics and Evolution").

plot.tree = function (phy,mut.rate=0,x.offset=0,muts=T)
{
    N <- length(phy$tip.label)
    x.pos = c(1:N,rep(NA,N-2))+x.offset
    y.pos = rep(0,2*N-1)

    for(i in seq(2*N-1,N+1,-1)){
    parent = which(phy$edge[,1]==i)
    point=phy$edge[parent,2]
    x.pos[i]=mean(x.pos[point])

    parent = parent[1]
    child=phy$edge[parent,2]
    height=phy$edge.length[parent]
    y.pos[i]=y.pos[i]+height

    while(child>N){
    parent = which(phy$edge[,1]==child)[1]
    child=phy$edge[parent,2]
    height=phy$edge.length[parent]
    y.pos[i]=y.pos[i]+height
    }
    }

    nodes=cbind(x.pos,y.pos)
    max.depth=max(nodes[,2])*1.2
    mut.rate=15/max.depth

    top=phy$edge[1,1]
    lines(rep(nodes[top,1],2),c(10^10,nodes[top,2]))
    sto.muts=0
    for(i in 1:(2*N-2)){
    top=phy$edge[i,1]
    bot=phy$edge[i,2]
    lines(c(nodes[top,1],nodes[bot,1],nodes[bot,1]),
          c(nodes[top,2],nodes[top,2],nodes[bot,2]))
    if(muts==T){
    n.mut=rpois(1,phy$edge.length[i]*mut.rate)
    sto.muts[i]=n.mut
    if(n.mut>0){
    y.top=nodes[top,2]
    y.bot=nodes[bot,2]
    nudge=(y.top-y.bot)/7
    y=runif(n.mut,y.bot+nudge,y.top-nudge)
    points(rep(nodes[bot,1],n.mut),y,pch=20)
    }

    }
    }
    if(muts==F) return(NULL)

    ancestors=mrca(phy)[1:36,1:36]
    freqs=rep(0,60)
    for(i in 1:35){
    for(j in (i+1):36){
    common=ancestors[i,j]

    diffs=0

    diffs = diffs + sto.muts[which(phy$edge[,2]==i)]
    parent.L = phy$edge[which(phy$edge[,2]==i),1]
    while(parent.L!=common) {
     diffs = diffs + sto.muts[which(phy$edge[,2]==parent.L)]
     parent.L = phy$edge[which(phy$edge[,2]==parent.L),1]
    }

    diffs = diffs + sto.muts[which(phy$edge[,2]==j)]

    parent.R = phy$edge[which(phy$edge[,2]==j),1]
    while(parent.R!=common) {
    diffs = diffs + sto.muts[which(phy$edge[,2]==parent.R)]
    parent.R = phy$edge[which(phy$edge[,2]==parent.R),1]
    }
    freqs[diffs+1]=freqs[diffs+1]+1
}
}
   windows()
   dev.next()
   tot=sum(freqs)
   running=cumsum(freqs)
   top=min(which(running==tot))
   y=freqs[1:top]/sum(freqs)*100
   my.hist(y)
}

my.rcoal2 = function (n=20, curr.popsize=1000,old.popsize=1,when=10)
{
    tip.label = NULL
    nbr <- 2 * n - 2
    edge <- matrix(NA, nbr, 2)
    x=0
    for (i in seq(n,2,-1)){
    x[(n+1)-i]=-2*curr.popsize*log(1-runif(1))/(i*(i-1))
    }

    rescale=which(cumsum(x)>when)
    x[rescale]=x[rescale]*old.popsize/curr.popsize

        edge.length <- numeric(nbr)
        h <- numeric(2 * n - 1)
        node.height <- cumsum(x)
        pool <- 1:n
        nextnode <- 2L * n - 1L
        for (i in 1:(n - 1)) {
            y <- sample(pool, size = 2)
            ind <- (i - 1) * 2 + 1:2
            edge[ind, 2] <- y
            edge[ind, 1] <- nextnode
            edge.length[ind] <- node.height[i] - h[y]
            h[nextnode] <- node.height[i]
            pool <- c(pool[!pool %in% y], nextnode)
            nextnode <- nextnode - 1L
        }

    phy <- list(edge = edge, edge.length = edge.length)
    if (is.null(tip.label))
        tip.label <- paste("t", 1:n, sep = "")
    phy$tip.label <- sample(tip.label)
    phy$Nnode <- n - 1L
    class(phy) <- "phylo"
    phy <- reorder(phy)
    phy$edge[phy$edge[, 2] <= n, 2] <- 1:n
    phy
}

get.tree.depth = function (tree)
{
depth=0
N=NROW(tree$edge)
for(i in 1:N){
ichild=tree$edge[i,2]
depth=depth+tree$edge.length[i]
if(ichild<=20) return(depth)
}
return(depth)
}

Fig9.4and5=function (popsize=100)
{
   library(ape)
   tree=my.rcoal2(n=20,popsize,popsize,0)
   depth = get.tree.depth(tree)
   plot(c(0,20,0,20),c(depth,0,depth,0),type='n',axes=F,xlab='',ylab='')
   plot.tree(tree,muts=F)
   axis(2)
}

Figure 9.4 is drawn using the default value of 100 for the female population size, while Figure 9.5 is drawn with a female population size of 1,000.

Figure 9.6 - In the textbook, this figure shows the simulations from Fig. 9.4 and 9.5 on the same scale. If you want to do something like this then use the "set.seed" command so you get the same simulations in Figure 9.6 (just un-comment the "set.seed" statements below). Be sure to set the seeds to these values before running the Fig9.4and5 script.

Fig9.6 = function (popsize1=100,popsize2=1000)
{
   library(ape)
# set.seed(1234)
   tree1=my.rcoal2(n=20,popsize1,popsize1,0)
   depth1 = get.tree.depth(tree1)
# set.seed(5678)
   tree2=my.rcoal2(n=20,popsize2,popsize2,0)
   depth2 = get.tree.depth(tree2)
   max.depth=max(c(depth1,depth2))
   plot(c(0,46,0,46),c(max.depth,0,max.depth,0),type='n',axes=F,xlab='',ylab='')
   plot.tree(tree1,muts=F)
   plot.tree(tree2,x.offset=25,muts=F)
   axis(2)
}

Figure 9.7 - This one adds mutation and (assuming the infinite sites model - i.e., each mutation is unique) draws the mismatch distribution (as in Figure 9.8).

my.hist = function (y)
{
n.rects=length(y)
plot(c(-.5,n.rects),c(0,max(y)),type='n',axes=F,xlab='No. of Differences',ylab='Percent')
for(i in 1:n.rects){
rect(i-1.5,0,i-.5,y[i],col='gray')
}
axis(1,at=0:100,pos=0)
axis(2)
}

Fig9.7 = function ()
{
   library(ape)
#   set.seed(4321)
   tree=my.rcoal2(n=36,100,100,0)
   depth=get.tree.depth(tree)
   plot(c(0,37,0,37),c(depth,0,depth,0),type='n',axes=F,xlab='',ylab='')
   plot.tree(tree,muts=T)
}

Figures 9.9 and 9.10 - This script draws the Haida mismatch and Bella Coola mismatch distributions. It also draws a model distribution under population expansion. The textbook used Arlequin to draw the model distribution, whicl the "R" script is using an earlier method due to Alan Rogers. To draw these figures you will first need to get the data into "R," by copying the below text into your "R" "Gui".

Tab_9.5 = structure(list(Seq = 1:17, P1 = c(4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), P2 = c(4L, 4L, 4L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), P3 = c(4L,
4L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 4L, 4L, 4L, 4L, 2L, 4L
), P4 = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L), P5 = c(1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 1L, 3L, 3L, 3L, 3L, 3L), P6 = c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 4L, 2L, 2L, 2L, 2L), P7 = c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 2L, 2L, 2L, 2L, 2L), P8 = c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 2L, 2L, 2L, 2L, 2L
), P9 = c(2L, 2L, 2L, 2L, 2L, 2L, 4L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), P10 = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 2L, 4L, 4L, 2L, 4L, 2L), P11 = c(4L, 4L, 4L, 4L, 4L,
4L, 4L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L), P12 = c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L
), P13 = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L, 4L,
4L, 4L, 4L, 2L), P14 = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), P15 = c(4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L, 4L, 4L, 4L), P16 = c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L, 4L, 4L
), P17 = c(4L, 4L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 4L, 4L,
4L, 4L, 2L, 2L), P18 = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), P19 = c(2L, 2L, 2L, 2L, 2L,
4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), P20 = c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L, 4L, 4L, 4L, 4L, 4L
), P21 = c(1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L,
1L, 1L, 3L, 3L), P22 = c(4L, 4L, 2L, 2L, 4L, 4L, 4L, 4L, 4L,
2L, 4L, 4L, 4L, 4L, 4L, 2L, 4L), P23 = c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), P24 = c(2L,
2L, 2L, 2L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 2L, 2L
), P25 = c(2L, 2L, 2L, 4L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 2L, 2L,
2L, 2L, 2L, 4L), Haida = c(1L, 20L, 2L, 0L, 10L, 1L, 1L, 1L,
1L, 3L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), BellaCoola = c(8L, 3L, 5L,
2L, 0L, 0L, 0L, 0L, 0L, 3L, 0L, 5L, 2L, 1L, 6L, 3L, 2L)), .Names = c("Seq",
"P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "P10",
"P11", "P12", "P13", "P14", "P15", "P16", "P17", "P18", "P19",
"P20", "P21", "P22", "P23", "P24", "P25", "Haida", "BellaCoola"
), class = "data.frame", row.names = c(NA, -17L))

Fig9.9and10 = function (which=1)
{
# NOTE: the default value of "which=1" will draw the Haida distribution. "which=2" draws the Bella Coola.
seq=as.matrix(Tab_9.5[,-c(1,27,28)],nc=25)
count=Tab_9.5[,which+26]
mis.match=function(seq1,seq2){
diff=0
for(i in 1:25) if(seq1[i]!=seq2[i]) diff=diff+1
return(diff)
}
n=NROW(seq)
count.difs=rep(0,26)
for(i in 1:(n-1)){
for(j in (i+1):n){
    sto=mis.match(seq[i,],seq[j,])
    count.difs[sto+1]=count.difs[sto+1]+count[i]*count[j]
}
}
for(i in 1:n){
if(count[i]>0) count.difs[1]=count.difs[1]+count[i]*(count[i]-1)/2
}
obs=count.difs/sum(count.difs)
plot(0:14,100*obs[1:15],type='l',xlab='Number of Base Differences',
   ylab='Percent')
points(0:14,100*obs[1:15],pch=19)
difs=0:25
m=as.numeric(count.difs%*%difs/sum(count.difs))
v=as.numeric(count.difs%*%(difs-m)^2/sum(count.difs))
Theta0=sqrt(v-m)
tau=m-Theta0
expect=rep(0,14)
for(i in 0:15){
expect[i+1]=Theta0^i/((1+Theta0)^(i+1))
acum=0
for(j in 0:i){
    acum=acum+((1+Theta0)/Theta0)^j*(tau^j*exp(-tau))/factorial(j)
}
expect[i+1]=expect[i+1]*acum
}
lines(0:14,100*expect[1:15],lty=2)
}

Figure 9.12 - This can be used to simulate a number of different scenarios.

Fig9.12 = function (old.popsize=2,when=8)
{
   library(ape)
#   set.seed(4321)
   tree<-my.rcoal2(n=36,1000,old.popsize,when)
   depth=get.tree.depth(tree)
   plot(c(0,37,0,37),c(depth,0,depth,0),type='n',axes=F,xlab='',ylab='',
   main=paste('Old size = ',old.popsize, 'Expansion at ',when))
   mut.rate=(old.popsize-2)*(0.01-.25)/998+.25
   plot.tree(tree)
}