Latent Traits ("Transition Analysis")

The link above is for a demonstration (using "R") of transition analysis and some related material.  The kicker at the end is the R script LM.test, which is a specification test for the cumulative probit with age on the straight scale or the log scale.  But today we'll be looking at the same model using a latent trait approach.

 
Getting Some Data

It's time to grab us some more data.  You know the drill now:

Buckberry.csv

Buckberry=read.csv('Buckberry.csv')

Now check to see that this really exists in your workspace (lines beginning with the ">" prompt show what I actually typed):

> Buckberry=read.csv('Buckberry.csv')

> Buckberry[1:10,]
   Age TO ST MI MA AP Sex
1   16  1  1  1  1  2   F
2   17  2  1  1  1  1   F
3   19  1  2  1  1  1   F
4   23  3  2  2  1  2   F
5   27  1  1  3  1  1   F
6   27  2  2  2  1  1   F
7   29  4  3  3  1  2   F
8   29  2  3  3  1  2   F
9   30  3  3  2  1  2   F
10  34  4  3  3  1  2   F


This is reference data on 180 cases from Buckberry and Chamberlain (2002) for scores on the auricular surface.  The variables are:

  1. Transverse organization
  2. Surface texture
  3. Microporosity
  4. Macroporosity
  5. Apical changes
Just for "fun", let's take a quick look at tabulations of the data:

> for(i in 2:7) print(table(Buckberry[,i]))

 1  2  3  4  5
 3 27 31 96 23

  1   2   3   4   5
  4  18 100  54   4

  1   2   3
 13  31 136

 1  2  3
92 58 30

  1   2   3
 39 118  23

 F  M
94 86


All looks good, except maybe having only three cases in stage 1 of "transverse organization" is a bit of a problem, so let's recode a bit (collapse stages 1 and 2 into a single first stage):

 Buck2 = cbind(Buckberry[,1],c(1,1,2,3,4)[Buckberry[,2]],Buckberry[,3:7])
 colnames(Buck2) = colnames(Buckberry)

Now let's start with a terrible assumption, which is that transverse organization bears no relationship to age.  If this is the case, then we can simply model transverse organization by placing three thresholds on a standard normal distribution.  Let's build this up in little baby steps:

> table(Buck2$TO)
 1  2  3  4
30 31 96 23

> cumsum(table(Buck2$TO))
  1   2   3   4
 30  61 157 180

> cumsum(table(Buck2$TO))/180
        1         2         3         4 
0.1666667 0.3388889 0.8722222 1.0000000 

> qnorm(cumsum(table(Buck2$TO))/180)
         1          2          3          4
-0.9674216 -0.4154975  1.1369587        Inf

That was the tedious way, so let's jump in feet first and do a probit regression.

library(MASS)
polr(factor(Buck2[,2])~1,method='probit')
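
The thresholds land in the zeta component of the polr fit.  As a quick sanity check (the object name probit0 here is arbitrary), they should be close to the hand-computed qnorm values from above, roughly -0.97, -0.42, and 1.14:

probit0 = polr(factor(Buck2[,2])~1,method='probit')
probit0$zeta   # compare with qnorm(cumsum(table(Buck2$TO))/180)[-4]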

A picture is worth a thousand words (?),  so let's plot this:

plot1.latent = function ()
{
N = NROW(Buck2)
plot(seq(-3,3,.01),dnorm(seq(-3,3,.01)),type='l',
     xlab='\"Transverse Organizationess\"',ylab='Density',ylim=c(0,.5))
threshes=qnorm(cumsum(table(Buck2$TO))/N)[-4]
for(i in 1:3) lines(rep(threshes[i],2),c(0,1))
percents = noquote(paste(round(table(Buck2$TO)/N*100,1),'%',sep=''))
x = c(-3,threshes,3)
moving.avg = function(x){
  n=length(x)
  x.ma=0
  for(i in 1:(n-1)){
    x.ma[i]=(x[i]+x[i+1])/2
  }
  return(x.ma)
}
x = moving.avg(x)
text(x,rep(0.01,4),percents)
text(x,rep(0.5,4),c('A lot','Some','Less','None'))
}

But now we want to start considering age as an explanatory variable for our latent trait (of "transverse organizationess").  First make sure that you have the package VGAM installed:

library(VGAM)
set.seed(123456)

vglm(Buck2[,2]~runif(180,-1000,1000),cumulative(link='probit',parallel = T, reverse = F))

Call:
vglm(formula = Buck2[, 2] ~ runif(180, -1000, 1000), family = cumulative(link = "probit",
    parallel = T, reverse = F))

Coefficients:
          (Intercept):1           (Intercept):2           (Intercept):3 runif(180, -1000, 1000)
          -0.9683654868           -0.4100035770            1.1569758607            0.0002170696

Degrees of Freedom: 540 Total; 536 Residual
Residual deviance: 429.382
Log-likelihood: -214.691

OK, that was a silly example, and I lied about using age.  Instead, we regressed on random uniform deviates between -1000 and 1000.  Note how our thresholds from the previous graph end up as the "intercepts" in this cumulative probit model.  Now let's try this for real, using log age:

> vglm(Buck2[,2]~log(Buck2[,1]),cumulative(link='probit',parallel = T, reverse = F))
Call:
vglm(formula = Buck2[, 2] ~ log(Buck2[, 1]), family = cumulative(link = "probit",
    parallel = T, reverse = F))

Coefficients:
  (Intercept):1   (Intercept):2   (Intercept):3 log(Buck2[, 1])
       5.364422        6.052758        7.786678       -1.633510

Degrees of Freedom: 540 Total; 536 Residual
Residual deviance: 386.6864
Log-likelihood: -193.3432


And let's plot this:

plot2.latent = function ()
{
threshes = c(5.364422, 6.052758, 7.786678)
slope = 1.633510
ages = c(20,35,50)
plot(ages,slope*log(ages),ylim=c(0,9),xlim=c(0,50),
   type='n',xlab='Age',ylab='Latent trait',
   main='Zero Intercept Parameterization')
for(i in 1:3){
   mu = log(ages[i])*slope
   a = seq(mu-3,mu+3,0.01)
   dens = dnorm(a,mu,1)
   lines(ages[i]-20*dens,a)
   lines(c(ages[i]-20*dnorm(0),ages[i]),rep(mu,2),lty=2)
 }
for(i in 1:3){
 lines(c(-10,120),rep(threshes[i],2),lty=2,lwd=2)
 lines(rep(ages[i],2),c(0,100))
 points(ages[i],log(ages[i])*slope,pch=19)
}
ages=1:60
lines(ages,log(ages)*slope)
}

That was the "zero intercept parameterization," but there's also a "zero first threshold parameterization":

plot2b.latent = function ()
{
threshes = c(5.364422, 6.052758, 7.786678)
intercept = threshes[1]
threshes = threshes - intercept
slope = 1.633510
ages = c(20,35,50)
plot(ages,slope*log(ages),ylim=c(-5.4,4),xlim=c(0,50),
   type='n',xlab='Age',ylab='Latent trait',
   main='Zero First Threshold Parameterization')
for(i in 1:3){
   mu = log(ages[i])*slope - intercept
   a = seq(mu-3,mu+3,0.01)
   dens = dnorm(a,mu,1)
   lines(ages[i]-20*dens,a)
   lines(c(ages[i]-20*dnorm(0),ages[i]),rep(mu,2),lty=2)
 }
for(i in 1:3){
 lines(c(0,120),rep(threshes[i],2),lty=2,lwd=2)
 lines(rep(ages[i],2),c(-100,100))
 points(ages[i],log(ages[i])*slope-intercept,pch=19)
}
ages=1:50
lines(ages,log(ages)*slope-intercept)
}


Now, why did we use log age instead of the straight scale?  It is easiest to answer this by slipping into the transition analysis paradigm: dividing each threshold by the slope gives the mean (log) age-at-transition, and the reciprocal of the slope gives the standard deviation.

plot3.latent = function ()
{
library(VGAM)
par(mfrow=c(2,2))
threshes = c(5.364422, 6.052758, 7.786678)
slope = 1.633510
sdev = 1/slope
mu = threshes*sdev
age = 1:100
plot(age,dlnorm(age,mu[1],sdev),type='l',xlab='Age-at-transition',
    ylab='Density',axes=F,main='Log normal transition')
box()
axis(1)
for(i in 2:3) lines(age,dlnorm(age,mu[i],sdev))

parms = as.vector(coef(vglm(Buck2[,2]~Buck2[,1],cumulative(link='probit',parallel = T,
     reverse = F))))
sdev = 1/(-parms[4])
mu = parms[1:3]*sdev
plot(age,dnorm(age,mu[1],sdev),type='l',xlab='Age-at-transition',
    ylab='Density',axes=F,main='Normal transition')
box()
axis(1)
for(i in 2:3) lines(age,dnorm(age,mu[i],sdev))


parms = as.vector(coef(vglm(Buck2[,2]~Buck2[,1],cumulative(link='probit',parallel = F,
     reverse = F))))
sdev = 1/(-parms[4:6])
mu = parms[1:3]*sdev
plot(age,dnorm(age,mu[1],sdev[1]),type='l',xlab='Age-at-transition',
    ylab='Density',axes=F,main='Normal with variable sd')
box()
axis(1)
for(i in 2:3) lines(age,dnorm(age,mu[i],sdev[i]))

plot(age,1-pnorm(age,mu[1],sdev[1]),type='l',xlab='Age-at-transition',
    ylab='Cumulative Density',main='Normal with variable sd',ylim=c(0,1))
for(i in 2:3) lines(age,1-pnorm(age,mu[i],sdev[i]))
lines(c(0,100),rep(1,2))

cat('\n\n')
cat(noquote('type \"par(mfrow=c(1,1))\" or you will be stuck with four plots per page'))
cat('\n\n')
}

OK, we could integrate across a uniform prior for age to get the posterior density of age conditional on observing that someone was in, say, the second stage.   But let's save ourselves the trouble (?) and start heading off to OpenBUGS.  But first, a parting glance at R.  Oh, and the package "alabama" is for "Augmented Lagrangian Adaptive Barrier Minimization Algorithm," not for the great state of Alabama.
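
Just to make that hand-waving concrete, here is a minimal sketch of the integration for someone in stage 2 of "transverse organization," reusing the thresholds and slope from the vglm fit above and assuming a uniform prior on age (the function name and the 0-to-150 integration range are arbitrary choices here):

post.stage2 = function(){
  threshes = c(5.364422, 6.052758, 7.786678)
  slope = 1.633510
# P(stage 2 | age) from the log-age cumulative probit
  p.stage2 = function(age) pnorm(threshes[2]-slope*log(age)) -
                           pnorm(threshes[1]-slope*log(age))
# normalizing constant: integrate over a uniform prior for age
  denom = integrate(Vectorize(p.stage2),0.01,150)$val
  posterior = function(age) p.stage2(age)/denom
  curve(posterior(x),from=1,to=100,xlab='Age',ylab='Posterior density')
}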

my.probit=function(x=log(Buck2[,1]),y=Buck2[,2])
{
library(alabama)
N = length(y)
N.stages = length(table(y))
LK = function(theta){
  lnLK = 0
  alpha = theta[1:(N.stages-1)]
  beta = theta[N.stages]
  for(i in 1:N){
     if(y[i]==1) lnLK = lnLK + log(pnorm(alpha[1]-x[i]*beta)) 
     if(y[i]>1 & y[i]<N.stages){
       temp1 = pnorm(alpha[y[i]]-x[i]*beta)
       temp2 = pnorm(alpha[y[i]-1]-x[i]*beta)
       temp =  -743.7469
       if(temp1-temp2>1e-323) temp = log(temp1-temp2)
       lnLK = lnLK + temp
       }
     if(y[i]==N.stages) lnLK = lnLK + log(1-pnorm(alpha[N.stages-1]-x[i]*beta))
  }
  return(-lnLK)
}

hin=function(xx){
 h=0
# thresholds must be in increasing order
 for(i in 2:(N.stages-1)){
  h[i-1]=xx[i]-xx[i-1]
 }
# slope on log age must be positive
 h[N.stages-1]=xx[N.stages]
 return(h)
}

start.theta = c(qnorm(cumsum(table(y))/N)[1:(N.stages-1)],.01)
sto = constrOptim.nl(par=start.theta,fn=LK,hin=hin,control.outer=list(trace=F))
return(list(par=sto$par,lnLK=-sto$val,conv=sto$conv))
}
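
A hypothetical call (the object name ml.fit is arbitrary); presumably a fit like this is where the fixed threshold and slope values in the OpenBUGS data list below come from (about 5.32, 6.01, 7.74, and 1.62):

ml.fit = my.probit()
ml.fit$par   # three thresholds followed by the slope on log age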

Here is the OpenBUGS model, which treats the slope and thresholds (here fixed at values from the maximum likelihood fit) as known and samples the posterior density of age for someone observed in each of the four stages:

model
{
for(i in 1:4){
age[i] ~ dunif(0,200)
log.age[i] <- log(age[i])
pred[i] <-1.622458*log.age[i]
y.star[i] ~ dnorm(pred[i],1) I(thresh[i], thresh[i+1])
}
}

#data
list(thresh=c(-100,5.321429, 6.008817, 7.741257,100))

We could use Markov Chain Monte Carlo (MCMC) in order to estimate the  cumulative probit model in "R."  The code here is from  Scott Lynch's (2007) excellent text, specifically from page 222.  The code and data from his book are available as a zip file from: http://www.princeton.edu/~slynch/index_files/Page781.htm

MCMC.probit = function ()
{
x=cbind(rep(1,180),log(Buck2[,1]))
y=Buck2[,2]
N = NROW(y)
t=matrix(0,5)
t[1]=-Inf; t[2]<-0; t[5]<-Inf
t[3]=qnorm(sum(y<=2)/N,-qnorm(sum(y==1)/N,0,1),1)
t[4]=qnorm(sum(y<=3)/N,-qnorm(sum(y==1)/N,0,1),1)

b=matrix(0,2); vb=solve(t(x)%*%x); ch=chol(vb)

sto = matrix(rep(0,4*1000),nc=4)

ip=0
for(i in 2:10000){
#simulate latent data from truncated normal distributions
xb=as.matrix(x%*%b)
ystar=qnorm(runif(N,min=pnorm(t[y],xb,1),
                          max=pnorm(t[y+1],xb,1)),mean=xb,1)

#simulate thresholds
for(k in 3:4){t[k]=runif(1,min=max(ystar[y==k-1]), max=min(ystar[y==k]))}

#simulate beta vector from appropriate mvn
b<-vb%*%(t(x)%*%ystar) + t((rnorm(2,mean=0,sd=1))%*%ch)


if(i%%10==0){
ip=ip+1
sto[ip,]=c(b[2],-b[1],t[3]-b[1],t[4]-b[1])
}
}
return(sto)
}
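
A sketch of how you might summarize the output; my reading of the sto assignment above is that column 1 holds the slope and columns 2 through 4 hold the three thresholds in the zero intercept parameterization:

probit.draws = MCMC.probit()
apply(probit.draws,2,mean)                            # posterior means
apply(probit.draws,2,quantile,probs=c(0.025,0.975))   # 95% credible intervals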

And now, though it is rather "kludgy" for this, we can do this in OpenBUGS as well, and get full posterior densities for age in each of the four stages:

model
{

# Individuals are sorted on stage, so:

# Loop for those in stage 1
  for(j in bot[1]:(bot[2]-1)){
      pred[j]<- B[1] + B[2]*ln.age[j]
      y.star[j] ~ dnorm(pred[j],1)I(-100,0)
    }

# Loop for those in stage 2
  for(j in bot[2]:(bot[3]-1)){
      pred[j]<- B[1] + B[2]*ln.age[j]
      y.star[j] ~ dnorm(pred[j],1)I(0,thresh[1])
    }
    hi.2 <- ranked(y.star[bot[2]:(bot[3]-1)], counts[2])

# Loop for those in stage 3
  for(j in bot[3]:(bot[4]-1)){
      pred[j]<- B[1] + B[2]*ln.age[j]
      y.star[j] ~ dnorm(pred[j],1) I(thresh[1],thresh[2])
    }
    lo.3 <- ranked(y.star[bot[3]:(bot[4]-1)], 1) 
    hi.3 <- ranked(y.star[bot[3]:(bot[4]-1)], counts[3])

# Loop for those in stage 4

  for(j in bot[4]:(bot[5]-1)){
      pred[j]<- B[1] + B[2]*ln.age[j]
      y.star[j] ~ dnorm(pred[j],1) I(thresh[2],100)
    }
    lo.4 <- ranked(y.star[bot[4]:(bot[5]-1)], 1)

# Sample new thresholds for stage 2/3 and 3/4

  thresh[1] ~ dunif(hi.2,lo.3)
  thresh[2] ~ dunif(hi.3,lo.4)

# Sample regression coefficients (intercept and slope)

  sum.y.star <- sum(y.star[])
  in.prod.xy <- inprod(ln.age[],y.star[])
  pred.B[1] <- vb[1,1]*sum.y.star + vb[1,2]*in.prod.xy
  pred.B[2] <- vb[2,1]*sum.y.star + vb[2,2]*in.prod.xy
  B[1:2] ~ dmnorm(pred.B[1:2],xpx[1:2,1:2])

# Get posterior density of age for someone in each of the 4 stages

  age[1] ~ dunif(0,200)
  log.age[1] <- log(age[1])
  pred.1 <- B[1] + B[2]*log.age[1]
  y.1 ~ dnorm(pred.1,1) I(-100,0)

  age[2] ~ dunif(0,200)
  log.age[2] <- log(age[2])
  pred.2 <- B[1] + B[2]*log.age[2]
  y.2 ~ dnorm(pred.2,1) I(0, thresh[1])

  age[3] ~ dunif(0,200)
  log.age[3] <- log(age[3])
  pred.3 <- B[1] + B[2]*log.age[3]
  y.3 ~ dnorm(pred.3,1) I(thresh[1],thresh[2])

  age[4] ~ dunif(0,200)
  log.age[4] <- log(age[4])
  pred.4 <- B[1] + B[2]*log.age[4]
  y.4 ~ dnorm(pred.4,1) I(thresh[2],100)
}

Data
list(ln.age = c(2.772589,2.833213,2.944439,3.295837,3.295837,3.367296,
3.555348,3.555348,3.637586,3.713572,3.806662,3.871201,3.970292,
3.970292,4.043051,4.174387,2.772589,3.044522,3.218876,3.258097,
3.526361,3.583519,3.610918,3.637586,3.663562,3.663562,3.78419,
3.951244,4.060443,4.330733,3.135494,3.401197,3.610918,3.610918,
3.663562,3.78419,4.007333,4.025352,4.110874,4.158883,4.204693,
4.219508,4.248495,4.276666,4.290459,4.465908,3.091042,3.295837,
3.465736,3.526361,3.583519,3.828641,3.850148,4.060443,4.094345,
4.143135,4.143135,4.143135,4.234107,4.317488,4.394449,3.367296,
3.526361,3.7612,3.806662,3.828641,3.850148,3.850148,3.89182,
3.89182,3.912023,3.912023,3.912023,3.951244,3.951244,3.970292,
3.970292,4.007333,4.007333,4.025352,4.025352,4.043051,4.043051,
4.060443,4.077537,4.094345,4.110874,4.110874,4.127134,4.143135,
4.174387,4.174387,4.174387,4.174387,4.204693,4.204693,4.219508,
4.219508,4.248495,4.26268,4.290459,4.304065,4.330733,4.330733,
4.330733,4.356709,4.369448,4.382027,4.382027,4.418841,4.442651,
4.465908,4.488636,2.890372,3.465736,3.526361,3.526361,3.663562,
3.688879,3.850148,3.871201,3.912023,3.931826,3.931826,3.951244,
3.970292,3.988984,4.043051,4.060443,4.060443,4.060443,4.094345,
4.094345,4.110874,4.110874,4.127134,4.143135,4.158883,4.158883,
4.174387,4.189655,4.189655,4.204693,4.219508,4.234107,4.248495,
4.248495,4.248495,4.26268,4.26268,4.276666,4.276666,4.343805,
4.369448,4.477337,4.51086,4.521789,3.663562,3.951244,3.970292,
4.219508,4.343805,4.343805,4.356709,4.369448,4.394449,4.442651,
3.465736,3.806662,3.89182,3.970292,4.025352,4.143135,4.158883,
4.189655,4.204693,4.219508,4.234107,4.26268,4.356709),
bot=c(1,31,62,158,181),
counts=c(30, 31, 96, 23),
xpx = structure(.Data=c(180,715.1141,715.1141,2865.3546),.Dim=c(2,2)),
vb =
structure(.Data=c(0.6548306,-0.1634278,-0.1634278, 0.0411361),
.Dim=c(2,2)))

inits
list(thresh=c(0.7,2.3),B=c(-5,1.6))

Now, let's go multivariate, or at least bivariate, and consider both "transverse organization" and "surface texture."  Here is the R code using maximum likelihood estimation:

full.biv = function (it1 = 1, it2 = 2)
{
    library(mvtnorm)
    library(MASS)
    library(alabama)
    sto = Buck2
    sto = sto[complete.cases(sto), ]
    N = NROW(sto)
    age <- log(sto[, 1])
    t1 = sto[, 2]
    probit1 = polr(factor(t1) ~ age, method = "probit")
    beta1 = probit1$coeff
    alpha1 = probit1$zeta
    NS1 = NROW(alpha1) + 1
    t2 = sto[, 3]
    probit2 = polr(factor(t2) ~ age, method = "probit")
    beta2 = probit2$coeff
    alpha2 = probit2$zeta
    NS2 = NROW(alpha2) + 1

    lnlk <- function(xpar) {
        alpha1 = xpar[1:3]
        beta1 = xpar[4]
        alpha2 = xpar[5:8]
        beta2 = xpar[9]
        L <- rep(0, 2)
        R <- rep(0, 2)
        corr <- matrix(c(1, xpar[10], xpar[10], 1), nc = 2)
        LLK <- 0
        for (i in 1:N) {
            j <- sto[i, 2]
            if (j == 1) {
                L[1] <- -Inf
                R[1] <- alpha1[1] - beta1 * age[i]
            }
            if (j > 1 & j < NS1) {
                L[1] <- alpha1[j - 1] - beta1 * age[i]
                R[1] <- alpha1[j] - beta1 * age[i]
            }
            if (j >= NS1) {
                L[1] <- alpha1[NS1 - 1] - beta1 * age[i]
                R[1] <- Inf
            }
            j <- sto[i, 3]
            if (j == 1) {
                L[2] <- -Inf
                R[2] <- alpha2[1] - beta2 * age[i]
            }
            if (j > 1 & j < NS2) {
                L[2] <- alpha2[j - 1] - beta2 * age[i]
                R[2] <- alpha2[j] - beta2 * age[i]
            }
            if (j >= NS2) {
                L[2] <- alpha2[NS2 - 1] - beta2 * age[i]
                R[2] <- Inf
            }
            if (L[1] > R[1]) {
                L[1] <- R[1]
            }
            if (L[2] > R[2]) {
                L[2] <- R[2]
            }
            LLK <- LLK + log(pmvnorm(lower = L, upper = R, corr = corr)[1])
        }
        cat(LLK,'\r')
        flush.console()
        return(-LLK)
    }

    hin=function(xx){
    h=0
#   thresholds for trait 1 are ordered
    for(i in 2:3){
      h[i-1]=xx[i]-xx[i-1]
    }
#   slope for trait 1 must be positive
    h[3]=xx[4]
#   thresholds for trait 2 are ordered
    for(i in 6:8){
      h[i-2]=xx[i]-xx[i-1]
    }
#   slope for trait 2 must be positive
    h[7]=xx[9]
#   constrain correlation between -1 and 1
    h[8] = 1 - abs(xx[10])
    return(h)
    }


     start = c(alpha1,beta1,alpha2,beta2,0)
     names(start) = c(names(start)[-10],'r')
     cat('\n\n')
     return(constrOptim.nl(par=start,fn=lnlk,hin=hin))
}
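
A hypothetical run with the defaults (this is slow, since every likelihood evaluation loops over 180 bivariate normal rectangle probabilities); presumably the thresholds, slopes, and correlation used in the plotting function below came from a fit like this:

biv.fit = full.biv()
biv.fit$par   # thresholds and slope for TO, thresholds and slope for ST, then r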

And now let's look at that:

plot4.latent = function ()
{
library(ellipse)
threshes1=c(5.3466749,6.0256478,7.7539226)
slope1=1.6266498 
threshes2=c(5.6870261,7.0219120,9.1136953,10.9010736)
slope2=2.1385648 
r = 0.4549523
ages = c(15,85)
prec=solve(matrix(c(1,r,r,1),nc=2))
print(prec)

plot(ellipse(r,centre=c(slope1*log(ages[1]),slope2*log(ages[1])),level=.1),type='l',
   xlim=c(2,10),ylim=c(3,12),xlab='Transverse Organization',
   ylab='Surface Texture',
   main='Isodensity ellipses for 15 and 85 year-olds',lty=2)
for(i in 1:9){
lines(ellipse(r,centre=c(slope1*log(ages[1]),slope2*log(ages[1])),level=i/10),lty=2)
}
for(i in 1:9){
lines(ellipse(r,centre=c(slope1*log(ages[2]),slope2*log(ages[2])),level=i/10),lty=2)
}
for (i in 1:3) lines(rep(threshes1[i],2),c(-100,100))
for (i in 1:4) lines(c(-100,100),rep(threshes2[i],2))

ages=c(1,300)
lines(slope1*log(ages),slope2*log(ages))
points(c(slope1*log(15),slope1*log(85)),c(slope2*log(15),slope2*log(85)),pch=19)
}


We can now go back to OpenBUGS to try to get posterior densities for age, but it is rather fraught with peril.  The code below is for someone in the first stage of "transverse organization" and the second stage of "surface texture."  The "fraught with peril" comment relates to generating starting values.

model
{
age ~ dunif(0,200)
log.age <- log(age)
pred[1] <- 1.6266498*log.age
pred[2] <- 2.1385648*log.age
y.star[1:2] ~ dmnorm(pred[1:2],prec[1:2,1:2]) I(thresh1[1:2], thresh2[2:3])

}

#data
list(thresh1=c(-100,5.3466749,6.0256478,7.7539226,100),
thresh2=c(-100,5.6870261,7.0219120,9.1136953,10.9010736,100),
prec=structure(.Data=c(1.261005, -0.573697,-0.573697, 1.261005),
.Dim=c(2,2)))

It is probably simpler in the long run to handle this using "R."  The code below uses a Metropolis sampler to move through trial ages and Gibbs sampling to simulate the two latent traits:

Metrop2 = function (start.age=50,stage1 = 1, stage2 = 2, n.iters = 1000, jump.sd = 45, thin=5)
{
library(mvtnorm)
library(tmvtnorm)

thresh1 = c(-Inf, 5.3466749, 6.0256478, 7.7539226, Inf)
beta1 = 1.6266498 
thresh2 = c(-Inf, 5.6870261, 7.0219120, 9.1136953, 10.9010736, Inf)
beta2 = 2.1385648 
r = 0.4549523
age = start.age
sigma=matrix(c(1,r,r,1),nc=2)

sto=start.age

i.sto=0
acc=0
for(i in 1:(n.iters*thin)){
mu = c(beta1*log(age),beta2*log(age))
z.star = rtmvnorm(1,mean=mu,sigma,lower=c(thresh1[stage1],thresh2[stage2]),
  upper=c(thresh1[stage1+1],thresh2[stage2+1]),algorithm='gibbs')
trial.age=age + rnorm(1,0,jump.sd)
if(trial.age<=0) trial.age=age
p.stay = dmvnorm(z.star,mu,sigma)
mu = c(beta1*log(trial.age),beta2*log(trial.age))
p.move = dmvnorm(z.star,mu,sigma)
Ratio=p.move/p.stay
u=runif(1,0,1)
if(u<=Ratio) {
age=trial.age
acc=acc+1
}

if(i%%thin==0){
  i.sto=i.sto+1
  sto[i.sto]=age
  cat(i.sto,round(acc/i*100,2),'\r')
  flush.console()

}
}
cat('\n\n')
return(sto)
}


Assuming you stored the results, like junk=Metrop2(), you can then find quantiles to get the lower 0.025 tail and the upper 0.975 tail.  You can also use emp.hpd, which is from the TeachingDemos package, to get highest posterior density regions:

emp.hpd=function(x, conf=0.95){
        conf <- min(conf, 1-conf)
        n <- length(x)
        nn <- round( n*conf )
        x <- sort(x)
        xx <- x[ (n-nn+1):n ] - x[1:nn]
        m <- min(xx)
        nnn <- which(xx==m)[1]
        return( c( x[ nnn ], x[ n-nn+nnn ] ) )
}
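
For example, assuming you saved the sampler output as junk:

junk = Metrop2(stage1=1,stage2=2)
quantile(junk,c(0.025,0.975))   # equal-tailed 95% interval
emp.hpd(junk)                   # 95% highest posterior density region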

And just as we were able to use code from Scott Lynch's (2007) book for the univariate cumulative probit fit by MCMC, we can use code from his book (pp. 297-299) to implement the bivariate fit.  This is going to be incredibly useful, because optimizing the likelihood in higher-dimensional problems is not going to be a good option.

multiv.MCMC = function (N.saved.iters=1000,jump.sd=.1,x=log(Buck2[,1]),
     z=Buck2[,2:3],thin=5)
{
#R program for multivariate probit model

N = length(x)
x = cbind(rep(1,N),x)

# d is the dimensionality of the probit (from the number of columns in "z")
# k is the number of explanatory variables (plus 1 for the intercept).  Ordinarily
# k=2, for age and intercept.
d=NCOL(z)
k=NCOL(x)
n.stages = apply(z,2,max)
max.stages=max(n.stages)

#create variables and starting values
zstar=matrix(0,nrow(z),d)
b=matrix(0,(d*k))
s=cs=diag(d)

tz=matrix(0,d,max.stages+1)

tz[,1]=-Inf; tz[,2]=0;
for(i in 1:d){
  if(n.stages[i]>2){
  for(j in 3:(n.stages[i])){
    tz[i,j]= qnorm(sum(z[,i]<=(j-1))/nrow(z), mean=-qnorm(sum(z[,i]==(j-2))/nrow(z),mean=0,sd=1), sd=1)
  }
  }
  tz[i,n.stages[i]+1]=Inf
}

ctz=tz
acc1=acc2=acctot=0
n.corrs=d*(d-1)/2
nc.results=sum(n.stages)+n.corrs
temp = rep(NA,sum(n.stages))
results=matrix(0,nc=nc.results,nr=N.saved.iters)


#begin Gibbs sampling
ip=0
acc=0
iter=0
for(i in 1:(thin*N.saved.iters)){

#draw latent data: one-iteration gibbs sampler for truncated mvn simulation

bb=matrix(b,2,d)
m=x%*%bb
for(j in 1:d)
{
mm = m[,j] + (zstar[,-j]-m[,-j])%*%solve(s[-j,-j])%*%s[j,-j]
ss = s[j,j]-t(s[j,-j])%*%(solve(s[-j,-j]))%*%s[j,-j]
zstar[,j]=qnorm(runif(nrow(z),
min=pnorm(tz[j,z[,j]],mm,sqrt(ss)),
max=pnorm(tz[j,(z[,j]+1)],mm,sqrt(ss))),mean=mm,sd=sqrt(ss))
}

# draw new thresholds from uniforms defined by max(zstar) for stage below
# and min(zstar) for stage above threshold.  Note that Cowles MH sampler would be
# needed for large reference samples (typically not the case that they are)

for(ii in 1:d){
  if(n.stages[ii]>2){
  for(jj in 3:(n.stages[ii])){
    tz[ii,jj]= runif(1,max(zstar[z[,ii]==(jj-1),ii]),min(zstar[z[,ii]==jj,ii]))
  }
  }
}

#draw b from mvn
vb=solve(solve(s)%x%(t(x)%*%x))
mn=vb%*%(as.vector(t(x)%*%zstar%*%t(solve(s))))
b=mn+t(rnorm((d*k),0,1)%*%chol(vb))


#use metropolis-hastings sampling to draw sigma
e=matrix((as.vector(zstar)-(diag(d)%x%x%*%b)),nrow(z),d)
v=t(e)%*%e

 for(j in 1:(d-1)){
 for(jj in (j+1):d){
   cs=s
   iter=iter+1
   like=-.5*(nrow(z)+n.corrs+1)*log(det(s))-.5*sum(diag(v%*%solve(s)))
   cs[j,jj]=cs[jj,j]=cs[j,jj]+rnorm(1,mean=0,sd=jump.sd)
   if(abs(cs[j,jj])<1){
       det.cs = det(cs)
       if(det.cs<=0) clike=-Inf
   else {clike=-.5*(nrow(z)+n.corrs+1)*log(det.cs)-.5*sum(diag(v%*%solve(cs)))}
       if((clike-like)>log(runif(1,0,1))){
       acc=acc+1
       s=cs
   }
}
}
}

if(i%%thin==0){
cat(i/thin,' ',round(acc/iter*100,2),'\r')
flush.console()
ip=ip+1
b.vec=as.vector(b)
ip2 = 0
for(i.trait in 1:d){
   ip2=ip2+1
   temp[ip2] = -b.vec[2*i.trait-1]
   if(n.stages[i.trait]>2){
   for(j.stage in 3:(n.stages[i.trait])){
     ip2=ip2+1
     temp[ip2] = tz[i.trait,j.stage]-b[2*i.trait-1]
}
}
 ip2=ip2+1
 temp[ip2] = b.vec[i.trait*2]
}

results[ip,]=c(temp,s[lower.tri(s)])

}
}
cat('\n\n')

find.it = matrix(NA,nr=d,nc=3)

ip2 = 0
for(i.trait in 1:d){
   find.it[i.trait,1]=i.trait
   ip2=ip2+1
   find.it[i.trait,2]=ip2
   if(n.stages[i.trait]>2){
   for(j.stage in 3:(n.stages[i.trait])){
     ip2=ip2+1
   }
   }
 ip2=ip2+1
 find.it[i.trait,3]=ip2
}
ip2=ip2+1
colnames(find.it)=c('Trait #','start','stop')


print(find.it)
cat(paste('Corr matrix starts at:',ip2,'and ends at:',ip2+n.corrs-1,'\n\n'))

return(results)
}
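
A hypothetical bivariate run with the defaults (transverse organization and surface texture); the printed "decoder ring" tells you which columns hold which parameters:

biv.mcmc = multiv.MCMC()
apply(biv.mcmc,2,mean)   # posterior means of thresholds, slopes, and the correlation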


Now let's go full bore and look at all five traits: time to go quintivariate!

quint.var = multiv.MCMC(x=log(Buck2[,1]),z=Buck2[,2:6])

Which will store the results in quint.var and print the following secret decoder ring to the screen:

     Trait # start stop
[1,]       1     1    4
[2,]       2     5    9
[3,]       3    10   12
[4,]       4    13   15
[5,]       5    16   18
Corr matrix starts at: 19 and ends at: 28

And to summarize results:

mu.quint = apply(quint.var,2,mean)

You can use the following to get the correlation matrix:

to.symm = function(x)
{
k=length(x)
n.dim=(sqrt(8*k+1)+1)/2
sym.mat=diag(n.dim)
ip=0
for(i in 1:(n.dim-1)){
for(j in (i+1):n.dim){
  ip=ip+1
  sym.mat[i,j]=sym.mat[j,i]=x[ip]
}
}
return(sym.mat)
}

So to.symm(mu.quint[19:28]) will return the correlation matrix.  Now let's do some examples to see how this all might work.  To do this we'll want to "cheat" and see what stages individuals at particular ages "should" be in using the following script:

pred.quint = function (age=20)
{
thresh1 = c(-100, 5.251512,  5.920167,  7.616988, 100)
thresh2 = c(-100, 5.724091,  7.074513,  9.192661, 10.997112, 100)
thresh3 = c(-100, 5.732676,  6.773199, 100)
thresh4 = c(-100, 11.536682, 12.712156, 100)
thresh5 = c(-100, 3.346391,  5.378389, 100)

beta=c(1.597855, 2.157894, 1.901371, 2.861199, 1.050545) 
ln.age = log(age)
pred=beta*ln.age

traits=0
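# hist() with plot=F is just a quick trick to find which interval (stage) each prediction falls in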
traits[1] = which(hist(pred[1],breaks=thresh1,plot=F)$counts==1)
traits[2] = which(hist(pred[2],breaks=thresh2,plot=F)$counts==1)
traits[3] = which(hist(pred[3],breaks=thresh3,plot=F)$counts==1)
traits[4] = which(hist(pred[4],breaks=thresh4,plot=F)$counts==1)
traits[5] = which(hist(pred[5],breaks=thresh5,plot=F)$counts==1)
return(traits)
}

And run this script at some different ages:

> pred.quint(20)
[1] 1 2 1 1 1
> pred.quint(30)
[1] 2 3 2 1 2
> pred.quint(40)
[1] 2 3 3 1 2
> pred.quint(50)
[1] 3 3 3 1 2
> pred.quint(90)
[1] 3 4 3 3 2

Now we can use numerical integration because at six total dimensions (five traits plus age) this is still manageable.  Here's the code for doing that:

plot.posterior.quint = function (stages=c(1,2,1,1,1),right=120,individ.traits=T)
{

library(mvtnorm)
labs = c('TO','ST','MI','MA','AC')

threshes1=c(-100, 5.27832576, 5.90195040, 7.60895505, 100)
threshes2=c(-100, 5.50589931,  6.62438454,  8.62159559, 10.42441545,100)
threshes3=c(-100, 5.67513532, 6.65208747, 100)
threshes4=c(-100, 11.26860492, 12.41364617, 100)
threshes5=c(-100, 3.42618056, 5.49043687, 100)

slopes=c(1.59691443, 2.02719296, 1.87308859, 2.79418152, 1.07614601)

r = structure(c(1, 0.478162890926461, 0.136055434769487, 0.391950432352597,
0.0484305398368354, 0.478162890926461, 1, 0.203720014696409,
0.515789775475512, 0.244239468835812, 0.136055434769487, 0.203720014696409,
1, 0.342211351267706, -0.118230553153396, 0.391950432352597,
0.515789775475512, 0.342211351267706, 1, 0.117967213208456, 0.0484305398368354,
0.244239468835812, -0.118230553153396, 0.117967213208456, 1), .Dim = c(5L,
5L))


threshes1 = c(-Inf,threshes1[-c(1,5)],Inf)
threshes2 = c(-Inf,threshes2[-c(1,6)],Inf)
threshes3 = c(-Inf,threshes3[-c(1,4)],Inf)
threshes4 = c(-Inf,threshes4[-c(1,4)],Inf)
threshes5 = c(-Inf,threshes5[-c(1,4)],Inf)


bot = c(threshes1[stages[1]],threshes2[stages[2]],threshes3[stages[3]],threshes4[stages[4]],threshes5[stages[5]])
top = c(threshes1[stages[1]+1],threshes2[stages[2]+1],threshes3[stages[3]+1],threshes4[stages[4]+1],threshes5[stages[5]+1])

################### CORRELATION #####################################

get.densr = function(x){
  return(pmvnorm(mean=slopes*log(x),lower=bot,upper=top,corr=r)[1])
}
denomr = integrate(Vectorize(get.densr),0.01,150)$val
get.postr = function(x){
  return(pmvnorm(mean=slopes*log(x),lower=bot,upper=top,corr=r)[1]/denomr)
}


################## NO CORRELATION ##################################

get.dens0 = function(x){
  return(pmvnorm(mean=slopes*log(x),lower=bot,upper=top,corr=diag(5))[1])
}
denom0 = integrate(Vectorize(get.dens0),0.01,150)$val
get.post0 = function(x){
  return(pmvnorm(mean=slopes*log(x),lower=bot,upper=top,corr=diag(5))[1]/denom0)
}
age0=optimize(get.post0,int=c(0.01,200),maximum=T)$max
max.pdf = get.post0(age0)

####################################################################

p=0
for(i in 1:right){
p[i] = get.postr(i)
}
title=paste(labs[1],' = ',stages[1],', ',labs[2],' = ',stages[2],', ',
            labs[3],' = ',stages[3],', ',labs[4],' = ',stages[4],', ',
            labs[5],' = ',stages[5], sep='')
plot(1:right,p,type='l',xlab='Age',ylab='Density',lwd=3,col='dark green',ylim=c(0,max.pdf),
   main=title)

for(i in 1:right){
p[i] = get.post0(i)
}

lines(1:right,p,col='red')
abline(h=0)
legend(2/3*right,8/9*max.pdf,
legend=c('independent','dependent'),
   bty='n',lwd=c(1,3),col=c('red','dark green'),
   cex=1.25,y.intersp=1.5)

if(individ.traits==F) return(cat('All done\n\n'))

get.dens1 = function(x,iwhich){
  return(pmvnorm(mean=slopes[iwhich]*log(x),lower=bot[iwhich],upper=top[iwhich],sig=1)[1])
}

get.post1 = function(x,iwhich){
  return(pmvnorm(mean=slopes[iwhich]*log(x),lower=bot[iwhich],upper=top[iwhich],sig=1)[1]/denom1)
}

for(iwhich in 1:5){
denom1 = integrate(Vectorize(get.dens1,'x'),0.01,150,iwhich=iwhich)$val
for(i in 1:right){
p[i] = get.post1(i,iwhich)
}
lines(1:right,p,lty=2)
left = which.max(p)
height = p[left]
text(left,height,labs[iwhich],adj=c(0.5,0))
}
}


Run the script with the defaults, and then try plot.posterior.quint(stages=c(1,2,3,2,2)).  That second example is one where the stages are giving disparate information.  Notice how the model where we assumed conditional independence (when we know this is not actually the case) gives us a "tighter" HPD than any of the individual traits.  That precision is spurious.  The dependent case (which we know is what is actually happening) properly broadens the HPD to reflect the uncertainty that comes from having disparate skeletal information.

Now we could build the HPD calculation into the above script, but it is a bit of a pain.  So instead let's go to Metropolis again:

Metrop3 = function (start.age=50,stage=c(1,2,3,2,2), n.iters = 1200, jump.sd = 45,thin=5)
{
library(mvtnorm)
library(tmvtnorm)


thresh=matrix(NA,5,6)
thresh[1,]=c(-Inf, 5.27832576, 5.90195040, 7.60895505, Inf, NA)
thresh[2,]=c(-Inf, 5.50589931,  6.62438454,  8.62159559, 10.42441545,Inf)
thresh[3,]=c(-Inf, 5.67513532, 6.65208747, Inf, NA, NA)
thresh[4,]=c(-Inf, 11.26860492, 12.41364617, Inf, NA, NA)
thresh[5,]=c(-Inf, 3.42618056, 5.49043687, Inf, NA, NA)

beta=c(1.59691443, 2.02719296, 1.87308859, 2.79418152, 1.07614601)

sigma = structure(c(1, 0.478162890926461, 0.136055434769487, 0.391950432352597,
0.0484305398368354, 0.478162890926461, 1, 0.203720014696409,
0.515789775475512, 0.244239468835812, 0.136055434769487, 0.203720014696409,
1, 0.342211351267706, -0.118230553153396, 0.391950432352597,
0.515789775475512, 0.342211351267706, 1, 0.117967213208456, 0.0484305398368354,
0.244239468835812, -0.118230553153396, 0.117967213208456, 1), .Dim = c(5L,
5L))

lo=-Inf
hi=Inf

for(i in 1:5){
  lo[i]=thresh[i,stage[i]]
  hi[i]=thresh[i,stage[i]+1]
}
age=start.age
sto=age
i.sto=0
acc=0
for(i in 1:(n.iters*thin)){
mu = beta*log(age)
z.star = rtmvnorm(1,mean=mu,sigma,lower=lo,
  upper = hi,algorithm='gibbs')
trial.age=age + rnorm(1,0,jump.sd)
if(trial.age<=0) trial.age=age
p.stay = dmvnorm(z.star,mu,sigma)
mu = beta*log(trial.age)
p.move = dmvnorm(z.star,mu,sigma)
Ratio=p.move/p.stay
u=runif(1,0,1)
if(u<=Ratio) {
age=trial.age
acc=acc+1
}

if(i%%thin==0){
  i.sto=i.sto+1
  sto[i.sto]=age
  cat(i.sto,round(acc/i*100,2),'\r')
  flush.console()

}
}
cat('\n\n')
return(sto)
}


Now:

plot.posterior.quint(stages=c(1,2,3,2,2),ind=F)
hist(Metrop3(n=1000),add=T,prob=T,br=80)
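
And if you save the draws rather than feeding them straight into hist, you can get the highest posterior density region using the emp.hpd function from earlier (a sketch; the object name draws is arbitrary):

draws = Metrop3(stage=c(1,2,3,2,2),n.iters=1000)
emp.hpd(draws)                   # 95% HPD region for age
quantile(draws,c(0.025,0.975))   # equal-tailed interval, for comparison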

Wrap-up:

In attempting to divide and conquer we looked at (multivariate) continuous traits and (multivariate) ordinal categorical traits.  What would one do if dealing with both sets of traits at the same time?  Maybe you have tooth root transparency data (which is continuous)  and "phase" data.   MCMC is going to be the way to go.  Continuous traits can be incorporated by realizing that their residual correlation with "phase" data is modeled as a polyserial correlation.  See the R package polycor for the polyserial correlation.
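
Here is a minimal sketch of what the polycor call looks like, using a made-up continuous variable standing in for root transparency (the variable is simulated from log age just so the example runs; it is not real data):

library(polycor)
set.seed(42)
# hypothetical continuous trait, correlated with age, standing in for root transparency
transparency = log(Buck2[,1]) + rnorm(NROW(Buck2),0,0.5)
polyserial(transparency, Buck2$TO)   # polyserial correlation with an ordinal stage variable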



References