Getting Some Data

It is debatable which aspect of first using "R" is the most terrifying, but importing data has to be pretty high on the list.  The good news is that "R" reads *.csv (comma-delimited) files quite easily. So if you keep your data in Excel files, then just save them as comma delimited and read them in using "read.csv," as in the following:


# Read the comma-delimited reference data; the file must be in the
# current working directory (see note below).
refbones <- read.csv("refbones.csv")

Provided your *.csv file is in the same directory as your R workspace, all will be good.

But let's make life simpler.  Copy and paste the highlighted text to your RGui and hit enter.

# Reference data set, 288 rows x 4 columns:
#   Sample - factor with levels "F&K" (Fazekas and Kosa 1978, cases 1-138)
#            and "M&D39" (Maresh and Deming 1939, cases 139-288)
#   ID     - case/individual identifier (the M&D39 rows repeat IDs because
#            the data are semi-longitudinal)
#   Age    - age; apparently in weeks LMP, since downstream code multiplies
#            by 7 to get "Age (LMP days)" -- TODO confirm
#   Fem    - femur length in mm (per the plot labels below)
refbones=structure(list(Sample=structure(c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2),.Label=c("F&K","M&D39"),class="factor"),ID=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,
35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,
87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,
133,134,135,136,137,138,505,505,505,505,501,501,501,501,500,500,500,497,497,497,495,495,495,494,494,494,491,491,491,491,488,488,488,488,488,485,485,485,484,484,484,483,483,483,
462,462,462,462,462,480,480,480,478,478,478,478,478,474,474,474,473,473,473,473,473,471,471,471,467,467,467,463,463,463,461,461,461,460,460,460,459,459,459,458,458,458,458,452,
452,452,451,451,451,447,447,447,447,444,444,444,444,438,438,438,437,437,437,437,437,436,436,436,434,434,434,434,434,434,427,427,427,427,427,426,426,426,426,426,425,425,425,423,
423,423,421,421,421,421,418,418,418,418,418,412,412,412,412,407,407,407,398,398,398,391,391,391),Age=c(12,12,14,14,14,16,16,16,16,16,16,16,16,16,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,18,20,20,20,20,20,20,20,20,20,20,20,20,20,22,22,22,22,22,22,22,22,22,22,22,24,24,24,24,24,24,24,24,24,24,24,24,26,26,26,26,26,26,26,26,
26,26,26,26,28,28,28,28,28,28,28,28,28,28,28,28,30,30,30,30,30,30,30,30,30,30,30,30,32,32,32,32,32,32,32,32,34,34,34,34,34,34,34,36,36,36,36,36,38,38,38,38,
38,38,38,40,40,40,40,40,40,40,40,40,40,40,46,51,54,41,46,52,55,40,46,52,40,46,52,40,46,52,41,46,52,41,46,52,56,40,46,52,58,62,40,46,52,41,46,52,40,46,52,41,
46,52,58,64,41,46,52,40,46,52,58,64,40,46,52,41,46,52,58,64,40,46,52,41,46,52,40,46,52,41,46,52,40,46,52,41,46,52,40,46,52,58,41,46,52,41,47,52,40,46,52,58,
41,46,52,58,42,46,52,41,46,52,58,64,40,46,66,40,46,51,58,64,70,40,46,52,58,64,41,46,58,64,70,41,46,52,40,46,53,46,52,58,64,41,46,52,58,64,41,46,52,58,46,52,
58,48,52,58,53,58,64),Fem=c(7,10,11.9,11.5,13.8,18,19,22.5,24,22.5,20,20,18.8,21.8,25.5,27,25.9,26.5,28.2,27.3,29,27,24,24.2,24.6,26,27,26,29,29,32,32,32,32.5,32,31,32.8,36.2,33.5,36.1,32,33.5,32.6,
36.6,35.5,36.5,35,33,35.6,34.7,39.7,38,33.8,37.5,37.5,44.1,37.2,42.9,41,44.9,38.4,45,39.5,38,38,40.8,40.5,42,39,41,43,45.4,46.2,45.3,41,46,38.5,46.5,46,45,44.5,44.5,48,49,47,47,49,48,49,48,46,46.5,45,47.5,
50,49,47,49,52.6,54,50,54.3,56.3,57,59,56,55,56.1,52.5,66,57,60.9,60,58.2,60.4,59.2,60,61,67.5,61.4,62.7,64,66,67,70.5,73,72.3,73.5,69,72.1,71,79,74,75.5,75.8,78.7,75.7,76.7,80,88,96,100,75,84.5,93,
98,76,85,95,79,88,98.5,79,89.5,99,77,84.5,94,81,89,98,104,81.5,90.5,101.5,110,115,79,88,96.5,84,92.5,100,73,82.5,92.5,73,77.5,91,101,109,77,85,94,76,87.5,96,103,110,81,91.5,99.5,84,92,103,110.5,118,
78,88,98,79,87,97,77,85.5,94,78,88,98,73,82,92,72,83,92,75,83,88,94,80,88,98,77,88,97.5,66,75,84,91,74,80,88,97,81,88,98,74,85,94.5,101,109,75,88,114,79,90,97,107,115,122,82,94,100,107,116,73,
81,96,102.5,109,82,90,100,78,88,97,89,97,106,113.5,78,87,96,103,110,75,83,92,100,87,94,101,90,97,105,85,93,100)),.Names=c("Sample","ID","Age","Fem"),class="data.frame",row.names=c(NA,-288L))

# The End!

Now you have it!


ls()
[1] "refbones"

head(refbones)

  Sample ID Age  Fem
1    F&K  1  12  7.0
2    F&K  2  12 10.0
3    F&K  3  14 11.9
4    F&K  4  14 11.5
5    F&K  5  14 13.8
6    F&K  6  16 18.0


OK, all's good.  Let's try to get a handle on what we have here:

 table(refbones[,1])

  F&K M&D39
  138   150

So this is data on the 138 cases from 
Fazekas and Kósa (1978) and 150 girls from Maresh and Deming (1939).  OK, except that I lied about that being 150 girls.  This is semi-longitudinal data, so it is actually:

length(unique(refbones$ID[-(1:138)]))

[1] 41

Forty-one girls.   OK, that looked kinda' nasty, so here's the explanation of what just happened.  refbones$ID just pulls out the "ID" variable.  The first 138 cases are from Fazekas and Kósa, so the -(1:138) drops those cases, "unique" is, well, unique, and length is length.


Now, let the wild rumpus begin!

Paste the following into your RGui:

FemRef.mfp <- function ()
{
  # Fit a mean curve and an SD curve for femur length (mm) as a function
  # of age (LMP days) from the global `refbones` reference data, plot the
  # fit with approximate 95% bounds over the raw data, and report how
  # much of the data the bounds actually contain.
  #
  # The lm() on sqrt(Age) stands in for the fractional-polynomial fit;
  # uncomment the mfp lines (and library(mfp)) to run the real thing.
  # library(mfp)

  # Fixed seed so the jittered scatterplot below is reproducible
  set.seed(1234)

  # Age is stored in weeks, so *7 converts it to days LMP
  RF <- data.frame(Age = refbones[, 3] * 7, fem = refbones[, 4])

  # Comment out the line below and uncomment the line below that to run
  # the mfp
  sto <- lm(fem ~ I(sqrt(Age)), data = RF)
  # sto <- mfp(formula = fem ~ fp(Age, df = 2), data = RF)

  # coef() is the proper extractor (sto$coeff relied on partial matching)
  betas <- as.vector(coef(sto))

  # Fitted mean femur length at age t (days LMP)
  mu <- function(t) {
    betas[1] + betas[2] * sqrt(t)
  }

  # Model the SD as linear in age by regressing absolute residuals on
  # age; sqrt(pi/2) rescales a mean absolute deviation to an SD
  # (Altman 1993)
  absResid <- abs(RF$fem - mu(RF$Age))
  # uncomment line below, then comment out the line below that if you want to use mfp
  # betas2 <- as.vector(coef(mfp(formula = absResid ~ fp(RF$Age, df = 1))))
  betas2 <- as.vector(coef(lm(absResid ~ RF$Age)))
  SD <- function(t) {
    (betas2[1] + betas2[2] * t) * sqrt(pi / 2)
  }

  # Evaluate the fit on a fine age grid, 70 to 490 days LMP
  age <- seq(10 * 7, 70 * 7, .1)
  est.mu <- mu(age)
  est.sd <- SD(age)

  # Upper 95% bound first (dashed), then the fitted-equation labels,
  # then the mean curve and the lower bound
  plot(age, est.mu + 1.96 * est.sd, type = 'l', lty = 2, xlab = 'Age (LMP days)',
       ylab = 'Femur Length (mm.)', ylim = c(0, 120))
  text(75, 109, substitute(paste(mu[y], ' = ', b1, ' + ', b2, sqrt(x)),
       list(b1 = round(betas[1], 4), b2 = signif(betas[2], 4))), adj = 0, cex = 1.25)
  text(75, 99, substitute(paste(sigma[y], ' = ', b1, ' + ', b2, x),
       list(b1 = signif(betas2[1] * sqrt(pi / 2), 5),
            b2 = signif(betas2[2] * sqrt(pi / 2), 4))), adj = 0, cex = 1.25)
  lines(age, est.mu)
  lines(age, est.mu - 1.96 * est.sd, lty = 2)

  # Raw data, jittered horizontally so ties at the same age are visible
  dat <- cbind(jitter(RF$Age, factor = 1), RF$fem)
  points(dat, cex = .5)

  # Empirical coverage of the +/- 1.96 SD band at the observed ages
  est.mu <- mu(RF$Age)
  est.sd <- SD(RF$Age)
  test.it <- cbind(est.mu - 1.96 * est.sd, RF[, 2], est.mu + 1.96 * est.sd)
  perc <- round(sum(test.it[, 2] >= test.it[, 1] &
                    test.it[, 2] <= test.it[, 3]) / NROW(RF) * 100, 2)
  cat('\n\n')
  # cat() ignores the noquote class anyway, so plain paste0 suffices
  cat(paste0('est.mu +/- 1.96*est.sd femur length conditioned on age contains ',
             perc, '% of data', '\n'))
  cat('\n\n')
}


And now the wild rumpus really begins:


FemRef.mfp()


The regression technique used here was a "fractional polynomial" for the mean together with a regression of absolute residuals for the standard deviation.  See Altman (1993).  (As written, the function substitutes a simple square-root model for the commented-out mfp fit.)  Now make a function to "invert" this, so that given a femur length we can read the graph "backwards" to get an age in days LMP:

# Invert the fitted mean curve, mu = -73.8884 + 8.829*sqrt(age), to get
# an age in days LMP from a femur length in mm.
est.age <- function(fem = 81.84) {
  ((fem + 73.8884) / 8.829)^2
}

And now we can find the age in days LMP for someone with a femur length of 81.84 mm:

est.age()

[1] 311.109

This was for the youngest individual from Kyra Stull's dataset.  The age given there is 0.087671 years, so we need to subtract 40*7 (that's 40 weeks gestational age times 7 days per week, or 280 days gestation) and divide by 365 days (for a year):

(est.age()-40*7)/365

[1] 0.0852302


 OK, so that's great that we got so close to the actual age, but a point estimate is not really going to cut it.  So let's think about this a bit more.   Our lovely graph shows us how to predict the mean and standard deviation of femoral length given known age, so the distribution of femur length given known age.  But we want the reverse, or the distribution of age given a known femur length.  This screams Bayes' Theorem. So let's start down that path.

First we'll want a function for the likelihood:

# Likelihood of observing femur length `fem` (mm) at age `age` (LMP days):
# a normal density whose mean, -73.8884 + 8.829*sqrt(age), and SD,
# 1.6216 + 0.009508*age, come from the fits above.  Vectorized in `age`.
Likelihood <- function (age = 311.109, fem = 81.84)
{
  mean.fem <- -73.8884 + 8.829 * sqrt(age)
  sd.fem <- 1.6216 + 0.009508 * age
  dnorm(fem, mean.fem, sd.fem)
}

And now we can try this in a very rough example:

# Quick look at the (unnormalized) likelihood of age for fem = 81.84 mm
plot(250:375,Likelihood(250:375),type='l',xlab='Age',ylab='Likelihood')
# Vertical line at the inverse-regression point estimate of age
lines(rep(est.age(81.84),2),c(0,2))
# Dashed line at the known age: 280 days gestation + 0.087671 years
abline(v=280+0.087671*365,lty=2,lwd=2)

But that's not "normalized" (in other words, it doesn't integrate to 1.0).  So we need to integrate across age and then divide by the integral.  The following code does this, and a lot more.

# Posterior density of age (LMP days) given a femur length `fem` (mm),
# under a flat prior on age in (0, 600).  Plots the posterior twice --
# first with a 95% quantile-based interval, then (after a keypress) with
# the 95% highest-density region -- and prints both summaries.
Posterior = function (fem = 81.84)
{

# Normalizing constant: integrate the likelihood over the age support
# (note: $val partially matches integrate()'s $value component)
denom = integrate(Likelihood,lower=0,upper=600,fem=fem)$val

# Posterior density is likelihood at age x divided by integral across x
post = function(x){
   return(Likelihood(x,fem=fem)/denom)
}

# Get age estimate from function and posterior density at that age
# (est.age() is treated as the posterior mode here -- TODO confirm it
# coincides with the maximum of post())
age = est.age(fem)
max.pdf = post(age)

# Following three functions are for finding left and right tails of posterior:
# bound() is the posterior minus a target height xx; lo()/hi() find where the
# posterior crosses that height to the left/right of the point estimate
bound = function(x,xx) return(post(x)-xx)
lo = function(xx) uniroot(bound,lower=0,upper=age,tol=.Machine$double.eps,xx=xx)$root
hi = function(xx) uniroot(bound,lower=age,upper=600,tol=.Machine$double.eps,xx=xx)$root

# For finding specific value of posterior density that contains "want.hdr" HDR:
# returns (mass between the two crossing points) - want.hdr, so its root is
# the density height whose region holds exactly want.hdr of the posterior
how.hi = function(xx, want.hdr){
bot = lo(xx); top = hi(xx)
 return(integrate(post,bot,top,abs.tol=.Machine$double.eps)$val - want.hdr)
}

# 99.9% HDR first, used only to set sensible x-limits for the plots
# (NOTE(review): 10e4 is 1e5, not 1e4 -- presumably just a small lower
# bracket for uniroot; confirm intent)
hdr.height = uniroot(how.hi,lower=max.pdf/10e4,upper=max.pdf,want.hdr=0.999,tol=.Machine$double.eps)$root
bot = lo(hdr.height); top = hi(hdr.height)
hdr = c(bot,top)

plot(0:600,post(0:600),type='l',xlab='Age (LMP Days)',ylab='Posterior Density',xlim=hdr)
lines(c(0,600),rep(0,2))

# 95% interval using QUANTILES: each function is (CDF at x) minus the
# target quantile, so uniroot finds the 2.5%, 50%, and 97.5% points
left = function(x) integrate(post,lower=0,upper=x,abs.tol=.Machine$double.eps)$val-0.025
med = function(x) integrate(post,lower=0,upper=x,abs.tol=.Machine$double.eps)$val-0.5
right = function(x) integrate(post,lower=0,upper=x,abs.tol=.Machine$double.eps)$val-0.975
bot.pi=uniroot(left,lower=0,upper=age,tol=.Machine$double.eps)$root
top.pi=uniroot(right,lower=age,upper=600,tol=.Machine$double.eps)$root
med.est = uniroot(med,lower=bot.pi,upper=top.pi,tol=.Machine$double.eps)$root
PI = c(bot.pi,top.pi)
# Drop verticals at the interval endpoints, a dashed horizontal at the
# higher of the two endpoint densities, and hatch the interval
for(i in 1:2) lines(rep(PI[i],2),c(0,post(PI[i])))
horizon = max(post(bot.pi),post(top.pi))
lines(c(0,1000),rep(horizon,2),lty=2,lwd=2)
polygon(c(PI[1],seq(PI[1],PI[2],.5),rep(PI[2],2)),
      c(0,post(c(seq(PI[1],PI[2],.5),PI[2])),0),density=10)

cat('\n\n')
cat(noquote(paste(sep=' ','2.5%, 50%, and 97.5% at',round(PI[1],1),round(med.est,1),round(PI[2],1),'\n\n')))
cat('This is the quantile-based probability interval\n\n')
# Interactive pause before redrawing for the HDR version
junk=readline('Hit anything except your neighbor: ')

plot(0:600,post(0:600),type='l',xlab='Age (LMP Days)',ylab='Posterior Density',xlim=hdr)
lines(c(0,600),rep(0,2))

# Previous was 99.9% HPD for plotting.  Now find 95% HIGHEST POSTERIOR DENSITY
hdr.height = uniroot(how.hi,lower=max.pdf/10e4,upper=max.pdf,want.hdr=0.95,tol=.Machine$double.eps)$root
bot = lo(hdr.height); top = hi(hdr.height)
hdr = c(bot,top)

# Dashed line at the HDR density height, verticals at the HDR endpoints
lines(c(0,600),rep(hdr.height,2),lty=2,lwd=2)
lines(rep(bot,2),c(0,hdr.height))
lines(rep(top,2),c(0,hdr.height))

polygon(c(hdr[1],seq(hdr[1],hdr[2],.5),rep(hdr[2],2)),
      c(0,post(c(seq(hdr[1],hdr[2],.5),hdr[2])),0),density=10)

# Posterior mass left in each tail outside the 95% HDR (as percentages);
# unlike the quantile interval, these need not be 2.5% each
l.area = round(integrate(post,lower=0,upper=bot,abs.tol=.Machine$double.eps)$val*100,4)
r.area = round(integrate(post,lower=top,upper=600,abs.tol=.Machine$double.eps)$val*100,4)
cat('\n')
cat(noquote(paste(sep=' ','Left, Mode, and Right at',round(hdr[1],1),round(age,1),round(hdr[2],1),'\n\n')))
cat(noquote(paste(sep='','Tail to left of 95% HDR contains ',l.area,'% of posterior density','\n')))
cat(noquote(paste(sep='','Tail to right of 95% HDR contains ',r.area,'% of posterior density','\n')))
}

Posterior()



Maybe you get no joy from calculus, so let's move over to OpenBUGS instead:

1. Start OpenBUGS and you'll get the following:

Opening menu

2.  Then open a new file:

OpenBUGS2

3.  Copy and paste the boxed material below:

model
{
# Observed femur length in mm (fixed datum)
   y <- 81.84
# Mean femur length given age x -- same fitted curve as in the R code
   y.mu <- -73.8884 + 8.829*sqrt(x)
# Note: In Bayesian lingo you use "precision" rather
# than variance (or standard deviation). It is
# just the inverse of variance.
   y.prec <- 1/pow(1.6216 + 0.009508*x,2)
# Likelihood: femur length is normal given age
   y ~ dnorm(y.mu, y.prec)
# Flat prior on age (LMP days)
   x ~ dunif(0,600)
}

WinBUGS1

4.  Mark the "model" text then go to "model" and then "specification"

WinBUGS2



5.  And do "check model" (assuming you have highlighted the block):

WinBUGS3


6.  At the very bottom of the screen in the bottom left corner you should get some good news.

WinBUGS4

7.  Now do "compile" and then "gen inits" from the menu (hey, no data to load) and you are good to go.

8.  Now go to "inference" and then "samples."  In "node" type "x" (without quotes) and hit set.  Then do, say, 5,000 updates (that's under "model").  Then click on "stats," "trace," "density" and "auto cor" to get the goods:

WinBUGS5

You can save the chain using "coda" and then get the 95% HPD in R:

library(HDInterval)
# Read the saved CODA chain; the first column is just the iteration
# index, so drop it and keep the sampled ages.
Age.Chain <- read.table("AgeChain.txt")[, -1]
# 95% highest-density interval for age
hdi(Age.Chain)


OpenBUGS used a very efficient Markov Chain Monte Carlo (MCMC) method called "slice" sampling.  Within "R" we can instead use a less efficient method called Metropolis sampling.

Metrop <- function (burn.in=1000, jump.sd=25, y=81.84, n.iter=5000)
{
  # Random-walk Metropolis sampler for age (LMP days) given an observed
  # femur length y (mm), using the mean/SD curves fitted above.
  #
  # burn.in - number of warm-up steps discarded before sampling
  # jump.sd - SD of the normal random-walk proposal
  # y       - observed femur length in mm
  # n.iter  - length of the returned chain (default 5000, as before)
  #
  # Prints the acceptance rate and returns the chain (numeric vector of
  # length n.iter).
  stopifnot(n.iter >= 2)

  # Comment out the line below if you want a random seed
  set.seed(12345)
  x.curr <- 100

  # Mean and SD of femur length conditional on age
  mu.y <- function (x) -73.8884 + 8.829*sqrt(x)
  sd.y <- function (x) 1.6216 + 0.009508*x

  # Called "One.Step" because this is one step in the Metropolis Sampler
  One.Step <- function(x.curr){
    curr.dens <- dnorm(y, mu.y(x.curr), sd.y(x.curr))
    x.trial <- x.curr + rnorm(1, 0, jump.sd)
    # Can't take square root of a negative, so return current value
    if (x.trial <= 0) return(x.curr)
    trial.dens <- dnorm(y, mu.y(x.trial), sd.y(x.trial))
    R <- trial.dens/curr.dens
    # Always accept an uphill move; accept a downhill move with prob. R
    # (runif is drawn only when R < 1, preserving the original RNG stream)
    if (R >= 1) return(x.trial)
    if (runif(1) <= R) return(x.trial)
    return(x.curr)
  }

  # Burn-in.  seq_len() does zero iterations when burn.in = 0
  # (1:burn.in would wrongly run twice in that case).
  for (i in seq_len(burn.in)) {
    x.curr <- One.Step(x.curr)
  }

  # Main run: preallocate the chain instead of growing it element by
  # element.  x[1] is the burn-in endpoint, so n.iter - 1 proposals follow.
  accept <- 0
  x <- numeric(n.iter)
  x[1] <- x.curr
  for (i in 2:n.iter) {
    sto <- x.curr
    x.curr <- One.Step(x.curr)
    if (x.curr != sto) accept <- accept + 1
    x[i] <- x.curr
  }

  # n.iter - 1 proposals were made, so that is the rate's denominator
  cat('Acceptance rate = ', accept/(n.iter - 1), '\n\n')

  return(x)
}


There are lots of "fun" things we can do with this, such as:

# Shows the effect of not having any burn-in
plot(Metrop(burn.in = 0), type = 'l')

# Shows the effect of having a "jump" that is too small (lots o' baby-steps)
plot(Metrop(jump.sd = 1), type = 'l')

# Shows the disastrous effect on the autocorrelation function
acf(Metrop(jump.sd = 1))

# As versus a reasonable "jump" (the default value is 25)
acf(Metrop())

# Our 2.5%, 50%, and 97.5% quantiles
quantile(Metrop(), c(0.025, 0.5, 0.975))

# Thinning the series to get rid of autocorrelation
acf(Metrop()[seq(1, 5000, 5)])




References