# Demo code for SPIDA 2005 lecture in Toronto# Start with the osteoporosis example by reading in the
# data.  You might need to add a path to this command on your
# system.  I put the data file in my 'working directory.'  osteo <- read.table("osteo.txt", header=T)# --- pull out the hip data (lots of ways to get at it)  names(osteo)  osteo[,"hip"]  attach(osteo)  hip# --- kernel density estimate  plot(density(hip))  points (hip,rep(0,length(hip)))# --- confidence interval for the average hip score  summary(hip)  mean(hip); sd(hip)  mean(hip) + 2 * c(-1,1) * sd(hip)/sqrt(length(hip))
# Bootstrap standard error, without simulation

  (sqrt(63/64) * sd(hip)) / sqrt(64)

# --- Bootstrap version of confidence interval for average  avg.bs <- c()  for(b in 1:2000) {    yStar <- sample(hip,64,replace=T);    avg.bs <- c(avg.bs, mean(yStar)) }  sd(avg.bs)                # this is the simulated BS std error
     plot(density(avg.bs))  hist(avg.bs, breaks=25)   # BS sampling dist  quantile(avg.bs,c(.025,.975))  # pctile interval###  ------------------------------------------------------------------###  Bootstrap test###  ------------------------------------------------------------------# --- Osteoporosis data: CI for difference in averages    hip.no <- hip[ estrogen=="no" ];  n.no  <- length(hip.no)  hip.yes <- hip[ estrogen=="yes"]; n.yes <- length(hip.yes)  # t tests and CI, without and with equal variance assumption  t.test(hip.no, hip.yes)  t.test(hip.no, hip.yes, var.equal=T)  # --- Center the two groups under null at grand mean  hip.no.0 <- hip.no - mean(hip.no) + mean(hip)  hip.yes.0 <- hip.yes - mean(hip.yes) + mean(hip)  # --- Sample from each, compare  diffs <- c()  for (i in 1:1000) {
  	mean.no  <- mean(sample(hip.no.0, n.no, replace=T))
  	mean.yes <- mean(sample(hip.yes.0,n.yes,replace=T))    diffs <- c(diffs,  mean.no - mean.yes) }  
  hist(diffs, breaks=20, probability=T); lines(density(diffs))    length( diffs[abs(diffs)>0.352] )  # count number larger than obs, BS p-value###  ------------------------------------------------------------------###  Trimmed mean###  ------------------------------------------------------------------  trimmed.mean <- function(x) {    s <- sort(x); n <- length(x);    trim <- ceiling(.10 * n);  # 10% off each tail of data    mean(s[trim:(n-trim)])  }  avg.bs <- med.bs <- trim.bs <- c()  for (b in 1:2000) {    hip.bs <- sample(hip,length(hip),replace=T)    avg.bs <- c(avg.bs,mean(hip.bs))    med.bs <- c(med.bs,median(hip.bs))    trim.bs <- c(trim.bs,trimmed.mean(hip.bs))  }  sd(trim.bs)  quantile(trim.bs, c(.025,.975))  hist(trim.bs,breaks=20,probability=T); lines(density(trim.bs))### Contaminated data with outlier  contaminated.hip <- c(hip,8);  hist(contaminated.hip, breaks=20)  avg.bs <- med.bs <- trim.bs <- c()  for (b in 1:200) {    hip.bs <- sample(contaminated.hip,length(contaminated.hip),replace=T)    avg.bs <- c(avg.bs,mean(hip.bs))    med.bs <- c(med.bs,median(hip.bs))    trim.bs <- c(trim.bs,trimmed.mean(hip.bs))  }  sd(trim.bs)  quantile(trim.bs, c(.025,.975))  hist(trim.bs,breaks=20,probability=T); lines(density(trim.bs))  # get percentile interval for each  pct.interval <- function(x) {    quantile(x,c(.025,.975)) }  apply(cbind(avg.bs,trim.bs,med.bs),2,  pct.interval)  # see relationship among estimators (smaller B if saving plot)  pairs(cbind(avg.bs, trim.bs, med.bs))###  ------------------------------------------------------------------###  Ratio of averages, earnings data###  ------------------------------------------------------------------  # read data file  earn <- read.table("earnings.txt", header=T)  summary(earn)  attach(earn)  earn.ne <- earnings[region=="NE"]  earn.mw <- earnings[region=="Midwest"]    # Terms for taylor series  m1 <- mean(earn.ne);    m2 <- mean(earn.mw)  n1 <- length(earn.ne) ; n2 <- length(earn.mw)  s1 <- sd(earn.ne)  ;    s2 <- sd(earn.mw)  m1/m2  sqrt(s2^2/(n1 * m2^2) + s2^2 * m1^2 / (n2 * m2^4))  # bootstrap ratio of means  ratio <- c()  for (b in 1:2000) {    m1.bs <- mean(sample(earn.ne,n1,replace=T))    m2.bs <- mean(sample(earn.mw,n2,replace=T))    ratio <- c(ratio,m1.bs/m2.bs)  }   sd(ratio)  quantile(ratio, c(.025,.975))  hist(ratio,breaks=20,probability=T); lines(density(ratio))  ###  ------------------------------------------------------------------###  Outliers in regression, florida county-level 2000 election results###  ------------------------------------------------------------------  # read data file  florida <- read.table("florida2000.txt", header=T)  summary(florida)  attach(florida)  plot(Reg.Reform, Buchanan)  identify(Reg.Reform, Buchanan, County)  regr <- lm(Buchanan ~ Reg.Reform)  summary(regr)  abline(regr)  # figure out how to get to regr results  names(regr)                  # list of the things in a regression  regr$coefficients            # extract one of these things named "coefficients"  as.vector(regr$coefficients) # convert it into numbers  # now we can do random resampling  # first make an array of data to sample  data <- data.frame(cbind(Reg.Reform,Buchanan))  n <- length(Buchanan)  # make a container to hold the slope and intercept, then sample counties  regr.bs <- matrix(nrow=2000, ncol=2)     for(b in 1:nrow(regr.bs)) {    in.star <- sample(1:n,n,replace=T)    data.star <- data[in.star,]    regr.star <- lm(Buchanan ~ Reg.Reform, data=data.star)    regr.bs[b,] <- as.vector(regr.star$coefficients)  }  regr.bs[1:10,] # estimates of first 10 bs samples  sd(regr.bs)  quantile(regr.bs[,2], c(.025,.975))  hist(regr.bs[,2],breaks=20,probability=T); lines(density(regr.bs[,2]))   # do residual resampling  fit <- regr$fitted.values;  res <- regr$residuals;  regr.bs.fixed <- matrix(nrow=2000, ncol=2)     for(b in 1:nrow(regr.bs.fixed)) {    in.star <- sample(1:n,n,replace=T)    yStar = fit + res[in.star]    regr.star <- lm(yStar ~ Reg.Reform)    regr.bs.fixed[b,] <- as.vector(regr.star$coefficients)  }  sd(regr.bs.fixed)  quantile(regr.bs.fixed[,2], c(.025,.975))  hist(regr.bs.fixed[,2],breaks=20,probability=T,main="Histogram of regr.bs[,2]");   lines(density(regr.bs.fixed[,2]))###  ------------------------------------------------------------------###  Longitudinal Example###  ------------------------------------------------------------------ 
   data <- read.table("sales.txt", header=TRUE)
   
   summary(regr <- lm(data[,"sales"] ~ factor(data[,"i"]) + data[,"un"] + data[,"di"]))  # --- Compute residual corr  res <- residuals(regr)  num <- den <- 0  for (i in 1:25) {    for(t in 2:8) {      index <- (i-1)*8      num <- num + res[index+t]*res[index+t-1]      den <- den + res[index+t-1]^2  }}  num/den  # --- Sandwich estimator... start by building X matrix  dummy <- matrix(0, nrow=200, ncol=24)  for(col in 1:24) {    first.row <- (col-1)*8;    for(j in 1:8) { dummy[first.row+j,col]=1 }  }  X <- as.matrix(cbind(1,dummy,data[,c("un","di")]))  XpX <- (t(X)) %*% X  XpXi <- solve(XpX)          # inverse of matrix  # --- Build up a block-diagonal matrix for errors  S <- matrix(0,nrow=200, ncol=200)  for(i in 1:25) {    base <- (i-1)*8    for(j in 1:8) {      for (k in 1:8)        S[base+j,base+k] <- res[base+j]*res[base+k] }}    XpSX <- (t(X)) %*% S %*% X  # --- Compute the sandwich SE  sand.se <- sqrt(diag(XpXi %*% XpSX %*% XpXi))  # --- Compare to the standard OLS formula  b <- XpXi %*% (t(X) %*% data[,"sales"])  ols.se <- sqrt( (res %*% res) * (diag(XpXi)) / (200-27))   (cbind(ols.se, sand.se))[c(1,2,26,27),]  # --- Now try bootstrapping, resampling districts  (this takes a while!)  regr.bs <- matrix(nrow=500,ncol=27)  for(b in 1:nrow(regr.bs)) {    in.star <- rep(sample(8*(0:24),25,replace=T),rep(8,25))+rep(1:8,25)    data.star <- data[in.star,]    data.star[,"i"] <- rep(1:25,rep(8,25))    regr.star <- lm(data.star[,"sales"] ~ factor(data.star[,"i"]) + data.star[,"un"] + data.star[,"di"])    regr.bs[b,] <- as.vector(regr.star$coefficients)  }  # --- Get the SE for intercept, one effect, and the last two cols  sd(regr.bs[,c(1,2,26,27)])######  Double bootstrap###pct.interval <- function(x,B) {  n<-length(x); s2.star<-c();  for (b in 1:B) s2.star <- c(s2.star, var(sample(x,n,replace=T)))  quantile(s2.star,c(0.05,0.95))}  # --- Do it for a sample from normal  intervals <- matrix(nrow=200, ncol=2)  for (i in 1:nrow(intervals)) {    intervals[i,] <- pct.interval(rnorm(20), 1000) }    cover <- intervals[,1]<1 & 1 < intervals[,2]  nrow(intervals[cover,])/nrow(intervals)      # --- Do it for a sample from the sample itself, with extra percentiles to aid calibrationpct.interval <- function(x,B) {  n<-length(x); s2.star<-c();  for (b in 1:B) s2.star <- c(s2.star, var(sample(x,n,replace=T)))  quantile(s2.star,c(0.01, 0.015, 0.02, 0.03, 0.04, 0.05, 0.95, 0.96, 0.97, 0.98, 0.985, 0.99))}  len <- 20;  data <- rnorm(len)  intervals <- matrix(nrow=200, ncol=12)  for (i in 1:nrow(intervals)) {    intervals[i,] <- pct.interval(sample(data,len,replace=T), 1000) }    s2 <- ((len-1)/len) * var(data)  cover <- intervals[,1]<s2 & s2 < intervals[,12]  nrow(intervals[cover,])/nrow(intervals)