# Demo code for Bootstrap 2 # --- Start with an example of bootstrapping mean of normals n <- 64 norm <- rnorm(n) # do these two a lot of times hist(norm) mean(norm) + 2 * c(-1,1) * sd(norm)/sqrt(n) mean(norm) + qt(c(.025,.975),n-1) * sd(norm)/sqrt(n) # start with empty container avg.bs <- c() # resample, calculate avg, save it for(b in 1:2000) { # start with a few, then add more norm.bs <- sample(norm, n, replace=T) avg.bs <- c(avg.bs, mean(norm.bs)) } # Look at the BS results sd(avg.bs); sd(norm)/sqrt(n) quantile(avg.bs, c(.025,.975)) hist(avg.bs,probability=T); lines(density(avg.bs)) # SE from bootstrap is even closer sd(avg.bs); sd(norm)/sqrt(n) # --- Osteoporosis data: CI for average osteo <- read.table(":Data:osteo.dat", header=T) # pull out the hip data (lots of ways to get at it) names(osteo) osteo[,"hip"] attach(osteo) # now can use column "variables" hip # kernel density estimate plot(density(hip)) points (hip,rep(0,length(hip))) # confidence interval for the average hip score summary(hip) mean(hip); sd(hip) mean(hip) + qt(c(.025,.975),63) * sd(hip)/sqrt(length(hip)) # Bootstrap version of confidence interval for average avg.bs <- c() for(b in 1:1000) avg.bs <- c(avg.bs,mean(sample(hip,64,replace=T))) sd(avg.bs) quantile(avg.bs,c(.025,.975)) # --- Osteoporosis data: CI for proportions with small score pct.small <- function(x) length(x[x < -3])/length(x) pct.small(hip) p <- pct.small(hip) p + c(-2,2) * sqrt(p * (1-p)/64) pct.bs <- c() for(b in 1:2000) pct.bs <- c(pct.bs,pct.small(sample(hip,64,replace=T))) quantile(pct.bs,c(.025,.975)) hist(pct.bs,probability=T); lines(density(pct.bs)) points(pct.bs, rep(0,length(pct.bs))) # --- Osteoporosis data: CI for difference in averages hip.no <- hip[ useEstrogen=="no" ] hip.yes <- hip[ useEstrogen=="yes"] # t tests and CI, without and with equal variance assumption t.test(hip.no, hip.yes) t.test(hip.no, hip.yes, var.equal=T) # --- Resampling for testing the difference via BS # Fixed sizes n.no <- length(hip.no) # 20 did not use estrogen n.yes <- length(hip.yes) diff.fixed.bs <- c() for(b in 1:2000) { avg.no <- mean(sample(hip.no,n.no,replace=T)) avg.yes <- mean(sample(hip.yes,n.yes,replace=T)) diff.fixed.bs <- c(diff.fixed.bs,avg.no-avg.yes) } quantile(diff.fixed.bs,c(.025,.975)) # Random sample from cases data <- cbind(hip,useEstrogen) # get the data paired data[,2] # 1=no, 2=yes diff.bs <- c() for (b in 1:2000) { in.star <- sample((1:64), 640, replace=T) x.bs <- data[in.star,] avg.no <- mean(x.bs[x.bs[,2]==1,1]) avg.yes<- mean(x.bs[x.bs[,2]==2,1]) diff.bs <- c(diff.bs,avg.no-avg.yes) } # compare the results with fixed and random resampling # random is a bit longer quantile(diff.fixed.bs,c(.025,.975)) quantile(diff.bs,c(.025,.975)) boxplot(diff.fixed.bs, diff.bs) # --- Trimmed mean, median, mean trimmed.mean <- function(x) { s <- sort(x); n <- length(x); trim <- ceiling(.05 * n); # 5% off each half mean(s[trim:(n-trim)]) } contaminated.hip <- c(hip,10); hist(contaminated.hip) avg.bs <- med.bs <- trim.bs <- c() for (b in 1:1000) { hip.bs <- sample(contaminated.hip,length(contaminated.hip),replace=T) avg.bs <- c(avg.bs,mean(hip.bs)) med.bs <- c(med.bs,median(hip.bs)) trim.bs <- c(trim.bs,trimmed.mean(hip.bs)) } # about the same if no outlier, very different if one boxplot(avg.bs, trim.bs, med.bs, names=c("Avg", "Trim", "Med")) # get BS std error for each apply(cbind(avg.bs,trim.bs,med.bs),2, sd)