# Demo code for SPIDA 2005 lecture in Toronto # Start with the osteoporosis example by reading in the # data. You might need to add a path to this command on your # system. I put the data file in my 'working directory.' osteo <- read.table("osteo.txt", header=T) # --- pull out the hip data (lots of ways to get at it) names(osteo) osteo[,"hip"] attach(osteo) hip # --- kernel density estimate plot(density(hip)) points (hip,rep(0,length(hip))) # --- confidence interval for the average hip score summary(hip) mean(hip); sd(hip) mean(hip) + 2 * c(-1,1) * sd(hip)/sqrt(length(hip)) # Bootstrap standard error, without simulation (sqrt(63/64) * sd(hip)) / sqrt(64) # --- Bootstrap version of confidence interval for average avg.bs <- c() for(b in 1:2000) { yStar <- sample(hip,64,replace=T); avg.bs <- c(avg.bs, mean(yStar)) } sd(avg.bs) # this is the simulated BS std error plot(density(avg.bs)) hist(avg.bs, breaks=25) # BS sampling dist quantile(avg.bs,c(.025,.975)) # pctile interval ### ------------------------------------------------------------------ ### Bootstrap test ### ------------------------------------------------------------------ # --- Osteoporosis data: CI for difference in averages hip.no <- hip[ estrogen=="no" ]; n.no <- length(hip.no) hip.yes <- hip[ estrogen=="yes"]; n.yes <- length(hip.yes) # t tests and CI, without and with equal variance assumption t.test(hip.no, hip.yes) t.test(hip.no, hip.yes, var.equal=T) # --- Center the two groups under null at grand mean hip.no.0 <- hip.no - mean(hip.no) + mean(hip) hip.yes.0 <- hip.yes - mean(hip.yes) + mean(hip) # --- Sample from each, compare diffs <- c() for (i in 1:1000) { mean.no <- mean(sample(hip.no.0, n.no, replace=T)) mean.yes <- mean(sample(hip.yes.0,n.yes,replace=T)) diffs <- c(diffs, mean.no - mean.yes) } hist(diffs, breaks=20, probability=T); lines(density(diffs)) length( diffs[abs(diffs)>0.352] ) # count number larger than obs, BS p-value ### ------------------------------------------------------------------ ### Trimmed mean ### ------------------------------------------------------------------ trimmed.mean <- function(x) { s <- sort(x); n <- length(x); trim <- ceiling(.10 * n); # 10% off each tail of data mean(s[trim:(n-trim)]) } avg.bs <- med.bs <- trim.bs <- c() for (b in 1:2000) { hip.bs <- sample(hip,length(hip),replace=T) avg.bs <- c(avg.bs,mean(hip.bs)) med.bs <- c(med.bs,median(hip.bs)) trim.bs <- c(trim.bs,trimmed.mean(hip.bs)) } sd(trim.bs) quantile(trim.bs, c(.025,.975)) hist(trim.bs,breaks=20,probability=T); lines(density(trim.bs)) ### Contaminated data with outlier contaminated.hip <- c(hip,8); hist(contaminated.hip, breaks=20) avg.bs <- med.bs <- trim.bs <- c() for (b in 1:200) { hip.bs <- sample(contaminated.hip,length(contaminated.hip),replace=T) avg.bs <- c(avg.bs,mean(hip.bs)) med.bs <- c(med.bs,median(hip.bs)) trim.bs <- c(trim.bs,trimmed.mean(hip.bs)) } sd(trim.bs) quantile(trim.bs, c(.025,.975)) hist(trim.bs,breaks=20,probability=T); lines(density(trim.bs)) # get percentile interval for each pct.interval <- function(x) { quantile(x,c(.025,.975)) } apply(cbind(avg.bs,trim.bs,med.bs),2, pct.interval) # see relationship among estimators (smaller B if saving plot) pairs(cbind(avg.bs, trim.bs, med.bs)) ### ------------------------------------------------------------------ ### Ratio of averages, earnings data ### ------------------------------------------------------------------ # read data file earn <- read.table("earnings.txt", header=T) summary(earn) attach(earn) earn.ne <- earnings[region=="NE"] earn.mw <- earnings[region=="Midwest"] # Terms for taylor series m1 <- mean(earn.ne); m2 <- mean(earn.mw) n1 <- length(earn.ne) ; n2 <- length(earn.mw) s1 <- sd(earn.ne) ; s2 <- sd(earn.mw) m1/m2 sqrt(s2^2/(n1 * m2^2) + s2^2 * m1^2 / (n2 * m2^4)) # bootstrap ratio of means ratio <- c() for (b in 1:2000) { m1.bs <- mean(sample(earn.ne,n1,replace=T)) m2.bs <- mean(sample(earn.mw,n2,replace=T)) ratio <- c(ratio,m1.bs/m2.bs) } sd(ratio) quantile(ratio, c(.025,.975)) hist(ratio,breaks=20,probability=T); lines(density(ratio)) ### ------------------------------------------------------------------ ### Outliers in regression, florida county-level 2000 election results ### ------------------------------------------------------------------ # read data file florida <- read.table("florida2000.txt", header=T) summary(florida) attach(florida) plot(Reg.Reform, Buchanan) identify(Reg.Reform, Buchanan, County) regr <- lm(Buchanan ~ Reg.Reform) summary(regr) abline(regr) # figure out how to get to regr results names(regr) # list of the things in a regression regr$coefficients # extract one of these things named "coefficients" as.vector(regr$coefficients) # convert it into numbers # now we can do random resampling # first make an array of data to sample data <- data.frame(cbind(Reg.Reform,Buchanan)) n <- length(Buchanan) # make a container to hold the slope and intercept, then sample counties regr.bs <- matrix(nrow=2000, ncol=2) for(b in 1:nrow(regr.bs)) { in.star <- sample(1:n,n,replace=T) data.star <- data[in.star,] regr.star <- lm(Buchanan ~ Reg.Reform, data=data.star) regr.bs[b,] <- as.vector(regr.star$coefficients) } regr.bs[1:10,] # estimates of first 10 bs samples sd(regr.bs) quantile(regr.bs[,2], c(.025,.975)) hist(regr.bs[,2],breaks=20,probability=T); lines(density(regr.bs[,2])) # do residual resampling fit <- regr$fitted.values; res <- regr$residuals; regr.bs.fixed <- matrix(nrow=2000, ncol=2) for(b in 1:nrow(regr.bs.fixed)) { in.star <- sample(1:n,n,replace=T) yStar = fit + res[in.star] regr.star <- lm(yStar ~ Reg.Reform) regr.bs.fixed[b,] <- as.vector(regr.star$coefficients) } sd(regr.bs.fixed) quantile(regr.bs.fixed[,2], c(.025,.975)) hist(regr.bs.fixed[,2],breaks=20,probability=T,main="Histogram of regr.bs[,2]"); lines(density(regr.bs.fixed[,2])) ### ------------------------------------------------------------------ ### Longitudinal Example ### ------------------------------------------------------------------ data <- read.table("sales.txt", header=TRUE) summary(regr <- lm(data[,"sales"] ~ factor(data[,"i"]) + data[,"un"] + data[,"di"])) # --- Compute residual corr res <- residuals(regr) num <- den <- 0 for (i in 1:25) { for(t in 2:8) { index <- (i-1)*8 num <- num + res[index+t]*res[index+t-1] den <- den + res[index+t-1]^2 }} num/den # --- Sandwich estimator... start by building X matrix dummy <- matrix(0, nrow=200, ncol=24) for(col in 1:24) { first.row <- (col-1)*8; for(j in 1:8) { dummy[first.row+j,col]=1 } } X <- as.matrix(cbind(1,dummy,data[,c("un","di")])) XpX <- (t(X)) %*% X XpXi <- solve(XpX) # inverse of matrix # --- Build up a block-diagonal matrix for errors S <- matrix(0,nrow=200, ncol=200) for(i in 1:25) { base <- (i-1)*8 for(j in 1:8) { for (k in 1:8) S[base+j,base+k] <- res[base+j]*res[base+k] }} XpSX <- (t(X)) %*% S %*% X # --- Compute the sandwich SE sand.se <- sqrt(diag(XpXi %*% XpSX %*% XpXi)) # --- Compare to the standard OLS formula b <- XpXi %*% (t(X) %*% data[,"sales"]) ols.se <- sqrt( (res %*% res) * (diag(XpXi)) / (200-27)) (cbind(ols.se, sand.se))[c(1,2,26,27),] # --- Now try bootstrapping, resampling districts (this takes a while!) regr.bs <- matrix(nrow=500,ncol=27) for(b in 1:nrow(regr.bs)) { in.star <- rep(sample(8*(0:24),25,replace=T),rep(8,25))+rep(1:8,25) data.star <- data[in.star,] data.star[,"i"] <- rep(1:25,rep(8,25)) regr.star <- lm(data.star[,"sales"] ~ factor(data.star[,"i"]) + data.star[,"un"] + data.star[,"di"]) regr.bs[b,] <- as.vector(regr.star$coefficients) } # --- Get the SE for intercept, one effect, and the last two cols sd(regr.bs[,c(1,2,26,27)]) ### ### Double bootstrap ### pct.interval <- function(x,B) { n<-length(x); s2.star<-c(); for (b in 1:B) s2.star <- c(s2.star, var(sample(x,n,replace=T))) quantile(s2.star,c(0.05,0.95)) } # --- Do it for a sample from normal intervals <- matrix(nrow=200, ncol=2) for (i in 1:nrow(intervals)) { intervals[i,] <- pct.interval(rnorm(20), 1000) } cover <- intervals[,1]<1 & 1 < intervals[,2] nrow(intervals[cover,])/nrow(intervals) # --- Do it for a sample from the sample itself, with extra percentiles to aid calibration pct.interval <- function(x,B) { n<-length(x); s2.star<-c(); for (b in 1:B) s2.star <- c(s2.star, var(sample(x,n,replace=T))) quantile(s2.star,c(0.01, 0.015, 0.02, 0.03, 0.04, 0.05, 0.95, 0.96, 0.97, 0.98, 0.985, 0.99)) } len <- 20; data <- rnorm(len) intervals <- matrix(nrow=200, ncol=12) for (i in 1:nrow(intervals)) { intervals[i,] <- pct.interval(sample(data,len,replace=T), 1000) } s2 <- ((len-1)/len) * var(data) cover <- intervals[,1]