# Demo code for Bootstrap 3 # --- Access a collection of supplied routines search() library(boot) search() # --- Example of bootstrapping correlation 15 law schools law <- read.table(":Data:lsat.dat",header=T) attach(law) plot(lsat, gpa) cor(lsat, gpa) # glue together so can look at pairs data <- cbind(lsat, gpa) data[1,] # first row cor(data) # correlation matrix cor(data[,1], data[,2]) # correlation value # fancy placeholder -- faster code, less mem use, *more* careful cor.bs <- vector(mode="numeric", length=1000) for(b in 1:length(cor.bs)) { data.star <- data[sample(1:15,15,replace=T),] cor.bs[b] <- cor(data.star[,1],data.star[,2]) } cor.bs[1:10] # correlation of first 10 bs samples hist(cor.bs,probability=T); lines(density(cor.bs)) quantile(cor.bs,c(.025,.975)) # --- Load and attach data on Florida voting in 2000 US election Florida <- read.table(":Data:florida2000.dat", header=T) names(Florida) labels(Florida) attach(Florida) # --- Example of regression, with an outlier (point in row 50) plot(regReform,buchanan) identify(regReform,buchanan) plot(regReform,buchanan) # nicer labels identify(regReform,buchanan,labels=dimnames(Florida)[[1]]) # give data set explicitly for labels regr <- lm(buchanan ~ regReform, data=Florida) summary(regr) abline(regr) par(mfrow=c(2,2)); plot(regr); par(mfrow=c(1,1)) plot(regReform,buchanan) abline(regr) regr.del <- lm(buchanan~regReform,data=Florida[-50,]) abline(regr.del, col="red") # figure out how to get to regr results names(regr) # list of the things in a regression regr$coefficients # extract one of these things named "coefficients" as.vector(regr$coefficients) # convert it into numbers # now we can do the resampling # first make an array of data to sample data <- data.frame(cbind(regReform,buchanan)) n <- length(buchanan) # then make a container to hold the slope and intercept # be patient! regr.bs <- matrix(nrow=1000, ncol=2) for(b in 1:nrow(regr.bs)) { in.star <- sample(1:n,n,replace=T) data.star <- data[in.star,] regr.star <- lm(buchanan ~ regReform, data=data.star) regr.bs[b,] <- as.vector(regr.star$coefficients) } regr.bs[1:10,] # estimates of first 10 bs samples # its easy, but does it make sense to do a confidence interval??? hist(regr.bs[,2],probability=T); lines(density(regr.bs[,2])) quantile(regr.bs[,2],c(.025,.975)) # what does this plot mean? plot(regr.bs) # --- Got to make it go faster! my.regr <- function(x,y) { slope <- cor(x,y) * sd(y)/sd(x) inter <- mean(y) - slope * mean(x) c(inter,slope) } # check that this version gives same result as R's lm(buchanan~regReform, data=Florida) my.regr(data[,1], data[,2]) # make a container and let it go ... much faster now regr.bs <- matrix(nrow=1000, ncol=2) for(b in 1:nrow(regr.bs)) { in.star <- sample(1:n,n,replace=T) data.star <- data[in.star,] regr.star <- my.regr(data.star[,1],data.star[,2]) regr.bs[b,] <- regr.star } regr.bs[1:10,] # estimates of first 10 bs samples hist(regr.bs[,2],probability=T); lines(density(regr.bs[,2])) # --- Alternative residual resampling method # start with the 'population' model regr <- lm(buchanan ~ regReform, data=Florida) # save model residuals and fit values x <- regReform yHat <- as.vector(regr$fitted.values) resid <- as.vector(regr$residuals) regr.bs <- matrix(nrow=1000, ncol=2) for(b in 1:nrow(regr.bs)) { resid.star <- sample(resid,n,replace=T) y.star <- yHat + resid.star regr.bs[b,] <- my.regr(x,y.star) } regr.bs[1:10,] # estimates of first 10 bs samples hist(regr.bs[,2],probability=T); lines(density(regr.bs[,2])) quantile(regr.bs[,2],c(.025,.975)) # --- Robust regression alternative # show data, with ols line with and without palm beach plot(regReform,buchanan) abline(regr) regr.del <- update(regr,subset=-50) abline(regr.del, col="red") # open a library of additional routines library(MASS) # MASS contains a tool for fitting robust regressions (Fox appendix on web) # the merged 'purple' line shows the two agree rob.regr <- rlm(buchanan ~ regReform, data=Florida) summary(rob.regr) abline(rob.regr, col="blue") names(rob.regr) plot(1:n, rob.regr$w) identify(1:n, rob.regr$w, labels=dimnames(Florida)[[1]]) # patience, for each robust regression is a sequence of weighted regressions rob.regr.bs <- matrix(nrow=500, ncol=2) for(b in 1:nrow(rob.regr.bs)) { in.star <- sample(1:n,n,replace=T) data.star <- data[in.star,] rob.regr.star <- rlm(buchanan ~ regReform, data=data.star) rob.regr.bs[b,] <- as.vector(rob.regr.star$coefficients) } # bootstrap gives rather different SE's as well as CI for the slope summary(rob.regr) # see asymtotic SE from robust fit apply(rob.regr.bs, 2, sd) # BS std error much larger for slope quantile(rob.regr.bs[,2],c(.025,.975)) # BS distribution of slope estimates is very skewed. hist(rob.regr.bs[,2],probability=T); lines(density(rob.regr.bs[,2])) # --- Dealing with heteroscedasticity. # work with data omitting Palm Beach detach(Florida) florida <- Florida[-50,] attach(florida) plot(regReform,buchanan) regr <- lm(buchanan ~ regReform) summary(regr) abline(regr) par(mfrow=c(2,2)); plot(regr); par(mfrow=c(1,1)) plot(fitted(regr), residuals(regr)) abline(0,0) # count number of samples for each case count.dups <- function(x) { n <- length(x); counts <- as.vector(rep(0,n)); for(i in 1:n) counts[x[i]] <- 1+counts[x[i]] counts } count.dups(c(1,2,3,1,2,6)) data <- cbind(regReform,buchanan) # random resampling preserves the lack of constant variance which <- sample(1:66,66,replace=T) plot(data, cex=count.dups(which)) data.bs <- data[which,] # residual/fixed resampling blurs the variance (not good!) res <- residuals(regr) fit <- fitted(regr) which <- sample(1:66,66,replace=T) plot(regReform, fit + res[which]) # compare the results of bootstrap resampling (ols, without palm beach) B <- 1000; random <- matrix(0,B,2) for (b in 1:B) { which <- sample(1:66,66,replace=T); data.bs <- data[which,]; random[b,] <- my.regr(data.bs[,1], data.bs[,2]) } fixed <- matrix(0,B,2) for (b in 1:B) { which <- sample(1:66,66,replace=T); fixed[b,] <- my.regr(regReform, fit+res[which]) } # random gives much less SE to intercept (where var is small) par(mfrow=c(1,2)); boxplot(random[,1],fixed[,1]); boxplot(random[,2],fixed[,2]); par(mfrow=c(1,1)) # CIs for intercept quantile(random[,1],c(.025,.975)) quantile( fixed[,1],c(.025,.975)) # slopes quantile(random[,2],c(.025,.975)) quantile( fixed[,2],c(.025,.975)) # --- Smoothing and the bootstrap plot(regReform, buchanan); lines (lowess(regReform, buchanan), col="red"); for (i in 1:100) { which <- sample(1:66,66,replace=T); lines(lowess(regReform[which],buchanan[which])); }