#################################################################### ######## STATISTICS 541, GRADUATE LEVEL STATISTICAL METHODS ######## #################################################################### #- Some quiz questions about R. # * What are the basic data types? # * For those who know C, what is the difference in handling text data # between R and C? # * What is a vector? a matrix? an array? # * What are the ways of indexing a vector? # * What happens generally if two vectors are mismatched in length # when they should be of the same length? # * What happens when a vector gets indexed with a value outside # 1:length(vector)? # * Same question for matrices in either dimension. # * What happens when a matrix or array gets indexed like a vector? # * What happens if a 3-D array gets indexed like a matrix? # * What happens generally when R expects a certain basic data type # but is given the wrong type? As in paste(1,2, sep="") paste(T,2, sep="") "1" + "2" T + 3 3 | T # * How would you program a discrete Brownian motion simulation that steps # up/down by 1 with equal probability? (animation) plot(cumsum(sample(c(-1,1),size=10000,replace=T)), type="l") # * How would you design a random process that is always the highest value # seen to date in a discrete Brownian motion simulation? plot(cummax(cumsum(sample(c(-1,1),size=10000,replace=T))), type="l") #- R: Remaining material of "Stat-541-R-intro.R" # * For details, see 'help()' # * Two more composite data types # . Lists: 'list()' and 'unlist()' # . Data frames: 'data.frame()' # - purpose # main use of lists: statistical models # main use of data frames: statistical data tables with # cols of variable basic data type # - coercion: data frame <-> matrix # - reading data frames: 'read.table()', 'read.csv()' # - column and row names # - implemented as lists of vectors # * Flow control: # . conditionals: 'if'-'else', but also the 'ifelse()' function # . loops: for, repeat, while # . implicit looping with # - 'apply()' for looping over matrix and array dimensions # - 'lapply()' for looping over lists # - 'tapply()' for over 'ragged data' (unequal number of obs. per case) # * String manipulations: needed for HW 3 # - 'grep()', 'nchar()', 'paste()', 'strsplit()', 'substring()' # - a use of 'grep()' in R: the 'apropos()' function # apropos("^q") # apropos("^r.+s$") # * Distributions: # . two types: # - continuous: unif, norm, lnorm, t, cauchy, f, chisq, exp, gamma, beta, logis,... # - discrete: binom, nbinom, pois, hyper, geom, multinom # - most have r, q, p, d version: # . pseudo-random numbers: rnorm(n=5, m=10, s=3) # . quantiles: qnorm(p=.5, m=10, s=3) # . probabilities: pnorm(q=5, m=6, s=1) # . densities: dnorm(x=1, m=6, s=2) # - do not forget: 'sample()' for sampling from discrete # distributions with and w/o replacement # - simulation of new distributions is reduced to simulation of one # of the above. # Recipe for sampling from a univariate distribution with cdf F? # - reproducibility of simulations: setting the seed # * Plotting: # - principle: iterative debugging and refinement # - Example: # . Fine tuning a single plot: x <- runif(1000); y <- x^2 + rnorm(1000, s=.1) plot(x, y) plot(x, y, pch=16) plot(x, y, pch=16, cex=.5) plot(x, y, pch=16, cex=.5, main="My first R plot") # . Multiple plots: par(mfrow=c(2,3)) for(i in 1:6) { x <- runif(1000); y <- x^2 + rnorm(1000, s=.1) plot(x, y, pch=16, cex=.5, main=i) } # . New window of given size: windows(width=8, height=6) par(mfrow=c(2,3), mar=c(3,3,3,1), mgp=c(1.8,.5,0)) for(i in 1:6) { x <- runif(1000); y <- x^2 + rnorm(1000, s=.1) plot(x, y, pch=16, cex=.5, main=paste("Plot",i)) } # . PDF device driver for inclusion in papers: pdf(file="My First R Plot.pdf", width=6, height=4.5) par(mfrow=c(2,3), mar=c(3,3,3,1), mgp=c(1.8,.5,0)) for(i in 1:6) { x <- runif(1000); y <- x^2 + rnorm(1000, s=.1) plot(x, y, pch=16, cex=.5, main=paste("Plot",i)) } graphics.off() # - many more plotting functions to build up complex plots: # 'text()', 'lines()', 'points()', 'polygon()', 'rect()', 'legend()', ... # ################################################################# # # LINEAR MODELS: RECAP AND MBA STATISTICS # # The feature of linear methods that makes them special is their # # INTERPRETABILITY and COMPUTATIONAL SIMPLICITY # ################# # # TOY DATA: Accounting Rates versus Market Rates (from Myers, p.16) URL <- "http://www-stat.wharton.upenn.edu/~buja/STAT-541/accounting.dat" acct <- read.table(URL, header=T) # Sanity checking: names(acct) dim(acct) # Plotting: windows(width=9, height=9) plot(acct[,2:1], pch=16, xlim=range(acct), ylim=range(acct)) lines(c(0,30),c(0,30)) # Useful because the two variables measure the 'same'. # (Note we're equalizing the x- and y-range for 45 degree comparison.) # Here is a closer look at the data: for(i in 1:10) identify(acct[,c(2,1)], labels=rownames(acct), cex=.7) # ################ # # LINEAR REGRESSION CONCEPTS: # # - a quantitative response/dependent_variable/y-variable # # to be predicted/explained/approximated # # - by a linear function of a number of quantitative # predictors/regressors/carriers/independent_variables/x-variables # # - Assumed model: # # y = beta0 + beta1*x1 + beta2*x2 +... betap*xp + normal error N(0,sigma^2) # # Same: # # y ~ N(beta0 + beta1*x1 + beta2*x2 +... betap*xp, sigma^2) (stochast. indep.) # # This is a GENERATIVE MODEL in the sense that # - for a given set of predictor columns x1,...,xp and # - assumed values of betaj's and sigma, # we can simulate possible responses y. # For example for the accounting data: x <- acct[,"Acct.Rate"] beta0 <- 0.85 beta1 <- 0.60 sigma <- 5.00 y <- beta0 + beta1**x + rnorm(length(x), m=0, s=sigma) windows(w=10,h=5); par(mfrow=c(1,2)) plot(y ~ x, pch=16, main="Simulated", xlab="Acct.Rate", ylab="Mkt.Rate") plot(acct[,2:1], pch=16, main="Actual", xlab="Acct.Rate", ylab="Mkt.Rate") # # Terms for linear models: # simple linear regression: just one predictor variable x # population/true/model parameters: # beta0 = population/true intercept # betaj = population/true slopes (j>0) # sigma = error standard deviation # sample estimates: # b0 = estimate of intercept beta0 # bj = estimate of slope betaj # yhat = b0 + b1*x1 + b2*x2 +... bp*xp ("fits") # RSS = sum((y-yhat)^2 "Residual Sum of Squares" # s = RMSE = sqrt(RSS/(N-p-1)) ("Root Mean Square Error") # = estimate of sigma # R2 = 1 - RSS/sum((y-mean(y))^2) = cor(y,yhat)^2 ("R Square") # # ################ # # COMPUTING SIMPLE REGRESSION IN R: # # Regressing y=Mkt.Rate on x=Acct.Rate: acct.lm <- lm(Mkt.Rate ~ Acct.Rate, data=acct) # # Comments: # - 'lm()' is the linear models function # - Its first argument is a 'model formula' of the form 'y ~ x'. (More below.) y ~ x # Model formulas are R objects ('first class citizens'). class(y ~ x) # - The second argument must be a data frame or a matrix. # - Btw, plotting can also be done with model formulas. This regenerates the plot: plot(Mkt.Rate ~ Acct.Rate, data=acct, pch=16, xlim=range(acct), ylim=range(acct) # - The model formula terms must be column names of the data frame. # - 'lm()' returns a list that describes the model. is.list(acct.lm) # Thus, models are also 'first class citizens'. # # Unraveling the lm model: # There are several auxiliary functions that get at the lm information, # but 'print()' is not one of them: print(acct.lm) # To see what's in it, ask for the names of its elements: names(acct.lm) # Hence one can ask for components in principle as follows: acct.lm$coefficients acct.lm$residuals # But this is not how we are supposed to get at the model information. # Here is how one prints the conventional regression summary: summary(acct.lm) # For more control, you can assign the summary information: acct.lm.sm <- summary(acct.lm) # Again it's a list: is.list(acct.lm.sm) # Its components are named: names(acct.lm.sm) # In particular slopes, stderrors, t-values, p-values are in: acct.lm.sm$coeff # component names need to be specified only to uniqueness acct.lm.sm$coeff[,"t value"] # to just extract the t-values acct.lm.sm$coeff[,3] # same thing # The raw intercept/slope are in either of acct.lm$coeff coef(acct.lm) # and they can be used to draw the fitted line specified by a/b=intercept/slope: abline(coef=coef(acct.lm)) # # Summary: For simple regressions you want look at the following items. acct.lm <- lm(Mkt.Rate ~ Acct.Rate, data=acct) summary(acct.lm) plot(Mkt.Rate ~ Acct.Rate, data=acct) abline(coef=coef(acct.lm)) # # Interpretation of estimation and inference: # Traps: causality, estimates vs parameters, averages # interpretation of b1? # "..." # interpretation of b0? # ("...") # interpretation of stderr(b1)? !!!!!!!!!!!!!!!!!!!!!!!!!!! # ("...") # interpretation of t(b1)? # ("...") # interpretation of p-value(b1)? # ("...") # interpretation of s? (R's "Resid. stderr" is a bad term) # ("...") # interpretation of R Square? # ("...") # interpretation of p-value? # ("...") # # The exclamation signs above should alert you that std. errors # are among the most fundamental and most difficult to understand # concepts of all of statistics. # A few points: # - stderr(b1) is really an ESTIMATE of sigma(b1). # - sigma(b1) is the TRUE standard deviation of b1. # One should really refer to sigma(b1) and stderr(b1) # as the standard error and the standard error estimate. # - sigma(b1) measures the dispersion of b1 ACROSS DATASETS # - These datasets are assumed to originate from the same underlying model. # - The difficulty is to understand that we are holding one dataset # and one estimate b1 in hand, but that we keep in mind that b1 # would have been slightly different in different data collections. # - Sometimes it is difficult to conceive of different data collections. # In this case, think in terms of different but similar worlds. # (E.g., stock market time series.) # # (We'll worry about diagnostics another time. # We now move on to certain non-linear models # useful in the business world. They can all be # reduced to linear models.) # # ################ # # MBA STATISTICS: # # Before taking stats, MBAs take a math brush-up. # They learn some of the following concepts: # # 1) exponential growth/decay # 2) logarithmically diminishing returns # 3) constant elasticity # 4) total cost vs ave cost; fixed cost vs variable cost # # General Comments on 1)-3): # # - Models 1)-3) are based on logarithmic transformations # of y, of x, and of both. # # - Logarithmic transformation should be considered as a possibility # whenever the following two are the case: # * the variable (x or y) takes on only positive values. # * changes/differences in the variable are commonly # expressed in percentages/multiples, not numeric differences. # Such variables are said to be on a "RATIO SCALE". # By contrast, variables # * that can take on positive and negative values and # * whose changes/differences are expressed in numeric differences # are said to be on an "INTERVAL SCALE". # Examples of each? # Examples that are neither? # (=> generalized exponential models, stay tuned...) # The point of logarithmic transforms is to map ratio scales to interval scales!!! # # - Linear Approximation of Logarithms: # log(1+z) ~ z for small z (convention: small means |z|<.1) # Consequence: # log(y*(1+z)) = log(y) + log(1+z) ~ log(y) + z # # ---------------------------------------------- # | RULE: a difference in log(y) by z (|z|<0.1) | # | corresponds approximately to | # | a difference in y by z*100% | # ---------------------------------------------- # # Example: a difference in log(y) of 0.05 ~ a difference in y of ... # a difference in x of 1% ~ a difference in log(x) of .... # # IMPORTANT: This holds only for the natural log!!!! # - Caveat about logarithmic transforms of y: # An estimate of log(y) yields an estimate of y by exponentiating, # but: mean(log(y)) != log(mean(y)) # In fact, one value is systematically lower than the other; which? # yet: median(log(y)) == log(median(y)) # In general, if "ave. est." is interpreted as "median est.", # one can arbitrarily transform with monotone functions. ################ # # 1) EXPONENTIAL GROWTH/DECAY # # Example from the crazy dotcom years: growth of number of web servers URL <- "http://www-stat.wharton.upenn.edu/~buja/STAT-541/web-servers.dat" webs <- read.table(URL, header=T) webs <- read.table("data/web-servers.dat", header=T) colnames(webs) [1] "Web.Servers" "Log.Web.Servers" "Yrs.since.Jan.97" [4] "Time" "Year" "Month" [7] "Time.of.Exp.Growth" plot(webs[,c("Time","Web.Servers")], type="l", col="blue") points(webs[,c("Time","Web.Servers")],pch=16, cex=.5) # This is not all the way exponential, but it might be piecewise. # Extract the span in which exponential growth was supposed # to have happened, 1997 - end of 2000 webs.red <- webs[as.logical(webs[,"Time.of.Exp.Growth"]), c("Yrs.since.Jan.97","Web.Servers")] colnames(webs.red) <- c("year","nserv") plot(webs.red, type="b", pch=16, cex=.5) # Zero is Jan 1997. # Exponential growth calls for logging the response: plot(webs.red, type="b", log="y", pch=16, cex=.5) # Or: plot(webs.red[,1], log10(webs.red[,2]), ylab="Log10", type="b", xlab="yr", pch=16, cex=.5) # So we model log(y) ~ b0 + b1 * x + error abline(lm(log10(nserv) ~ year, data=webs.red), lwd=2) # # Meaning: summary(lm(log(nserv) ~ year, data=webs.red)) ---- Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 13.408146 0.021395 626.7 <2e-16 *** year 0.900178 0.008834 101.9 <2e-16 *** ---- # log(nserv) ~ 13.4 + 0.9 * year + error # nserv ~ exp( 13.4 ) * exp( 0.9 * year ) * exp( error ) # ^^^^^^^^^^^ exp( 0.9 )^year * " # 660000 * 2.46^year * " plot(webs.red, pch=16, xlab="yr", ylab="nserv") lines(x=webs.red[,1], y=660000*2.46^webs.red[,1]) # Not perfect but adequate for now because we are more # interested in interpretation. # # Interpretation: We start with 660000 servers in Jan 1997. # From there on we have a yearly increase # by a factor of 2.46, or 146%. # # Using linear approximation for log(y)-models: # log(yhat) = b0 + b1*x # difference in x by dx # => difference in log(yhat) by b1*dx # => "difference" in yhat by b1*dx*100% if |b1*dx|<0.1 # # Web server data: # # b1 = .9 = log-increase in webservers per year # ==> dx = 1yr is much too large for linear approximation. # # try dx = 1 month: b1*dx = b1/12 = 0.075 # ==> small enough for linear approximation # b1/12*100 = 7.5% = (est. ave.) percentage increase per month # [compounded this should amount to a 146% yearly increase, # but here we see the approximation error: # this monthly rate that compounds to 1.075^12 [1] 2.381780 # instead of 2.46. The more exact rate would be 2.46^(1/12) [1] 1.077899 # or closer to a 7.8% increase per month # Still, linear approximation for log-differences |z|<.1 are commonly used. # ################ # # 2) LOGARITHMICALLY DIMINISHING RETURNS # # Example: Wine display space; the more space, the more sales, # but: going from 1 ft to 2 ft has probably a greater effect # than going from 9 ft to 10 ft URL <- "http://www-stat.wharton.upenn.edu/~buja/STAT-541/wine.dat" wine <- read.table(URL, header=T) colnames(wine) [1] "display.feet" "sales" # Parallel sort according to ft for plotting curves in order of predictor: wine <- wine[order(wine[,1]),] # Response: number of bottles sold summary(wine) # plot(wine[,1], wine[,2], pch=16, xlab="Display ft", ylab="Wine Sales") # # Idea: There may not be a constant increase in sales per unit increase # but per percentage increase in display space. # ==> take log(x). plot(log(wine[,1]), wine[,2], pch=16, xlab="log Display ft", ylab="Wine Sales") # Looks nearly ok. Ok enought to try a linear model: wine.lm <- lm(sales ~ log(display.feet), data=wine) abline(wine.lm) # # Meaning of the line: summary(wine.lm) #---- Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 83.560 14.413 5.797 6.24e-07 *** log(display.feet) 138.621 9.834 14.096 < 2e-16 *** #---- # # Fitted Equation: (est. ave.) sales ~ 140 * log(display.feet) + 84 # plot(wine, pch=16, xlab="log Display ft", ylab="Wine Sales") lines(x=wine[,1], y=84+139*log(wine[,1])) # # Magic of logs again: # 10% difference in display.feet # => (est. ave.) difference in sales is about 140 * 0.1 = 14 bottles # # General: fit y ~ b0 + b1*log(x) # 10% difference in x => ~b1/10 difference in yhat. (est. ave.) # 1% difference in x => ~b1/100 difference in yhat. ( " ) # ################ # # 3) CONSTANT ELASTICITY: # # Example: Fedex has(had) a delivery service called "CourierPak". # At different prices, different numbers of CourierPak deliveries # were requested by customers. URL <- "http://www-stat.wharton.upenn.edu/~buja/STAT-541/Courier.dat" courier <- read.table(URL, header=T) colnames(courier) plot(courier, pch=16) # One could fit a linear model for starters: summary(lm(Volume ~ Price, courier)) abline(lm(Volume ~ Price, courier)) # Looks quite good, but one doesn't like straight lines for a couple # of reasons: # 1) Lines cross into negatives, which does not happen (Vol>0). # The minimum volume is zero. # 2) More importantly, by economic theory it is not a unit increase # in Price that is associated with a constant decrease in Volume. # It is a PERCENTAGE increase in Price that causes a PERCENTAGE decrease # in volume. # summary(lm(log(Volume) ~ log(Price), courier)) #---- Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 8.04930 0.20145 39.957 < 2e-16 *** log(Price) -0.32732 0.07961 -4.112 0.000397 *** #---- # The raw Price->Volume fit and the log(Price)->Log(Volume) perform # about the same, but the world prefers the latter. Interpretation: # # log(Volume) ~ 8 - 1/3*log(Price) + error # # Price difference of 1% ~ Volume difference of ... ? # Price difference of 1% # => log(Price) difference of about 0.01 # => log(Volume) difference of about (-1/3)*0.01 (|.|<0.1) # => Volume difference of about -1/3 % # In general: 1% difference in x ~ difference in yhat of b1 % # This is called "constant price elasticity". # # (Assumes |b1|<0.1. If this is not the case, choose a smaller dx: # dx=dPrice=0.001 => dy=dVolume=b1*0.1% .) # #### # # SUMMARY of 1)-3): # # 0) y = b0 + b1 * x + eps LINEAR ASSOCIATION # constant difference in x ~ constant difference in yhat # # 1) log(y) = b0 + b1 * x + eps EXPONENTIAL GROWTH/DECAY/DEPRECIATION # constant difference in x ~ fractional (%) difference in yhat # # 2) y = b0 + b1 * log(x) + eps DIMINISHING RETURNS (for b1>0) # fractional (%) difference in x ~ constant difference in yhat # # 3) log(y) = b0 + b1 * log(x) + eps CONSTANT ELASTICITY # fractional (%) difference in x ~ fractional (%) difference in yhat # ################ # # 4) FIXED COSTS/VARIABLE COSTS: # # Example: A manufacturer of 'blocks' gets orders of certain # quantities and production costs. It is clear # that total production cost is mostly driven by # quantity. But the manufacturer would like to know # what other factors make some orders more and others # less expensive. # A simple approach would be # lm(Tot.Cost ~ Units + other factors) # ==> Tot.Cost ~=~ b0 + b1*Units + ... + error # Interpretation: # - b1: VARIABLE COST, or average production cost per unit # - b0: FIXED COST for any order (usually setup cost) # - The error is constant for all sizes of orders. (Unrealistic?) # # One could argue, though, that whatever factors there are, # they are more likely to act at the per-unit level: # amount of material per block, # difficulty of material, # extra features ordered for the blocks, ... # Their cost effects might multiply up with the number # of units ordered. This suggests modeling the Average Cost, # Ave.Cost = Tot.Cost / Units # as opposed to the Total Cost: # lm(Ave.Cost ~ other factors) # Total cost would then be obtained as # Tot.Cost = Ave.Cost * Units # But there is still a problem: The model might not contain # all the factors that make up unit cost. For one thing, # we should have a way to have fixed cost in the model as well. # This seems like a drawback of choosing Average Cost as response. # The problem can be fixed, though: # introduce an additional predictor X = 1/Units. # See what happens: # lm(Ave.Cost ~ 1/Units + other factors) # ==> Ave.Cost ~ b0 + b1/Units + ... + error # A prediction equation for Tot.Cost is obtained # by multiplying up with Units: # ==> Tot.Cost ~ b0*Units + b1 + ...*Units + error*Units # Interpretation: # - b0: VARIABLE COST # - b1: FIXED COST # - The error increases with the size of an order. # # Hence both models have interpretations of b0 and b1 # as fixed and variable costs, in reverse order. # But they differ in their error structure, and in # how other factors affect the Total Cost of the order. # # A data example: Cost is given as Ave.Cost. blocks <- read.table("Data/blocks_red.dat", header=T) colnames(blocks) plot(Ave.Cost ~ Units, data=blocks, pch=16) # Maybe there is a slightly negative slope: lower Ave.Cost # for more Units? Economies of scale? # As an aside, here is Tot.Cost, in $1000: plot(Ave.Cost*Units/1000 ~ Units, data=blocks, pch=16) # Certainly doesn't look like a constant error variance! # # There is a problem, though: an extremely small order # visible when plotting against 1/Units: plot(1/blocks[,"Units"], blocks[,"Ave.Cost"], pch=16) # Remove: sel <- 1/blocks[,"Units"] > .5 plot(1/blocks[!sel,"Units"], blocks[!sel,"Ave.Cost"], pch=16) # # Naive model for Tot.Cost=Ave.Cost*Units: summary(lm(Ave.Cost*Units ~ Units, blocks[!sel,])) Estimate Std. Error t value Pr(>|t|) (Intercept) 4146.470 865.000 4.794 3.22e-06 *** Units 22.149 1.658 13.356 < 2e-16 *** # Model for Ave.Cost: summary(lm(Ave.Cost ~ I(1/Units), blocks[!sel,])) Estimate Std. Error t value Pr(>|t|) (Intercept) 31.082 2.273 13.673 < 2e-16 *** I(1/Units) 1502.211 416.821 3.604 0.000397 *** # Typically Tot.Cost models have much higher R^2 # than Ave.Cost models, but this is irrelevant: # The high R^2 stems from the trivial proportionality with Units, # which the Ave.Cost model factors out. # Often, Ave.Cost models are more plausible and # yield comparable predictions when multiplied up to Tot.Cost. # # Btw, the estimates of fixed and variable costs # differ wildly between the two approaches. # Which one would you believe? # # Upcoming Homework: commercial real estate rents, # with the variables listed below. # Translation: Tot.Cost = 'RentTotal', Units = 'SqftLease'. # Practice for you: # - Which variables should be considered as fixed cost, # and which as variable cost? # - Write down a Tot Cost model and an Ave Cost model. # # As a criterion for distinguishing fixed and variable cost # predictors, keep the following in mind: # - fixed cost predictors affect the rent in terms of a # fixed $ amount, no matter what the size of the property is; # - variable cost predictors scale up with the size of the # property; they affect the rent in terms of percentages # of the total rent. # Another way to distinguish them: # Does a change in the predictor affect the rent in terms # - of the whole property or # - of a square foot of the property? # # With these considerations in mind, how would you # introduce the following variables in the two equations: # RentTotal = b0 + b1*SqftLease + b2*... # RentTotal/SqftLease = a0 + a1*1/SqftLease + a2*... # # VARIABLE NAME DESCRIPTION # RentTotal Total annual rent of the lease # SqftLease Size of the lease in square feet # FirmType Majority type of firms in the building # (doctors, legal, business, government, other) # Age Age of the building in years # Renovation Number of years since last renovation # Wiring Yes, if building has new wiring # Occupancy Fraction of offices that are rented # Leaselength Length of the lease in years # Renewable Yes, if the lease is renewable # Location One of three locations: center of city, old/new suburb # DistCity Distance to the center of the city center in miles # DistAirp Distance to the airport in miles # DriveAirp Distance to the airport in driving time # DistHosp Distance to nearest hospital in miles # FloorLease The (lowest) floor where the lease is located # FloorsBldg Number of floors in the building # SqftFloor Size of a floor in square feet # Elevator Number of elevators # Parking Number of executive parking spaces included in rental # Restaurant Yes, if the building has a restaurant # Exercise Yes, if building has a health club #### # # # MODEL FITTING IN R AND S: MODEL FORMULAS AND MODELS # # Predictor-response data are so pervasive that special-purpose # syntax was introduced to describe their modeling: model formulas. # These formulas are "citizens of first class" in R and S, that is, # they are objects that can be passed around like other objects. # The same holds for models, which are objects returned by model fitting # functions. So now we have the following list of object types: # - data such as numbers, strings, logicals, vectors,... # - functions # - model formulas # - models # # Here is how it works: acct <- read.table("accounting.dat") # toy dataset lm(Mkt.Rate ~ Acct.Rate, data=acct) # computes linear model ('lm') # ^^^^^^^^ ^^^^^^^^^ refer to columns in 'acct' (a dataframe) # # Both formulas and models are "first class citizens": # * assigning the formulat to a variable: acct.formula <- Mkt.Rate ~ Acct.Rate # This formula expresses something like 'Mkt.Rate on/vs Acct.Rate'. # * assigning the model to a variable acct.model <- lm(acct.formula, data=acct) # The terms inside the formula are interpreted as column names of 'data=...'. # # Printing a model is uninformative: acct.model # Informative is applying the 'summary()' function # which produces conventional model summaries: summary(acct.model) # # MEANING: # Formulas are mere symbolic representations. # They start referring to something only when passed into a function: y ~ x1 + x2 # formula # y = response (does NOT have to exist; will be interpreted inside functions) # x1, x2 = predictors (dito) # + = inclusion in model (NOT addition) gaga.dat <- data.frame(x1=runif(20), x2=runif(20), y=rnorm(20)) # toy data lm(y~x1+x2, gaga.dat) # default order of arguments lm(data=gaga.dat, formula=y~x1+x2) # dito; args out of order need to be named # # SEMANTICS: # - The preferred usage is by providing a dataframe: lm(y ~ x1, gaga.dat) # (not a matrix; use 'as.data.frame(mat)' if necessary). # - Formula variables can also refer to global entities: lm(gaga.dat[,"y"] ~ gaga.dat[,"x1"]) yy <- gaga.dat[,"y"]; xx <- gaga.dat[,"x1"] lm(yy ~ xx) # But this use is generally discouraged. # # RULE: Almost all fitting functions in R and S (lm, glm, gam, tree,...) # require a model formula. Even plot can be used this way: plot(y ~ x1, gaga.dat, pch=16) # # COERCING FORMULAS: as.character(y ~ x1 + x2) # formula -> string as.formula("y ~ x1 + x2") # string -> formula # constructing a formula by pasting strings: charvec <- c("~","y","x1","+","x2") as.formula(paste(charvec[c(2,1,3:5)], collapse=" ")) # see help(paste) as.formula(paste(charvec[c(2,1,3:5)], collapse="")) # no difference # # DOCUMENTATION: do read this page!! help(formula) # although much of this documentation will only later become clear. # #### # # MODELS: # tips.dat <- read.table("Data/tips.dat", header=T) tips.model <- lm(TIP ~ ., tips.dat) # Operations on models: # - numerical summaries: summary(tips.model) # - graphical diagnostics: more on these later par(mfrow=c(2,2), mgp=c(1.8,.5,0), mar=c(3,3,2,2)) plot(tips.model, pch=16, cex=.8) # - Other model-related functions are: coef(tips.model) fitted.values(tips.model) # or fitted() residuals(tips.model) # or resid() deviance(tips.model) # RSS anova(tips.model) names(lm.influence(tips.model)) # Internals of models: attributes(tips.model) #------------------ output begin $names [1] "coefficients" "residuals" "effects" "rank" [5] "fitted.values" "assign" "qr" "df.residual" [9] "xlevels" "call" "terms" "model" $class [1] "lm" #------------------ output end # The "names" attribute points you to all kinds of interesting internals: tips.model$coefficients # same as "coef(tips.model)": regr coeffs tips.model$fitted.values # same as "fitted(tips.model)": vec of fitted vals tips.model$residuals # same as "resid(tips.model)": residual vector tips.model$call # the call to lm that created this model tips.model$rank # generally p+1 (p=number of variables) tips.model$df.residual # degrees of freedom in residuals, usually N-p-1 tips.model$xlevels # list of levels for categorical predictors tips.model$effects # same as "effects(tips.model)": # successive orth. effects tips.model$terms # same as "terms(tips.model)", # a disection of the formula tips.model$model # the data and (hidden in attributes) model info attributes(tips.model$model) tips.model$qr # the QR decomposition of the design matrix tips.model$assign # ?? # We won't need to understand the second half. # We see, though, that just about everything one needs for regression analysis # is stored away inside a model. # # #### # # INTERCEPTS IN MODEL FORMULAS: # # The intercept term is included by default. Therefore y ~ x # really means y ~ x + 1 # That is, the model is in conventional math notation: # y = a*x + b*1 + error # The intercept term can be removed with either of y ~ x + 0 y ~ x - 1 # Indeed: x <- runif(10); y <- x + rnorm(10) summary(lm(y ~ x + 0)) summary(lm(y ~ x - 1)) # are the same: regressions through the origin, "y = a*x + error". # In general, "-" means removal of a term: y ~ x1 + x2 - x1 # is the same as y ~ x2 # This is most useful combined with '~.': summary(lm(TIP ~ . - SMOKER, tips.dat)) # # #### # # MODEL MATRICES: # # The point of the model formalism is to spare users the explicit # construction of predictor matrices. # Least Squares regression minimizes the residual sum of squares # sum((y - X %*% b)^2) # over all possible coefficient vectors "b". In lin. alg. notation: # RSS(b) = || y-Xb ||^2 # where # "y" is the response vector of size nx1. # "X" is a matrix of size nx(p+1). # It contains constants 1 in the first columns and # the predictor vectors in the subsequent columns. # "b" is the coefficient vector of size "(p+1)x1". # "X" has several names: model matrix (R), predictor matrix, design matrix, # matrix of independent variables, carrier matrix (Tukey),... # # Extraction of model matrix: model.matrix(tips.model) # The first column is indeed a column of ones. # If we force the intercept out, the constant column disappears: x1 <- runif(10); x2 <- runif(10); y <- x1+x2+rnorm(10) model.matrix(lm(y ~ x1 + x2)) model.matrix(lm(y ~ x1 + x2 - 1)) # # Let us check whether the residual sum of squares (RSS) above # is in fact what it should be: y <- tips.dat[,"TIP"] X <- model.matrix(tips.model) # the model matrix b <- coef(tips.model) # the regression coefficients sum((y - X %*% b)^2) # hand-computed RSS # and indeed: deviance(tips.model) # the RSS based on "lm" # # #### # # INTERACTIONS: # x1 <- runif(10); x2 <- runif(10); y <- x1+x2+rnorm(10) # toy data # An interaction model: x1, x2, x1*x2 m1 <- model.matrix(lm(y ~ x1 * x2)) m2 <- model.matrix(lm(y ~ x1 + x2 + x1:x2)) m3 <- model.matrix(lm(y ~ (x1 + x2)^2)) m4 <- model.matrix(lm(y ~ x1 + x2 + I(x1*x2))) # These are all the same: for(i in 1:4) print(cbind(m1[,i], m2[,i], m3[,i], m4[,i])[1:5,]) # #------------------------- end of class 16 --------------------------------------- # CAUTIONS: # # - The following does not produce a polynomial model in x1: model.matrix(lm(y ~ x1 * x1))[1:5,] model.matrix(lm(y ~ x1^2))[1:5,] # This achieves a square as intended: model.matrix(lm(y ~ x1 + I(x1^2)))[1:5,] # 'x*x' and 'x^2' are the same as "x", # which illustrates the difference between the conventional meaning # of "*" and "^" and their meaning in model formulas. # If you want polynomials, use the function 'poly()' which creates # orthogonal polynomial terms of arbitrary degree: x <- 1:100; xpoly <- poly(x,degree=4) plot(range(x), range(xpoly), type="n", xlab="", ylab="") for(i in 1:4) lines(x,xpoly[,i]) # typical look of orthogonal polynomials summary(lm(y ~ poly(x1,3))) # recall y is not related to x1... # # - Recall: The model meanings of +,-,*,^,:,. hold only on the RIGHT SIDE OF "~". # On the response side, they have the mathematical meaning. # # #### # # ANOVA: REGRESSION WITH CATEGORICAL PREDICTORS # # One of the strengths of model formulas is that they give a # compact notation for categorical predictors, # and hence ANOVA (=Analysis of Variance), and accompanying complications. # An artificial data example: x <- factor(sample(c("A","B","C"), length(y), replace=T)) x # So this is a factor (=categorial variable) with levels 6, 7, 8. # Regression on a factor is meaningful: summary(lm(y ~ x)) #---------------- output begin .... Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.355553 0.403206 0.882 0.407 xB 0.001159 0.615907 0.002 0.999 xC 0.639553 0.615907 1.038 0.334 Residual standard error: 0.8064 on 7 degrees of freedom Multiple R-Squared: 0.1585, Adjusted R-squared: -0.08188 F-statistic: 0.6594 on 2 and 7 DF, p-value: 0.5465 #---------------- output end # What happened here? # A look at the model matrix gives the answer: model.matrix(lm(y ~ x)) # "lm" transforms the predictor of type factor into a matrix of # so-called dummy variables, but only for "B" and "C", not "A". # Why not "A"? Because if we introduced three dummies, we'd have # "confounding with the constant". # We explain with a little demonstration: # We force the constant 1 and the all three dummies into a predictor matrix: X <- cbind(1, model.matrix(lm(y ~ x + 0))) X # Now we observe that coefficient vectors such as bu <- c(2,3,4,5) bv <- c(1,4,5,6) # produce the same fits: cbind( X%*%bu, X%*%bv ) # In general, coefficient vectors b = (b0-c, b1+c, b2+c, b3+c) # will produce the same fits for all values of c. # Here are equivalent formulations for this same fact: # - "X" is singular, or equivalently: not of full rank. # - The columns of "X" are collinear. # - The constant is confounded with the dummies. # - The intercept and the coefficients of the dummies are not identifiable. # # If we try to use "X" for regression: summary(lm(y ~ X + 0)) # we find that the last column is effectively removed: #------------------------------- output begin ... Coefficients: (1 not defined because of singularities) Estimate Std. Error t value Pr(>|t|) X 0.9951 0.4656 2.137 0.0699 . XxA -0.6396 0.6159 -1.038 0.3336 XxB -0.6384 0.6584 -0.970 0.3646 XxC NA NA NA NA --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.8064 on 7 degrees of freedom Multiple R-Squared: 0.4587, Adjusted R-squared: 0.2268 F-statistic: 1.978 on 3 and 7 DF, p-value: 0.206 #------------------------------- output end # # How do we interpret the procedure actually used, which removes the # dummy of the first category? model.matrix(lm(y ~ x)) # The answer is that among all possible coefficient vectors # b = (b0-c, b1+c, b2+c, b3+c), # removal of the first dummy amounts to picking 'c' such that b1+c=0. # That is, the coefficients are of the form # b'= ( b0', 0, b2', b3'). # One says that the first category is used as the reference category. # # One can choose 'c' so other interpretations are possible, # as with b0-c=0 instead: # b"= ( 0, b1", b2", b3") # which amounts to removing the intercept term, which is what # the formula "y ~ x + 0" does. # # Moving back and forth between equivalent sets of regression # coefficients in the presence of complete confounding # is called "equivalent reparametrization". #### # # MORE ON REPARAMETRIZATION AND CHANGE OF BASIS IN PREDICTOR SPACE: # # A neat example, not ANOVA, but another type of "equivalent reparametrization": # The example is due to Sam Hui. # Sam wanted to model a 'kink', that is, two linear pieces stitched together. # One way to generate such a kink at location '.2', say, is to use # the following basis functions: summary(lm(y~ pmax(x1-.2,0) + pmax(-x1+.2,0))) #---- Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.3904 0.5113 0.764 0.470 pmax(x1 - 0.2, 0) -0.4160 2.5201 -0.165 0.874 pmax(-x1 + 0.2, 0) 4.2280 4.1596 1.016 0.343 Residual standard error: 0.7732 on 7 degrees of freedom Multiple R-Squared: 0.2265, Adjusted R-squared: 0.005502 F-statistic: 1.025 on 2 and 7 DF, p-value: 0.407 #---- # But this is not convenient for testing whether there is a kink or not. # Better is using the following equivalent basis: summary(lm(y~ x1 + abs(x1-.2))) #---- Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.8548 0.5099 1.676 0.138 x1 -2.3220 1.6358 -1.420 0.199 abs(x1 - 0.2) 1.9060 3.0250 0.630 0.549 Residual standard error: 0.7732 on 7 degrees of freedom Multiple R-Squared: 0.2265, Adjusted R-squared: 0.005502 F-statistic: 1.025 on 2 and 7 DF, p-value: 0.407 #---- # Note that all performance measures are identical! # These models are in fact identical and differ only in a reparametrization. # To see this, note the following: pmax(x1-.2,0)+pmax(-x1+.2,0) == abs(x1-.2) pmax(x1-.2,0)-pmax(-x1+.2,0) == x1-.2 # So, yes, the two pmax functions create the same fits as the abs function # and the raw x1. This identity also shows you how the coefficients # translate from one basis to the other. # # #### # # A REGRESSION PRINCIPLE: ADJUSTED VARIABLES # # Definition: # # Adjusting a variable x2 for another variable x1 means # taking the residuals of the regression # with response x2 and predictor x1. # # The residuals of x2 from the regression on x1 contain # the variation of x2 that is 'not explained by x1'. # # Frequent language: # "there was an/no effect of ... when adjusting for life style" # # Synonymous: # - accounting for ... # - controlling for ... # - correcting for ... # - allowing for ... # - eliminating/taking out/removing the effect of ... # - holding fixed ... # # Example: # In the original Boston Housing data the main interest was in NOX. # We can adjust both NOX and the response MEDV for all other variables: names(boston) # recall variables: NOX is 5, MEDV is 14 nox.adj <- resid(lm(NOX ~ ., boston[,-14])) medv.adj <- resid(lm(MEDV ~ .,boston[,-5])) # The idea is to examine what the 'true relation' is between the variables # of interest by adjusting for effects due to other variables. # # Here are the plots of MEDV vs NOX with and without adjustment: windows(width=7, height=14) par(mfrow=c(2,1), mgp=c(2,.5,0), mar=c(3,3,1,1)) plot(x=nox.adj, y=medv.adj, pch=16, cex=.5) # adjusted lines(smooth(cbind(nox.adj, medv.adj)), lwd=2) plot(boston[,c("NOX","MEDV")], pch=16, cex=.5) # unadjusted for comparison lines(smooth(boston[,c("NOX","MEDV")]), lwd=2) # The first plot is the "adjusted variables plot". # The second plot is the "marginal plot". # # The simple regression coefficient of adjusted variables is: summary(lm(medv.adj ~ nox.adj))$coeff["nox.adj",] #------------------ output begin Estimate Std. Error t value Pr(>|t|) -1.776661e+01 3.773997e+00 -4.707638e+00 3.243214e-06 #------------------ output end # Compare with the NOX coefficient of the full regression: summary(lm(MEDV ~ ., data=boston))$coeff["NOX",] #------------------ output begin Estimate Std. Error t value Pr(>|t|) -1.776661e+01 3.819744e+00 -4.651257e+00 4.245644e-06 #------------------ output end # And compare further with the case when MEDV is not adjusted but NOX is: summary(lm(boston[,"MEDV"] ~ nox.adj))$coeff["nox.adj",] #------------------ output begin Estimate Std. Error t value Pr(>|t|) -17.76661123 7.36820271 -2.41125440 0.01625442 #------------------ output end # # The same coefficient in all three cases!!! (But not the same inference...!) # This is a general principle: # # A MULTIPLE REGRESSION COEFFICIENT IS # THE SIMPLE REGRESSION COEFFICIENT OF # THE RESPONSE ON THE PREDICTOR # ADJUSTED FOR ALL OTHER PREDICTORS. # # If you can't remember this formulation in detail, # you MUST remember at least its corollary: # # THE MULTIPLE REGRESSION COEFFICIENT OF ANY PREDICTOR # DEPENDS ON # WHAT THE OTHER PREDICTORS IN THE REGRESSION ARE. # We now delve into the algebra of adjustment. # Some commonly used notation: # # y.1 = y adjusted for x1, # y.12 = y adjusted for x1 AND x2 in a multiple regression, # x2.1 = x2 adjusted for x1. # # Equivalent notation: y.x = y adjusted for x. Hence: y.1 = y.x1 # The predictors x1 and x2 can be blocks of predictors each: # x1=(x11,x12,x13,...), x2=(x21,x22,x23,...) # The notation is still meaningful, and the following facts still hold. # FACT: # In the decomposition of the response into fit and residual, # y = b1*x1 + b2*x2 + res, (b1, b2 = LS estimates) # res is y adjusted for x1 and x2: res = y.12 # FACT: # (y.1).2 != y.12 # The former is the result of two simple linear regressions, # the latter the result of a bivariate regression onto 2 predictors. # (However, (((((((y.1).2).1).2).1)... converges to y.12: "backfitting") # FACT: # Adjustment is a linear operation. # (y+z).1 = y.1 + z.1 (c*y).1 = c*y.1 # FACT: # x1.1 = 0, x2.2 = 0, x.x = 0, .... # FACT: # One cannot adjust for a predictor more than once. # y.11 = y.1 # y.12 = (y.12).1 = (y.12).2 # res = res.1 = y.121 = res.2 = y.122 = res.12 = y.1212 = ... # (This is the projection property of Least Squares: idempotence.) # FACT: # Adjusting y for a constant means centering y. # x1=c ==> y.1 = y - mean(y) # FACT: # Adjusting y for a categorical variable x means centering y # in each category separately with the category mean. # NOTATION: We write for empirical covariance without centering # and ||y|| for empirical standard deviation without centering # = sum(y*x)/N (ignore details such as .../(N-1)) # ||y||^2 = sum(y^2)/N # This is to distinguish them from theoretical covariance and variance. # # We can express the simple regression coefficient and correlation as # b1 = / ||x||^2 (except for centering) # cor(y,x) = / (||y||*||x||) (except for centering) # FACT: # Residuals are uncorrelated with the predictors: # = 0 # Intuitively: the point of adjustment is to arrive # at something that is free of, that is, uncorrelated with x1. # FACT: # For any two variables y and z: # = = # This is a consequence of the previous fact because # y.1 and z.1 are residuals wrt x1: # y = a*x1 + y.1, z = b*x1 + z.1 # FACT: if y = b1*x1 + b2*x2 + y.12 # then b1 is the simple regression coeff of y.2 regressed on x1.2: # y.2 = b1*x1.2 + y.12 (=> Leverage Plot!!) # FACT: # The simple regression coefficient of y on x2.1 is the # same as that of y.1 on x2.1: # b2 = / ||x2.1||^2 = / ||x2.1||^2 # That is, to obtain the multiple regression coefficient, # one must ADJUST THE PREDICTOR, # while adjusting the response is optional. # FACT: # A regression coefficient of x1 does NOT depend on x2 iff # = 0 ("Orthogonal design") # FACT: # Variance can only decrease under adjustment: # ||y||^2 >= ||y.1||^2 >= ||y.12||^2 >= ... # Normalized by var(y) and subtracted from 1: R^2 # 1 = R^2(y ~ 1) <= R^2(y~x1) <= R^2(y~x1+x2) <= ... # R^2 can only increase as the model increases. # FACT: # If y = b*x + y.x, then # ||y||^2 = ||b* x||^2 + ||y.x||^2 (ANOVA decomposition) # FACT: # stderr(b2) = sigma / ||x2.1|| # where sigma^2 is the variance of the error term. #### # # THE ROLE OF THE ERROR TERM # # We start with some general thoughts on regression fitting. # First, let's take a down-to-earth point of view, best formulated # by George Box (of Box-Cox): # # "ALL MODELS ARE WRONG, BUT SOME ARE USEFUL." # # Take a coin toss for comparison: # We understand the deterministic physics of the process, but it's # useless to describe it with the differential equations of # Newtonian physics; it's more useful to think of it as a # Bernoulli random variable with approximate probability 1/2 # for either outcome. The Bernoulli model is wrong but useful. # (A contributor to the uselessness of physics is chaotic behavior: # small differences in initial conditions lead to drastically # different states down the line.) # # There is a fair degree of rhetoric in Box' formulation, # but there is enough truth in it to live by it. # Statistical models are always wrong because nature or whatever # generates the data does not do it by a formula such as # # y = b0 + b1*x1 + ... + bp*xp + random error # # The reasons: # # * The functional part, b0 + b1*x1 + ... + bp*xp, is an # approximation which can be guessed (i.e., modeled and estimated) # as well as the quality of the data and the sample size permit. # But this kind of linear term is usually not a law in the sense # of physics such as "F = a * m" (or equivalently # "log(F) = log(a) + log(m)" to make it look linear). # The use of linear expressions is sometimes justified by # saying that the true dependence is probably a smooth function # of the predictors, and a linear function may be a reasonable # first order Taylor approximation if the predictor range is # small enough. # # * We think of the "error" as the variability that is not explained # by the functional part. We model the error as random numbers # with a certain structure: centered at zero, preferably symmetric # (i.e., + and - values of the same magnitude are equally likely), # hopefully even approximately normal. # But in a strict sense this is nonsense: # "Error" stems from missing predictors that may be hard to # observe or too complex to disentagle. # # In most data situations in which we use regression, # we may have only a dubious idea of what information is missing, # or what the complexities are that we are lumping into the error term. # But we hope that the missing information is composed of lots of # small contributions (small compared to the predictors we have in hand), # each too complex and each too piddly to account for. # The collective effect of these contributions would then add up to # something that may well be described as normally distributed, # using the central limit theorem as a guiding heuristic. # With these considerations in mind, we approach the question of # regression diagnostics. Diagnostics are techniques that establish # whether # # A MODEL IS ADEQUATE. # # Note that we avoid the term "correct". There is rarely a "correct" model, # although most of statistical theory assumes it. # # The importance of model adequacy is that without it, # statistical inference is distorted, biased, or plain invalid. # # [A dilemma arises: # As we will see, diagnostics require a certain amount of data snooping, # which in turn may invalidate statistical inference in the strict sense. # This is because diagnostics usually lead to improvements in the model, # but, having derived the improved model by looking at the data, # it is then impossible to take statistical inference at face value. # Do you see why this is a problem for statistical inference? # Here is why: # Statistical inference is the assertion of probabilistic statements # about data, based on certain modeling assumptions. # So what does it mean, for example, that a predictor is significant # at the 5% level? # As always, you should think in terms of hypothetical replications # of our data situation: if 1,000,000 independent researchers were # each faced with data like ours, and if the predictor were worthless, # fewer than 50,000 would conclude that the predictor is important. # Recall, though, that the regression coefficient of the predictor # depends on what the other predictor in the model are. # The problem is that after improving a model by looking at the data, # many of the million researchers might not arrive at our improved model # because their model improvements (if any) might look different from ours. # Consequently their test results would be different from ours and # most likely not even comparable to ours. # This is when it is time to give up on the idea of statistical inference # and think of significance levels and p-values as guideposts, # as opposed to probabilistic conclusions. # Note that this does not mean that we should not perform diagnostics: # without diagnostics we'd be performing statistical inference based # on flawed assumptions; we just wouldn't know about it. # As John Tukey once said, in a rhetorical formulation comparable to # George Box': # # IT IS BETTER TO BE APPROXIMATELY RIGHT THAN EXACTLY WRONG. # # It is better to produce diagnosed guideposts than illusory inference.] # # #### # # REGRESSION DIAGNOSTICS 1: QUALITY OF FITS -- MORE GENERALITIES # # Following the preceding considerations, we will say that the quality of # a regression fit is unsatifactory if there is evidence that the errors # are NOT due to many small additive contributions. # Because many small additive contributions would have to add up to # approximately normally distributed errors (central limit theorem, c.l.t.), # we can take any evidence of non-normal errors as indication of # the existence of a not-so-small contribution that is not part of the model. # One goal of regression diagnostics is pinning down the nature of # # LARGE CONTRIBUTIONS TO THE ERROR TERM # # Large unexplained contributions can take several forms, and some of # them are more easily detected and remedied than others. # Here are the easy forms first: # # 1) Inadequate functional form of the systematic part of the model: # The available predictors are adequate but the systematic effects # are not linear in the predictors: # a) A nonlinear transformation of the response and/or the # predictors is necessary to achieve a good fit. # b) Some interaction terms should be added, for example, some products. # c) The model assumes smoothness but the actual response surface # has kinks or even jumps. # Overall, this type of problem can be solved with sufficient # data analytic and modeling skills based on # - graphical examination of residuals and # - trying out many models with promising transformations # and additional model terms. # # The following are more problematic forms of inadequate fit because # the predictors do not contain the information for a good fit: # # 2) ROBUSTNESS: # A majority of the data might be reasonably modeled with the # available predictors, but there's a minority of points that can't be fit. # Technical solution: "robust regression". # A method is robust if a minority of outlying points # does not upset a good fit to the majority (e.g., median vs mean). # Robustness is usually achieved by replacing the least squares criterion # with other criteria that are less sensitive to large residuals. # An example is L1 regression which minimizes the sum of absolute residuals: # L1(b) = sum(abs(y-X%*%b)) # as opposed L2 (= least squares) regression which minimizes # the sum of squared residuals: # L2(b) = sum((y-X%*%b)^2) # A family of intermediate regression estimators is given by Huber's # M-estimators which use squares for small residuals (abs(r)=threshold). # Lhub(b) = sum(ifelse(abs(y-X%*%b)<1, (y-X%*%b)^2, 2*abs(y-X%*%b)) # The threshold parameter (here chosen as 1) determines the degree of # robustness: # threshold = infinity -> L2 or least squares, least robust # threshold = zero -> L1, most robust # This form of robustnes protects only against outlying residuals; # additional robustness is available against outlying predictors, # but this gets complicated. # # Robust regression might isolate poorly modeled subsets of the data. # If the subset is clearly identified and well-understood, # an indicator variable for the subset in the present model or # a separate model might be a solution. # # Still more problematic is the following situation because there # are few technical solutions: # # 3) A MISSING MAJOR PREDICTOR: # The problem is that the missing predictor might show its effect # spread out over all the data, which means that it will be impossible # to confine the lack of fit to a subset of the data. # Still, there occasionally exists additional information that # is available but has not been included in the model, and # this information may have clues about what is missing in the model. # Two examples: # # - If the data is collected in time order, and if there exists # a strong missing predictor that varies slowly over time, # we would see correlations between observations that are # adjacent in time. # One could then use the time order of the data as an additional # predictor; it would be a proxy for the missing predictor. # It would help, though, if there existed an explanation for the # missing predictor. # # - If the data has a spatial aspect, as in the Boston Housing data, # one should check for spatial coherence, that is, slow spatial # variation. Like time order in the previous example, # spatial order could be a proxy for unaccounted predictors. # # An example of time order diagnostics will be given below. #### # # REGRESSION DIAGNOSTICS 2: QUALITY OF FITS -- RESIDUAL PLOTS # # We use a plain linear model for the Boston Housing data for illustration: boston.lm <- lm(MEDV ~ ., data=boston) summary(boston.lm) #--------------------------- output begin Estimate Std. Error t value Pr(>|t|) (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 *** CRIM -1.080e-01 3.286e-02 -3.287 0.001087 ** ZN 4.642e-02 1.373e-02 3.382 0.000778 *** INDUS 2.056e-02 6.150e-02 0.334 0.738288 CHAS 2.687e+00 8.616e-01 3.118 0.001925 ** NOX -1.777e+01 3.820e+00 -4.651 4.25e-06 *** RM 3.810e+00 4.179e-01 9.116 < 2e-16 *** AGE 6.922e-04 1.321e-02 0.052 0.958229 DIS -1.476e+00 1.995e-01 -7.398 6.01e-13 *** RAD 3.060e-01 6.635e-02 4.613 5.07e-06 *** TAX -1.233e-02 3.760e-03 -3.280 0.001112 ** PTRATIO -9.527e-01 1.308e-01 -7.283 1.31e-12 *** B 9.312e-03 2.686e-03 3.467 0.000573 *** LSTAT -5.248e-01 5.072e-02 -10.347 < 2e-16 *** --- Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 _ 1 Residual standard error: 4.745 on 492 degrees of freedom Multiple R-Squared: 0.7406, Adjusted R-squared: 0.7338 F-statistic: 108.1 on 13 and 492 DF, p-value: 0 #--------------------------- output end # # R provides four standard diagnostics plots in one function: windows(width=12, height=12) par(mfrow=c(2,2), mgp=c(1.5,0.5,0), mar=c(3,3,2,1)) plot(boston.lm, pch=16, cex=1, cex.lab=1.5, cex.id=1) # There are some niceties: extreme cases are labeled, such as here # some census tracts in Boston proper who just don't fit well. # # (1) The top left frame shows raw residuals plotted against fitted values. # Q: Why plotting against the fitted values? # A: The vector of fitted values is uncorrelated with the residual vector, # so we should see an unstructured plot. If we see structure, # we can describe it in terms of high and low estimated response values. # Suspicious: The group of points on the right that nearly fall # on a straight line, shooting up toward Boston: # clearly a subset one should single out for examination. # I seem to remember that Boston Backbay and Beacon Hill are part of them, # a ritzy area where homes are small but expensive. # The data do not account for this effect: we have here an example of # a MISSING PREDICTOR, but one that can be traced to a peculiar subset. # The misfit of the few Boston proper census tracts is so severe that # the majority of the residuals is arranged in a crescent pattern below # and around the misfits.. # # (2) The top right frame shows a q-q plot of the standardized residuals. # If the fit were good, the points would fall near a straight line. # As it stands, the residuals are skewed with a heavy tail for # positive values. # # (3) The bottom left frame shows horizontally the fits, as in the top # left frame, but the vertical axis shows a transformation of the raw # residuals. The transformation steps are as follows: # 1. Standardization to equal variance: # we haven't discussed variance/covariance properties of residuals; # let's just note that residuals generally do not have equal variance, # but that the variance is known analytically, # hence standardization is possible. # 2. Studentization: dividing by the unbiased estimate "s" of the error stdev. # 3. Taking absolute values of the standardized, studentized residuals: # The point is to examine the residuals for "heteroscedasticity", # that is, non-constant error variance. # 4. Taking square roots of 3.: according to "help(plot.lm)" this reduces # the skewness of the absolute residuals. # Unfortunately, authors do not agree on what to call the residuals after # standardization to equal variance and division by the error sdev. # The authors of the R function call them "standardized residuals", # but Myers (sec. 5.3) calls them "studentized residuals". # # More important is an issue of graphical presentation: # It would be better to draw a smooth through this plot in order to see # whether the variance is constant from left to right. # If you download the file "plot-lm.R" from the STAT-541 site and say source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/plot-lm.R") source("functions/plot-lm.R") # you will have a modification of the function "plot.lm" that draws a smooth: windows(width=12, height=12) par(mfrow=c(2,2), mgp=c(1.5,0.5,0), mar=c(3,3,2,1)) plot.lm.x(boston.lm, pch=16, cex=.5, cex.lab=1, cex.id=.8) # The picture simply confirms the crescent pattern caused by the odd census # tracts in Boston proper; it would be mistake, though, to interpret this # as "heteroscedasticity". (Why?) # # For the Boston Housing data I prefer the raw residuals because # they show best the problems with the odd census tracts in Boston proper. # # (4) Finally, the bottom right frame shows a plot of the so-called Cook # distances, measures of outlyingness for each case in predictor space; see help(cooks.distance) # It belongs among influence diagnostics (see below). # # At any rate, the three relevant plots all indicate that the fits # leave something to be desired. # #### # # Missing diagnostic: plot of residuals versus order of the data # Why? Order is often a proxy for missing predictors. # For example, data are often collected in a time or spatial order, # and when you hear "time" or "space" you should always think "trend"... # Examples: # - baking experiment at a bread factory: # interest is in the effect of ingredients, # but back-front placement in the oven mattered # also unbeknownst to experimentors # - refinery: # presence of a specific employee prevented production problems # - collection of data in a survey: # hidden factors could be # interviews morning vs evening # interviewer # # Such "lurking" variables can often be detected with a # plot of residuals against order. # #### # # The problem with the above diagnostics plots is that we don't have a # standard to compare them with: # - are the present plots acceptable? # - what would the residual plots look like if the model were adequate # and the errors were normally distributed? # To answer these questions, I propose "null comparisons" for the plots: # we generate nice looking residuals that would be obtained # if the model were true, that is, the data were of the form # # model + normally distributed errors. # # This can be done with four R functions in the file "residual-comparison.R" # which you should download and source into R: source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/residual-comparison.R") source("functions/residual-comparison.R") # The first function, "null.residuals(model.lm)", constructs residuals by # fitting the model to normal random numbers and: # # scaling the residual vector to the same overall length # as the residual vector of the actual data. # # (Think about why we don't have to play parametric bootstrap for these # pictures, that is, why we don't have to generate a fit, patch on # normal errors; why instead we can fit the errors w/o the fits.) # # We then show # 1) residuals versus fits, # 2) residuals versus normal quantiles, and # 3) residuals versus order (see above regarding missing predictors). # # The three functions generate these plots for the actual data # AND for 5 sets of well-behaved random residuals. # We set up a sufficiently large window and place the residual plots in it: res.vs.fits(boston.lm) # execute one statement at a time res.vs.qnorm(boston.lm) res.vs.order(boston.lm) # The actual residuals are shown in the last plot, for a reason: # after plotting, each function goes into an input loop of the # "identify()" function to permit interactive labeling of points. # This is more flexible than the automatic labeling of "plot.lm()". #---------------------- end of class 17 ------------------------------------------ # # The plots confirm that the fits leave something to be desired. # The comparisons with well-behaved residuals add an inferential # dimension to the diagnostic plots. We can now learn what is truly # odd in the regression fit and what is within the natural variability # for a given dataset. # # It seems to me the problem with fitting linear models to the Boston # Housing data is not so much with fitting the right equation, # but with the heterogeneity of the data; # the data have a clear story to tell (see the analysis based on trees), # but they are really not the stuff for linear models. # # A diagnostic of particular interest is the plot of residuals vs order: # the order seems to be correlated with space/geography, # most likely nearby census tracts tend to be nearby in the data order. # This indicates that the predictors are highly "incomplete" in the # sense that it would be possible to find more predictors that also # plausibly affect housing values. # #### # # Literature: # A detailed analysis of regression fits to the Boston Housing data is in # the classic book by Belsley, Kuh, and Welsch that started the topic of # regression diagnostics. # # #### # # REGRESSION DIAGNOSTICS 3: INFLUENCE # # "Influence" refers to the strength with which a case or group of cases # determines fits and regression coefficients. # There exist many measures of influence. A simple one is "self-influence", # or the strength with which a case determines its own fit. # The quantities that measure self-influence are the diagonal entries # of the so-called "hat matrix" H = X (X^t X)^(-1) X^t # (one of Tukey's many coinages: the matrix that puts the "hat" on y). # In R language: # yhat <- H %*% y # where # H <- X %*% solve(t(X) %*% X) %*% t(X) # (although matrix inversion (with "solve()") is not the recommended way to # compute least squares fitting). # For concreteness, we compute these things for once by hand: X <- cbind(1,as.matrix(boston[,-14])) y <-boston[,"MEDV"] N <- nrow(X); p1 <- ncol(X) b <- solve(t(X) %*% X) %*% t(X) %*% y H <- X %*% solve(t(X) %*% X) %*% t(X) # hat matrix, orth projection in N-space A <- diag(N) - H # adjustment operator, creates residuals yhat <- H %*% y res <- A %*% y # (Again, this is NOT how good software computes LS! It uses a Q-R decomp., e.g.) # Compare with the results of 'lm()': cbind(resid(boston.lm),res)[1:5,] cbind(fitted(boston.lm),yhat)[1:5,] cbind(coef(boston.lm),b) # Identical. So we 'proved' the textbook formulas to be true for the # Boston Housing data. You can also 'prove' properties such as # symmetry: max(abs(H - t(H))) # idempotence: max(abs(H - H%*%H)) max(abs(A - A%*%A)) # orthogonality of the spaces of fits and residuals: max(abs(A %*% H)) # the dimensionality of the space of fits = degrees of freedom: sum(diag(H)) # dim of space of fits = trace of H sum(diag(A)) # dim of space of residuals = trace of A # # The main point is to get an understanding of self-influence: i <- 10 c(yhat[i], H[,i] %*% y) # The element H[i,i] reflects how much y[i] determines yhat[i]. # Therefore: # # diag(H) contains measures of self-influence of all points. # # These values can be obtained with 'lm.influence()': cbind(lm.influence(boston.lm)$hat, diag(H))[1:5,] # See help(lm.influence) names(lm.influence(boston.lm)) # We plot the diagonal of the hat matrix against all predictors: windows(width=16, height=10) par(mfrow=c(3,5), mgp=c(1.5,0.5,0), mar=c(2.5,2,1,1)) for(i in 1:14) plot(boston[,i], diag(H), cex=0.5, pch=16, xlab=names(boston)[i], ylab="Self-Influence") # We see that high Crime has a number of points with high self-influence. # What are these points? rownames(boston)[boston[,"CRIM"]>35] #------------------------------ output begin [1] "381__Boston" "399__Boston" "405__Boston" "406__Boston" "411__Boston" [6] "415__Boston" "419__Boston" "428__Boston" #------------------------------ output end # Boston proper, but no further details. # # R has a whole suite of functions for influence diagnostics. # Look up the one help page for all of "influence.measures()", "rstandard()", # "rstudent()", "dffits()", "dfbetas()", "covratio()", "cooks.distance()". # # Another influence diagnostic is the adjusted variables plot or # partial residual plot or leverage plot: source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/leverage-plots.R") source("functions/leverage-plots.R") leverage.plots(lm(MEDV ~ ., data=boston), cex=.5) leverage.plots(lm(MEDV ~ ., data=boston), cex=.5, ident=1:13) # null residual comparison, one per predictor, similar to res.vs.predictors()] # # We had looked at this earlier when we talked about adjustment: # # The slope we see is what the multiple regression sees!! # # These are influence plots in the sense that they show what observations # have strong influence on a given predictor. # The first plot for CRIME shows that the crime coefficient is determined # by a handful of census tracts. Surprisingly the coefficient for LSTAT, # which we know is important, also has a leverage problem. # If we were serious, we'd want to find out what the points on the left # of that plot are. # IMPORTANT: Influence is on a per-predictor basis # A point is influential for the slope of a specific predictor, # but this same slope is in general not influential for other slopes. # Here is with the identification option to identify leverage points leverage.plots(lm(MEDV ~ ., data=boston), cex=.5, ident=1:13) #### # # REGRESSION DIAGNOSTICS 4: TRANSFORMATIONS # # Often an unsatisfactory fit can be improved with some simple transformations # of the response and/or the predictors. The Box-Cox transformation # was indeed invented for response transformation. Box and Cox # estimate the power parameter with Maximum Likelihood, but that has # been shown to be problematic: the parameter is not well "identifiable", # that is, a large range of parameter values often give an almost # indistinguishable quality of the fits. The recommendation is then # to just try a few powers with relatively round numbers, such as: # 1 which leaves the response untransformed, # 2 for convex transformation, # 0 for the log, the most frequent concave transformation, # 0.5 in case the log was too strong # -1 in case the log was too weak. # It is more satisfactory to see one of these values in a fitted equation # than a power -0.8766, for example, which would be hard to justify to # an inquisitive audience. Here is an example of stepping through the # powers 1.0, 0.5, 0.0, -0.5, and -1.0: for(pow in c(1.0, 0.5, 0.0, -0.5, -1.0)) cat("power =",pow,":\n R^2 =", summary(lm(box.cox(MEDV, pow) ~ ., data=boston))["r.squared"][[1]],"\n") #--------------------------- output start power = 1 : R^2 = 0.7406427 power = 0.5 : R^2 = 0.7757701 power = 0 : R^2 = 0.7896407 power = -0.5 : R^2 = 0.7743901 power = -1 : R^2 = 0.7286592 #--------------------------- output end # So we'd choose the log with R^2 values of 0.79 (raw). # which is a 5% improvement over 0.74 for the untransformed data: boston.log.lm <- lm(log(MEDV) ~., data=boston) # So much for response transformation: one just tries a few values. # (Reminder: box.cox works only for responses with positive values.) # For predictor transformations, one could use plots of residuals versus # predictors to get a sense of whether a nonlinear transformation is needed. # Unfortunately, this is not a powerful method: residuals from linear # regression tend to hide nonlinearities. # As with previous residual plots, we have the problem of not knowing # what is odd and what is not. Again, a comparison with well-behaved # residuals helps. In the same file, "residual-comparison.R", # there is also a function for plotting raw residuals vs. all predictors, # paired with similar plots of well-behaved residuals: source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/residual-comparison.R") res.vs.predictors(boston.log.lm, preds=colnames(boston)[1:7]) for(i in 1:100) # with null animation if you like res.vs.predictors(boston.log.lm, preds=colnames(boston)[1:7], new=F) # same for the remaining predictors: res.vs.predictors(boston.log.lm, preds=colnames(boston)[8:13]) for(i in 1:100) # with null animation if you like res.vs.predictors(boston.log.lm, preds=colnames(boston)[8:13], new=F) # These plots make it clear that the large residuals are strongly clustered # on specific values of several predictors. One should now investigate # these subsets. In particular it seems that the ties in RAD and TAX # have more variability; since we now know that these are mostly Boston proper # it appears that the data does not have sufficient predictor information # to adjust for differences within Boston proper. # # As for nonlinearities, only RM might suggest a transformation, # but it would not be a standard Box-Cox transform, more something # that transforms linearly on both ends but has a kink in the middle. # This would go well with what we know from previous explorations: # for small values of RM there is little assocation with MEDV, but # for large values of RM there is. # [Note on the literature: # Some authors prefer the "components plus residual" plot, where the # "component" bi*xi is added to the residual on the vertical axis, # but I find this a less successful plot because the slope obscures # the curvatures somewhat.] #### # # THE ACE METHOD FOR NONLINEAR TRANSFORMATION # # Instead of going through a tiresome search for nonlinear transformations # one at a time with the aid of graphics, here is a big hammer # of a very different kind to solve the problem of nonlinear transformations: # the ACE method by Breiman and Friedman (JASA, 1985). # B&F propose to estimate transformations of the predictors and the response # simultaneously in one single algorithm. In effect, they maximize # cor(g(Y), sum_i fi(Xi)) # over all possible smooth transformation "g" of the response Y and # "fi" of the predictors Xi. Their ACE algorithm optimizes one transformation # at a time but then cycles through the transformations: # maximize in turn g, f1, f2,... fp, g, f1, f2,... fp, g,... # # In order to get at the code for ACE, you have to download "acepack" from # the R-CRAN site (http://cran.r-project.org/) and install it. You can do # that with the following function call WHILE CONNECTED TO THE INTERNET: http://cran.us.r-project.org/ # click "Packages", search for "acepack", # download to local folder install.packages(pkgs="acepack_1.3-2.2.zip", repos=NULL) # call once for all # This has to be done once only. # However, you still have to call library(acepack) # in every session once before you use the functions in the package. # Then you can look at help(ace) # Here is an example of the application of ACE to the Boston Housing data: boston.ace <- ace(x=boston[,-14], y=boston[,14]) # The code is old and predates the formula language in R/S, which is why you # enter oldfashioned x and y arguments. # The returned data structure has the transformation evaluated at the # observed predictor values: names(boston.ace) boston.ace$rsq # The original data are in ...$x and ...$y while their transformations # are in ...$tx and ...$ty. # Here is a layout of the plots of the transformations: windows(width=3*3, height=5*3) par(mfrow=c(5,3), mgp=c(1.5,0.5,0), mar=c(2.5,2,1,1)) ylim <- range(boston.ace$tx) # shared range of the fi(Xi) plot(boston.ace$y, boston.ace$ty, # plot g(Y) cex=.7, pch=16, xlab="MEDV", ylab="g(MEDV)") for(i in 1:13) plot(boston.ace$x[i,], boston.ace$tx[,i], cex=.7, # plot fi(Xi) ylim=ylim, pch=16, xlab=dimnames(boston.ace$tx)[[2]][i], ylab="") # (Note the idiosyncrasy of storing x as pxn but tx as nxp.) # We note that the response is untransformed for all practical purposes. # Among the predictor transforms we note that RM and LSTAT are dominant: # - RM has a curious transform: it seems to dip first, but only slightly, # which is why it probably wants to stay flat first and then to shoot up # starting with about 6.5 rooms. # - LSTAT has a convex transform, which we might want to model with a # quadratic transformation. # A good use of ACE is to read off an analytic form of a transform for a # variable, and then revert to linear modeling with the analytically # transformed variables. For example: summary(lm(MEDV ~ CRIM + ZN + INDUS + CHAS + NOX + I(NOX^2) + I(pmax(RM-6.5,0)) + DIS + RAD + TAX + PTRATIO + B + log(LSTAT), data=boston)) # as opposed to: summary(lm(MEDV ~ ., data=boston)) # The most creative transformation is for RM (average number of rooms): # f(RM) = max(RM-6.5,0), which is a kink function: zero up to 6.5 and # linear therafter. The ACE plot would suggest a slightly negative # slope for RM<6.5, but I thought this might be spurious. The # kink function has a clear interpretation: at the level of aggregation # of census tracts, the average number of rooms RM does not matter for # RM small; only for RM>6.5 do we see a linear effect. That is, # if the houses are small on the average, the median house price is not # affected by the degree of smallness; but if the houses are larger on # the average, the degree of largeness matters. In census tracts with # small houses, everything else matters more: LSTAT above all. # This is compatible with the story we obtained from tree regression earlier. # # Besides a kink transformation of RM, I added quadratic transformations # of NOX and log of LSTAT, again based on the ACE plot. It turns out that # NOX^2 is insignificant. I was first suspicious that NOX and NOX^2 # are too collinear (correlated) with each other, which might mask their # joint explanatory power. I decorrelated NOX^2 from NOX by subtracting # the mean: I((NOX-mean(NOX))^2), but it didn't help, still insignificant, # so I drop this term in the next model. [Btw, why does subtracting the # mean have a decorrelation effect? Compare plot(NOX^2 ~ NOX, data=boston, pch=16) plot((NOX-mean(NOX))^2 ~ NOX, data=boston, pch=16) nox <-boston[,"NOX"]; cor(nox, nox^2); cor(nox, (nox-mean(nox))^2) # # The linear model fitted above would suggest removing a few variables: # ZN, INDUS, NOX^2. summary(lm(MEDV ~ CRIM + CHAS + NOX + I(pmax(RM-6.5,0)) + DIS + RAD + TAX + PTRATIO + B + log(LSTAT), data=boston)) # Indeed, with adjusted R^2=0.82, we're pretty well off. # And the transforms of RM and LSTAT being the most significant, # we have come close to the story that we induced from tree regression. # Back to diagnostics of the quality of fits: windows(width=12, height=12) par(mfrow=c(2,2), mgp=c(1.5,0.5,0), mar=c(3,3,2,1)) plot(lm(MEDV ~ CRIM + CHAS + NOX + I(pmax(RM-6.5,0)) + DIS + RAD + TAX + PTRATIO + B + LSTAT + I(LSTAT^2), data=boston), pch=16, cex=.5, cex.lab=1.2, cex.id=1.2) # Oh well, we still have the problem of Boston proper; # the transformations improved the fit but did not solve the # Boston problem. #### # # MODEL SELECTION # # In the previous example we stepped up and down a few times # in the process of modeling: diagnostics lead to transformations and # additional terms (squares, for example), while findings of # statistical insignificance lead to elimination of terms. # This process can be helped with tools for searching # subsets of predictors for strong explanatory power. # The standard tools that are part of R are two functions, # "add1()" and "drop1()", which do just what they say: # tell us what happens if one adds one term or drops one term # at a time. drop1(lm(MEDV ~ CRIM + CHAS + NOX + I(pmax(RM-6.5,0)) + DIS + RAD + TAX + PTRATIO + B + LSTAT + I(LSTAT^2), data=boston), test="F") # "drop1()" goes thru all available variables and shows # what would happen if each were dropped. # The criteria are "Sum of sq" of the dropped term, # the RSS after dropping a term, and the so-called AIC # (Akaike's criterion). The latter is something like # the mean squared error plus a penalty for the number # of fitted degrees of freedom. # A bigger hammer for searching among models is in the library # "leaps" which you can download and install from the site # http://cran.r-project.org/ # As with "acepack", you can get this done with the following # function call WHILE YOUR COMPUTER IS CONNECTED TO THE INTERNET: http://cran.us.r-project.org/ # click "Packages", search for "leaps", # download to local folder install.packages(pkgs="leaps_2.7.zip", repos=NULL) # call once for all library(leaps) # needs to be called in every session # The name of the library is taken from the "leaps-and-bound" # algorithm that is used in the search for subsets of predictors. # Again, in every R session you have to call the library function # before using anything in the package: # The function 'regsubsets()' finds the best subset of predictors # for every size. Optional arguments: # nbest=2 will find the two best for each size # nvmax=13 will find them for all sizes up to 13 summary(regsubsets(log(MEDV) ~., data=boston, nbest=2, nvmax=13, force.in=c(6,13))) # There is also a graphical way of looking at these models: windows(width=6, height=12) plot(regsubsets(MEDV ~., data=boston, nbest=2, nvmax=13), scale="adjr2") plot(regsubsets(MEDV ~. - RM - LSTAT + I(pmax(RM-6.5,0)) + log(LSTAT), data=boston, nbest=3, nvmax=13), scale="adjr2") # # All the above still doesn't answer how high we can reasonably go with # the number of predictors in the model. # Here is a helpful trick: add a few random number predictors to the model # and check whether the regressions get trapped in them. nnoise <- 10 # the number of random predictors we add boston.ext.dat <- # the data frame of actual and random data data.frame(boston, as.data.frame(matrix(rnorm(506*nnoise), ncol=nnoise))) # Here is the extended model: summary(lm(log(MEDV) ~ ., data=boston.ext.dat)) # Fortunately most random predictors are recognized as worthless, # but occasionally there's one that sticks out by chance. boston.ext.sub <- regsubsets(nvmax=20, nbest=2, log(MEDV) ~. - RM + I(pmax(RM-6.5,0)) - LSTAT + log(LSTAT), data=boston.ext.dat) summary(boston.ext.sub) plot(boston.ext.sub, scale="adjr2") # Again, it's comforting to see that only in models of size 10 # do we see the first noise predictors move in. # So one might conclude that models of size up to 9 would be ok. # I still find this optimistic, but this isn't randomly sampled data, # so there is no way to tell what natural variability is. # Come to think of it, this is highly aggregate data of census tracts, # so maybe the signal is strong enough that fitting 9 predictors is # not that unreasonable. ################################################ # # INTERPRETATION OF REGRESSIONS # # - paper by Brian Joiner on missing variables # - Mosteller & Tukey: WW2 examples of proxy interpretation # . low/high flying bombers and precision of bombing # . flak wholes in bombers (A. Wald) # - Mosteller & Tukey: example of multiple predictors (collinearity) # #### # # REGRESSION DIAGNOSTICS 4: COLLINEARITY # # Collinearity is the problem posed by correlated (partly confounded) # predictors. Correlated predictors make it hard to sort out the # impact of each predictor on the response. # To study collinearity and its effects, let's make up a toy example. # We start with two predictor vectors x1 and xx that are constructed # to be exactly uncorrelated and have exactly unit variance: n <- 100 x1 <- scale(rnorm(n)) xx <- scale(resid(lm(rnorm(n) ~ x1))) cor(x1,xx) # Response = sum of predictors + error: y <- x1 + xx + rnorm(n,sd=1/4) # So by construction the true coefficients are both +1. # Regression finds them quite well: coef(lm(y ~ x1 + xx)) # Because of complete absence of correlation among predictors, # the multiple regression coefficients are the same as the # simple regression coefficients: coef(lm(y ~ x1)) coef(lm(y ~ xx)) # But this is only the raw material for constructing a family of # 'collinear' data: a <- pi/10 cos(a) x2 <- cos(a)*x1 + sin(a)*xx cor(x1,x2) # The cosine is exactly the correlation. # This holds for all angles a; it's an analytic fact. # At any rate, we now have an intuitive way of constructing # predictors x1 and x2 with prescribed correlation # (= prescribed "degree of collinearity"). # # Here is a simulation in which we collect the t-statistics for # various degrees of collinearity as measured by cor(x1,x2). # R^2 is held at roughly 0.5. nsim <- 100 # number of simulations angs <- seq(pi/2,pi/50,length=5) # list of angles for correlations res.coll <- # results: t-stat of x1 and x2, and R^2 array(NA, dim=c(t1.2.R=3,sim=nsim,cor=length(angs))) for(isim in 1:nsim) { err <- scale(rnorm(n)) # an error vector, scaled to var=1 for(iang in 1:length(angs)) { ang <- angs[iang] # the current correlation x2 <- cos(ang)*x1 + sin(ang)*xx # rotate x2 toward x1 sd <- sd(x1+x2) y <- x1 + x2 + err*sd # sd needed to keep R^2 fixed at 0.5 sy <- summary(lm(y ~ x1 + x2)) res.coll[1:2,isim,iang] <- sy$coefficients[c("x1","x2"),"t value"] res.coll[3,isim,iang] <- sy$r.squared } print(isim) } # Here are the histograms of the simulated t-statistics # for each correlation: windows(width=6, height=13.5) par(mfrow=c(length(angs),1), mar=c(2,0,2,0)) for(iang in 1:length(angs)) hist(res.coll[1:2,,iang], breaks=20, col="grey", xlim=c(-3,10), main=paste("cor =",round(cos(angs[iang]),3), " ang =",round(angs[iang]/pi*2*90,3))) # We see that the t-statistics move from extreme significance # to insignificant as the collinearity increases, # while R^2 keeps hovering around 0.5. # If we take the value 2 as a rough dividing line between small and # large t-values, it is informative to see what fraction is below 2: cbind(corr=round(cos(angs),3), perc.below.2=apply(res.coll[1:2,,], 3, function(x) round(sum(x<2))/nsim/2*100)) #---------------------- output begin corr perc.below.2 [1,] 0.000 0.0 [2,] 0.368 0.0 [3,] 0.685 1.0 [4,] 0.905 45.0 [5,] 0.998 97.5 #---------------------- output end # For corr=0.905, we have about 45% of the t-values below 2, # for corr=998, it is almost all. # # This may not seem terrible: who sees ever a correlation of 0.905? # If all we did was regression onto two predictors, we'd not be worried. # But the aggrevating factor is that collinearity with more than two # predictors can occur much more easily: with x1, x2 and x3, # we have a collinearity problem not only when two predictors # are close, but also when for example x1 is close to x2+x3. Why? # Here is an argument that leads up to the general reason: # If x1 is close to x2 + x3, this implies - x1 + x2 + x3 ~= 0 # (where we use "~=" to mean "approximately equal"). # More generally we argue that the existence of any # nearly vanishing linear combination is trouble: # # 0 ~= a0 + a1*x1 + a2*x2 + a3*x3 # # Now, if the least squares fit is # # y ~= b0 + b1*x1 + b2*x2 + b3*x3 # # then the following will also be pretty good fits: # # y ~= (b0+c*a0) + (b1+c*a1)*x1 + (b2+c*a2)*x2 + (b3+c*a3)*x3 # # for values of "c" that are not too extreme. # That is, the fit is about the same for all coefficient vectors # # (b0+c*a0, b1+c*a1, b2+c*a2, b3+c*a3) (c small) # # # Question: Does there exist a method for finding approximately # vanishing linear combinations in predictor space? # Answer: Yes, the method is called "principal components". # # #### # # DETECTION OF COLLINEARITIES WITH PRINCIPAL COMPONENTS: # # Before we approach the technicalities, we have to solve the problem # of scale or units in predictor space: # If some predictors have small numbers and others large numbers due # to the scale on which they are measured (oz, lbs, tons, in, miles,...), # we don't really know what it means for a linear combination of predictors # to be "small" (the apple-and-oranges problem). # # To circumvent the issue, one can standardize the predictors, that is, # one subtracts the mean and divides by the sdev (in R: "scale()"). # For example: boston.std <- as.data.frame(scale(boston)) # scale() returns a matrix # But we probably want to keep the response "MEDV" as is, so we restore it: boston.std[,"MEDV"] <-boston[,"MEDV"] # # The regression coefficients in the standardized "boston.std" will # of course differ from those in the raw "boston": # The unit of measurement in "boston.std" is the standard deviation # of each predictor. Hence the regression coefficient "bi" now expresses # # "average change in the response # per standard deviation of change in xi." # # In the raw data, it expressed # # "average change in the response # per unit (lb, mile,...) of change in xi". # # After standardization, all predictors are on an equal footing, # and it now makes sense to ask what linear combinations of the # predictors are small, as long as the coefficients "ai" are held # at a constant length, for example: # # a1^2 + a2^2 +...+ ap^2 = 1. # # If there does not exist any degree of collinearity, that is, # if the predictors are strictly uncorrelated: cov(xi,xj) = 0, # the linear combination has variance 1: # # var(a1*x1 +...+ ap*xp) = 1 # # (Can you do the calculation? It's just another Pythagoras.) # # Now we are ready: # # Principal component analysis (PCA) finds linear combinations whose # variance is either very large or very small (and everything inbetween). # # We will use the small variance solutions for collinearity detection. # While there exists a routine for PCA in the library "mva", we can # more easily get them in the form we need with the R function in the # file "collinearity-pca.R" which you should download and source into R: source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/collinearity-pca.R") # Here is the application to the standardized predictors of # the Boston Housing data: # collinearity.pca(boston.std[,-14], nlin=13) #-------------------------- output begin $variances PC13 PC12 PC11 0.064 0.169 0.186 $coefficients PC13 PC12 PC11 CRIM 0.046 -0.087 -0.110 ZN -0.081 0.071 0.263 INDUS -0.251 0.113 -0.303 CHAS 0.036 0.004 0.014 NOX 0.044 -0.804 0.111 RM 0.046 -0.153 0.053 AGE -0.039 0.212 -0.459 DIS -0.018 -0.391 -0.696 RAD -0.633 0.107 0.037 TAX 0.720 0.215 -0.105 PTRATIO 0.023 -0.210 0.175 B -0.004 -0.042 0.019 LSTAT 0.024 -0.055 0.271 #-------------------------- output end # # So there exists a linear combination that has only 6.4% variance # compared to the variance for uncorrelated predictors. # This is worth looking at: # We search for the large coefficients in the corresponding linear # combination: # INDUS: -0.25 RAD: -0.63 TAX: 0.72 # After rounding these coefficients some and moving TAX to the other side, # we get the approximate equation: # # 3/4*TAX ~= 1/4*INDUS + 2/3*RAD or TAX ~= 1/3*INDUS + 8/9*RAD # # Now we should convince ourselves that this collinearity is real: windows() plot(TAX ~ I(1/3*INDUS + 8/9*RAD), data=as.data.frame(boston.std), pch=16, cex=1) # (Recall the formula language can also be used for plotting.) # This plot looks very unconvincing: # the correlation seems to be due to a couple of outlying points. # We have to be smarter, though: Does the plot look like it's showing # 506 points? # We must suspect overplotting due to tied values. # With a little bit of jittering we find that the two points are # a whole lot of points. We plot both, jittered and raw: windows(width=6, height=12) par(mfrow=c(2,1), mgp=c(1.8,.5,0), mar=c(3,3,1,1)) plot(TAX ~ I(1/3*INDUS + 8/9*RAD), data=boston.std, pch=16, cex=.5) # jitter the same: jit <- .1; jit.x <- runif(506,-jit,jit); jit.y <- runif(506,-jit,jit) plot(TAX + jit.y ~ I(1/3*INDUS + 8/9*RAD + jit.x), data=boston.std, pch=16, cex=.5) # In histograms: par(mfrow=c(3,1)) hist(boston.std[,"TAX"], col="gray", breaks=100) hist(boston.std[,"INDUS"], col="gray", breaks=100) hist(boston.std[,"RAD"], col="gray", breaks=100) # The spikes are the 132 census tracts of Boston proper: sel.tax <- boston.std[,"TAX"]>0.5 rownames(boston.std)[sel.tax] sel.ind <- 0.9 < boston.std[,"INDUS"] & boston.std[,"INDUS"]<1.1 rownames(boston.std)[sel.ind] sel.rad <- boston.std[,"RAD"]>0.5 rownames(boston.std)[sel.rad] # We mostly singled out Boston proper: table(sel.tax, sel.ind, sel.rad) rownames(boston.std)[sel.rad & sel.ind & sel.tax] # except for 5 census tract in Chelsea: rownames(boston.std)[!sel.rad & !sel.ind & sel.tax] # Here is a tabulation of all names truncated to the first 6 letters # so we lump all of "Boston..." into one category: sort(table(substring(rownames(boston.std),1,6))) # So it appears the collinearity stems from grouping Boston proper # separately from the rest on these three variables simultaneously. # Strange! # # Final remarks on PCA: # # * As we said, this use of PCA is non-standard. # The standard use of PCA is looking for the largest eigenvariances, # whereas we looked for the smallest, which we interpreted as # approximate implicit equations. # #--------------------- end of class 19 ------------------------------------------- # # comment in class: what if TAX ~= 1/3*INDUS + 8/9*RAD # is substituted in the regression equation? # # # * While we used implicit equations as a collinearity detector, # we can also think of it as a competitor to linear regression. # Its main features: # . Symmetric treatment of all variables, w/o singling out a response. # The strength of the method is that it can tell us in which # variables there exists linear degeneracy. # It could well be that the strongest linear relationships # hold among variables of no interest. # Then again, if we find strong linear relationships among # variables of interest, we ARE interested in knowing. # . We obtain a hierarchy of implicit equations, with decreasing # quality in terms of increasing deviation of the data from the # equation. We say, for example, that (100-6.4)% = 93.6% of the # variation lives "within the equation" and # only 6.4% "outside of the equation". # As we go up the ladder, ever more variation lives "outside". # # Example of the second equation in the Boston Housing predictors: # Its variance is 0.169. Its main coefficients are # NOX: -0.8, DIS: -0.39, AGE: 0.21, TAX: 0.22, PTRATIO: -0.21 # This yields the following rounded approximate implicit equation: # -0.8*NOX - 0.4*DIS + 0.2*AGE + 0.2*TAX - 0.2*PTRATIO ~ 0 # We note that TAX is again involved, although only in a minor role. # More interesting is that NOX is highly involved. It relates mostly # to DIS, so maybe it's best to simplify to these two predictors: # -0.8*NOX - 0.4*DIS ~ 0 or NOX ~ -DIS/2 # Pictorially: windows(width=6, height=12) par(mfrow=c(2,1), mgp=c(1.8,.5,0), mar=c(3,3,1,1)) plot(NOX ~ DIS, data=boston.std, pch=16, cex=.5) # jitter the same: jit <- .1; jit.x <- runif(506,-jit,jit); jit.y <- runif(506,-jit,jit) plot(NOX + jit.y ~ I(DIS + jit.x), data=boston.std, pch=16, cex=.5) # Look at that! We were off all along by estimating # linear equations, but there is in fact a curved # assocation. It is monotone descending and says that # the closer to the five employment centers we are, # the higher is the pollution, which makes sense. # # Let's check one more, the third smallest PC: # Its major coefficients are AGE: -0.46, DIS: -0.70 # This suggests the following plot: par(mfrow=c(2,1), mgp=c(1.8,.5,0), mar=c(3,3,1,1)) plot(AGE ~ DIS, data=boston.std, pch=16, cex=.5) # jitter the same: jit <- .1; jit.x <- runif(506,-jit,jit); jit.y <- runif(506,-jit,jit) plot(AGE + jit.y ~ I(DIS + jit.x), data=boston.std, pch=16, cex=.5) # Which is a very nice relation, not necessarily nonlinear. # It says the greater the distance from the employment centers, # the fewer homes built prior to 1940, which makes sense yet again. # The employment centers seem to have been the nuclei from # which the settlement started and later reached out to greater # distances. # # This symmetric treatment makes us wonder whether # the implicit equation boston.std[,14] <- scale(boston.std[,14]) # standardize the response collinearity.pca(boston.std, nlin=10) # begin output --------- $variances PC14 PC13 PC12 PC11 PC10 PC9 PC8 PC7 PC6 PC5 PC4 PC3 PC2 PC1 0.060 0.134 0.183 0.213 0.252 0.277 0.403 0.535 0.660 0.851 0.887 1.349 1.650 6.546 $coefficients PC14 PC13 PC12 PC11 PC10 PC9 PC8 PC7 PC6 PC5 CRIM -0.059 0.097 0.063 0.071 -0.071 0.254 0.157 0.777 -0.225 0.005 ZN 0.096 -0.132 -0.221 0.128 0.246 0.383 -0.380 -0.274 -0.336 0.114 INDUS 0.235 0.084 0.348 -0.274 -0.255 0.627 0.172 -0.340 -0.081 -0.023 CHAS -0.023 -0.050 -0.019 0.010 -0.042 -0.019 -0.033 0.074 0.163 -0.535 NOX -0.088 0.525 -0.449 0.437 -0.212 -0.043 0.047 -0.198 -0.149 0.195 RM -0.007 -0.050 -0.126 -0.224 -0.526 -0.004 -0.438 0.074 0.131 -0.008 AGE 0.038 -0.051 0.486 0.330 0.246 -0.043 -0.588 0.119 -0.061 0.150 DIS -0.047 0.552 0.494 0.115 -0.299 -0.176 -0.128 -0.104 0.012 -0.106 RAD 0.635 -0.006 0.019 -0.042 0.116 -0.463 0.075 -0.137 -0.135 -0.230 TAX -0.699 -0.243 0.170 -0.043 -0.008 -0.179 0.071 -0.314 -0.188 -0.163 PTRATIO -0.056 0.188 -0.232 0.100 0.160 0.275 -0.283 0.001 0.279 -0.616 B 0.016 -0.021 -0.042 -0.039 -0.146 -0.061 -0.044 0.075 -0.786 -0.367 LSTAT -0.083 0.249 -0.182 -0.683 0.067 -0.172 -0.357 0.083 -0.092 0.178 MEDV -0.134 0.470 0.098 -0.242 0.576 0.071 0.152 -0.010 -0.054 -0.051 # end output --------- # # This time we are curious what PCs involve MEDV: # - PC13 (2nd smallest): MEDV: 0.47, DIS: 0.55, NOX: 0.53 # (minor: TAX: -.24, LSTAT: 0.25) # Suprisingly, NOX is part of the equation. # Rounded it is: MEDV + DIS + NOX ~ 0 # This suggests plotting MEDV vs DIS + NOX windows(width=6, height=12) par(mfrow=c(2,1), mgp=c(1.8,.5,0), mar=c(3,3,1,1)) plot(MEDV ~ I(DIS+NOX), data=boston.std, pch=16, cex=.5) jit <- .1; jit.x <- runif(506,-jit,jit); jit.y <- runif(506,-jit,jit) plot(MEDV + jit.y ~ I(DIS + NOX + jit.x), data=boston.std, pch=16, cex=.5) # Ad hoc R-square for MEDV ~ - ...: recall that the explicit form is not optimal! cor(MEDV, -as.matrix(boston.std)[,-14] %*% collinearity.pca(boston.std, nlin=14)$coeff[-14,"PC10"])^2 0.5598944 # How does this compare to the LS fit which is the optimized explicit form? summary(lm(MEDV ~ ., data=boston.std)) Multiple R-Squared: 0.7406, Adjusted R-squared: 0.7338 # Of course there is a considerable difference. # # A reminder: scatterplot matrix, jittered. # Wondering why we didn't see certain things earlier, e.g., the Boston clump? eps <- .0; jit <- runif(nrow(boston.std)*ncol(boston.std), -eps, eps) pairs.plus(boston.std+jit, pch=16, cex.labels=.8, gap=.1, cex=.1, jitter=.4) # It's visible in the plots involving RAD and TAX # for jittering with eps=.4, but not eps=0 (no jittering). ################# # # Discussion of collinearity: # # * Collinearity affects how well determined the coefficients are, but... # # * Collinearity is a special case of NON-IDENTIFIABILITY. # # Intuitively, non-identifiability exists whenever model parameters # can compensate for each other to achieve roughly equivalent data fits. # # * Marginally, non-identifiable model parameters show the # collinearity/non-identifiability problem in terms of inflated # standard errors. # # * The problem with standard errors is that they hide # how the compensations among model parameters take place. # # A large standard error does not mean that the parameter can be # changed arbitrarily without affecting the data fit. # Some other parameter has to be changed to compensate. # # For example if the collinearity implies a1*x1 + a2*x2 ~ 0, # and if the LS fit is y ~ b1*x2 + b2*x2, # then a change b1 -> b1+a1 can be compensated for by b2 -> b2+a2. # # * Non-identifiability can occur in all modeling attempts. # In a talk at Wharton, David Scott (Rice U, 2007/03/21) showed # how the mixture components of mixture models are # highly non-identifiable. He demonstrated a technique # (also based on eigen-decompositions) that visualizes # the equivalence of the fitted densities while changing # the mixture parameters. # Here is what he may have done for linear regression: boston.LScoeffs <- lm(MEDV ~ ., data=boston.std)$coeff[-1] boston.collinear <- collinearity.pca(boston.std)$coeff[,1:2] boston.evals <- collinearity.pca(boston.std)$variances[1:2] windows(wid=10, hei=6) for(eps in seq(0,10*2*pi,by=.01)) { par(mfrow=c(3,5), mar=c(1,1,1,1)) for(i in 1:length(boston.LScoeffs)) { plot(x=c(-1,1), y=c(-1,1), xaxt="n", yaxt="n", xlab="", ylab="", type="n") text(x=0, y=.9, lab=names(boston.LScoeffs)[i]) abline(b=boston.LScoeffs[i], a=0, col="red") abline(b=boston.LScoeffs[i] + boston.collinear[i,1]*cos(eps)*sqrt(boston.evals[2]/ sum(boston.evals)) + boston.collinear[i,2]*sin(eps)*sqrt(boston.evals[1]/ sum(boston.evals)), a=0, col="blue") points(x=cos(eps), y=sin(eps), pch=16, col="gray") } } # We can read off the motion which coefficients are ill-determined # and how they compensate for each other. # # Compare this with the bootstrap spaghetti bands which are # uninformative wrt to compensation (they reflect standard error): par(mfrow=c(3,5), mar=c(1,1,1,1)) for(i in 1:length(boston.LScoeffs)) { plot(x=c(-1,1), y=c(-1,1), xaxt="n", yaxt="n", xlab="", ylab="", type="n") text(x=0, y=.9, lab=names(boston.LScoeffs)[i]) for(jb in 1:100) abline(b=lm(MEDV ~ ., data=boston.std[sample(1:nrow(boston.std), replace=T),])$coeff[-1][i], a=0, col="gray") abline(b=0, a=0) abline(b=boston.LScoeffs[i], a=0, col="red") } # # # * Predictions are NOT affected by collinearity. # Predictions are affected by whether we extrapolate or not. # => For prediction one needs to keep in mind the "range of experience" # (= the design = the predictor distribution in the training sample) # Detection of extrapolation with many predictors and/or in # highly non-linear models can be difficult. # # I have never seen a proposal for a new regression method # accompanied by a diagnostics for detecting extrapolation. # If models are sufficiently non-linear, situations we might # consider "interpolations" actually turn out to be "extrapolations". # # ################################################################ # # PCA FOR DIMENSION REDUCTION # # Above we used PCA to find approximate implicit linear equations # among predictors of linear regression problems. This is a lesser # known application of PCA. # Now we look at PCA as a dimension reduction tool, which is the # common use of PCA. # # We said that PCA finds coefficient vectors that optimize variance # of linear combinations under a quadratic constraint on the coefficients, # For collinearity detection were were interested in smallest variances, # but for dimension reduction we want largest variances. # We think of a multivariate dataset as points in p-dim space, # and if the variables are highly correlated, the point cloud # will be rather flat, that is, intrinsically approximately of # fewer dimensions than 'p'. PCA for dimension reduction has # the goal of finding the few essential dimensions, essentially # by constructing a coordinate system in p-space whose first # coordinate has the largest variance, the second coordinate # has the second larges variance (orthogonal to the first),... # # We also said earlier that this means solving an eigenvalue problem # for a correlation matrix. We could just apply the R function 'eigen()' # to correlation matrices, but instead we use a souped-up function # which has some niceties, in particular with regard to labeling # of the results: pca <- function(x, scale=T, center=T, proj=T) { nobs <- nrow(x); nvars <- ncol(x) x <- scale(x, scale=scale, center=center) s <- svd(x/sqrt(nobs-1)) pca.lst <- list(evals=s$d^2, loadings=t(t(s$v)*s$d)) if(proj) pca.lst$projections <- t(t(s$u)*s$d)*sqrt(nobs-1) names(pca.lst$evals) <- paste("eval ",1:nvars,sep="") dimnames(pca.lst$loadings) <- list(colnames(x),paste("load ",1:nvars,sep="")) if(proj) dimnames(pca.lst$projections) <- list(rownames(x),paste("proj ",1:nvars,sep="")) for(j in 1:nvars) if(-sum(pmin(pca.lst$l[,j],0))>sum(pmax(pca.lst$l[,j],0))) # sign change { pca.lst$loadings[,j] <- -pca.lst$loadings[,j]; if(proj) pca.lst$projections[,j] <- -pca.lst$projections[,j] } return(pca.lst) } # The function returns a list with a these components: # - e'values or principal variances that describe the # 'fatness' of the data along of each PC direction # described by PC coefficients. # - a matrix of PC coefficients, also called 'loadings'; # the columns are coefficient vectors that define linear # combinations; the columns are normalized so their # squared lengths are the same as their respective # eigenvalues or variances. # - a matrix of the same size as the data matrix containing # the data represented in the PC coordinate system; # the PC coordinate system is orthogonal and data # are uncorrelated in it # An example: mktg.pca <- pca(mktg[,1:8]) # omit the segmentation variable 9 names(mktg.pca) # Here is how to use the output: # # 1) Eigenvalue Profile: length(mktg.pca$eval) plot(mktg.pca$eval, pch=16, type="b", ylab="Eigenvalues of Mktg Data", ylim=c(0,max(mktg.pca$eval))) # One can also easily eyeball the values: round(mktg.pca$eval, 3) eval 1 eval 2 eval 3 eval 4 eval 5 eval 6 eval 7 eval 8 2.640 1.689 1.034 0.812 0.673 0.527 0.422 0.203 # It shows that the largest principal components has a variance of 2.640, # the second largest a variance of 1.689,... # The sum of the variances equals the number of the variables: sum(mktg.pca$eval) # (For those who know lin.alg.: an orthogonal coordinate transformation # preserves the trace of a linear map, here the correlation matrix # which has 'p' 1s in the diagonal.) # Therefore, the eigenvalue profile has a 'budget' of size 'p', which # it allocates to the PCs in descendig order. In the extreme case # of a totally uncorrelated set of input variables, all eigenvalues # are 1, and the eigenvalue profile is completely flat. # (Reason: the correlation matrix is the identity, which remains the # same under an orthogonal coordinate transformation.) # Eigenvalue profiles are often associated with cumulative variances: round(cumsum(mktg.pca$eval)/sum(mktg.pca$eval)*100, 1) eval 1 eval 2 eval 3 eval 4 eval 5 eval 6 eval 7 eval 8 33.0 54.1 67.0 77.2 85.6 92.2 97.5 100.0 # One says that the first PC accounts for 33% of the variance, # the first two PCs for 54.1% of the variances,... # # Since variance is squared standard deviation, the root of # the eigenvalues is a better reflection of the spread of the # data in the eigendirections: plot(sqrt(mktg.pca$eval), pch=16, type="b", ylab="Root-Eigenvalues of Mktg Data", ylim=c(0,max(sqrt(mktg.pca$eval)))) # So this look a little less dramatic: round(sqrt(mktg.pca$eval), 2) eval 1 eval 2 eval 3 eval 4 eval 5 eval 6 eval 7 eval 8 1.62 1.30 1.02 0.90 0.82 0.73 0.65 0.45 # The ratio of largest to smallest standard deviation is only 1.62 / 0.45 [1] 3.6 # # 2) Eigenvectors: dim(mktg.pca$load) round(mktg.pca$load, 3) load 1 load 2 load 3 load 4 load 5 load 6 load 7 load 8 in. 0.713 0.429 -0.090 -0.003 0.133 0.162 -0.503 0.053 out 0.850 0.337 -0.102 0.001 0.020 -0.084 0.179 -0.337 rev 0.812 0.378 -0.076 -0.053 -0.072 -0.120 0.291 0.292 bus 0.401 -0.404 0.457 0.497 0.451 -0.125 0.019 0.028 reach 0.473 -0.482 -0.039 0.424 -0.574 0.180 -0.043 -0.002 kids 0.364 -0.628 -0.342 -0.261 0.296 0.426 0.136 0.014 age -0.378 0.672 0.184 0.330 0.060 0.478 0.175 -0.004 edu 0.301 -0.088 0.805 -0.454 -0.161 0.147 -0.001 -0.018 # The idea is to interpret the coefficients based on their # signs and magnitudes: # - The first PC direction (column vector) is strongly # driven by the first three variables, those that measure # overall usage volumn. To a lesser degree, it also relies # on the needs variables 'bus', 'reach', 'kids', then on # inverse age, and finally on education. Thus the first PC # is a direction that measure overall 'interestingness to # the company': this PC is large if the usage volumn is large, # if there are needs, if the household is young and well- # educated, all of which makes a customer household interesting. # - The second PC direction is a contrast between usage volumn # on the one hand and the other variables on the other hand. # This PC is large if usage is high but all the qualitative # variables are uninteresting. # - I wouldn't interpret the 3rd through 7th PC because they # are too close to 1 and each other, which makes them # ill-determined. (If two eigenvalues are identical, # they have infinitely many eigenvectors.) # - The smallest PC is largely a difference between 'out' and # 'rev', which makes sense: if after standardization the # difference 'out-rev' is small, it means out ~ rev, # which is confirmed by plot(scale(mktg[,c(2,3)]), pch=16); abline(a=0,b=1) # # 3) PC Projections: windows(10,10); par(mgp=c(1.8,.5,), mar=c(3,3,2,1)) xlim <- ylim <- range(mktg.pca$proj) pairs.plus(mktg.pca$proj, pch=".", cex.lab=.5, xlim=xlim, ylim=ylim, gap=.1) #---------------------- end of class 20 ------------------------------------------ # It is mandatory that the limits be fixed for all frames so that # the relative dispersions are visible. The smallest principal # components should show their small spreads as reflected by their # small variances (=eigenvalues). # [If the data had the shape of a pancake in p-space, then the # first two dimensions would show the pancake top-down. # The remaining dimensions would who it over the edge. # The mktg data here are not very 'flat', hence not very # pancake-shaped.] # The striations in some of the pictures should not worry us: # they correspond to layers formed by the discrete variables # 'bus', 'reach', 'kids', 'age', 'edu'. # An interesting use of PCA for this dataset is to show the # nature of the segmentation: pairs.plus(mktg.pca$proj, pch=".", cex=1, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[mktg[,"seg"]], gap=.1) # Here is an enlarged version of the largest two PCs for # interpreting the segments: windows(5,5) par(mgp=c(1.8,.5,0), mar=c(3,3,1,1)) plot.plus(mktg.pca$proj[,1:2], pch=16, cex=.5, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[mktg[,"seg"]]) # It shows that the 'kmeans' clustering algorith used to form the # segments does not find separated clusters. Most surprisingly, # it does not find the 'natural' clusters formed by the levels # of the binary variables 'bus', 'reach', 'kids'. Rather, it # roughly carves up the 'pie' in the two largest dimensions # found by PCA. # Similarly, one can color business use, kids, reach needs, even discretized usage: windows(wid=12, hei=6) par(mfrow=c(2,4), mar=c(1,1,2,2), mgp=c(1.5,.5,0)) xlim <- ylim <- c(-5,5) for(i in names(mktg)[1:8]) plot.plus(mktg.pca$proj[,1:2], pch=16, cex=.2, xlim=xlim, ylim=ylim, main=i, xaxt="n", yaxt="n", xlab="", ylab="", col=c("red","green","blue","brown")[(mktg[,i]>median(mktg[,i]))+1]) ################ # # PCA: math ## * standard description: eigendecomposition of the correlation matrix ## (warning: do not believe those who say one should use the ## covariance matrix; this makes sense only when all ## variables live on the same scale, e.g., each case ## is a time series) ## * problem solved: Var( ) = stationary subject to |a|^2 = 1 ## interpretation: Var(a1*X1+...+ap*Xp) = stationary ## subject to Var(a1*X1)+...+Var(ap*Xp) = 1 ## In this interpretation there is no need to standardize the variables. ## As posed, it automatically amounts to an eigenanalysis of the corr matrix. ## "Stationary" means ... and the eigenvalues are the ... ## * Properties of the eigendecomposition: ## Corr = U D t(U) where U is pxp orthogonal: t(U) U = I, D is pxp diagonal ## trace(Corr) = ... = trace(D) = ... ## interpretation: trace(Corr) = ... ## average eigenvalue = ... ## => ... is taken as a dividing line between small and large eigenvalues ## This does not make sense, though, because of chance capitalization. ## See inference below ################ # # Another data example: Swiss fertility swiss.pca <- pca(swiss) # # 1) eigenvalue profile: plot(swiss.pca$eval, pch=16, type="b", ylab="Eigenvalues of Mktg Data") round(swiss.pca$eval, 3) eval 1 eval 2 eval 3 eval 4 eval 5 eval 6 3.200 1.188 0.848 0.439 0.205 0.121 round(cumsum(swiss.pca$eval)/sum(swiss.pca$eval)*100, 1) eval 1 eval 2 eval 3 eval 4 eval 5 eval 6 53.3 73.1 87.3 94.6 98.0 100.0 # The first PC accounts for an astounding 53.3% of the variance! # Given the profile plot, one could say the data are largely # one-dimensional. # # 2) eigenvectors/loadings: round(swiss.pca$load, 3) load 1 load 2 load 3 load 4 load 5 load 6 Fertility 0.817 0.351 -0.160 -0.355 0.173 0.164 Agriculture 0.759 -0.449 0.035 0.426 0.170 0.107 Examination -0.912 0.136 -0.084 0.036 0.368 -0.078 Education -0.813 0.195 0.490 0.065 -0.032 0.237 Catholic 0.626 0.159 0.743 -0.066 0.083 -0.140 Infant.Mortality 0.268 0.884 -0.147 0.349 -0.047 -0.026 # The first PC is a contrast of Fertility, Agriculture, # Catholicism versus Examination and Education. # This essentially reflects the positive correlations # between Fertility, Agriculture, Catholicism and the # negatives of Examination and Education. # # 3) PC projections: windows() par(mgp=c(1.8,.5,0), mar=c(3,3,2,1)) xlim <- ylim <- 1.1*range(swiss.pca$proj) pairs.plus(swiss.pca$proj, pch=16, cex=.5, xlim=xlim, ylim=ylim) # Here the most interesting plot is Projection 1 versus 3, which # reveals natural clustering. plot(swiss.pca$proj[,c(1,3)], xlim=xlim, ylim=ylim, pch=16) identify(swiss.pca$proj[,c(1,3)], labels=rownames(swiss), cex=.8) # This reveals the clusters as basically protestant and catholic, # but Geneva all on its own in the top left, with Rive Gauche # and Rive Droite nearby. ################ # # Inference for PCA: bootstrap and permutation tests # # 1) A crude null hypothesis is that there is no correlation # among the variables, implying that the eigenvalue profile # is flat. This can be easily simulated with permutations # of the columns. Here is a spaghetti band for the null # eigenvalues: plot(swiss.pca$eval, pch=16, type="b", ylab="Eigenvalues of Mktg Data") for(i in 1:100) lines(pca(apply(swiss, 2, sample))$eval, col="gray") points(swiss.pca$eval, type="b") # The band is instructive because it reveals the natural chance # capitalization of the eigenvalues even in the total absence # of correlation in the underlying population. The randomness # in the finite sample introduces small spurious correlations # which PCA exploits. We see that a largest PC eigenvalue of # about 1.5 or even 1.8 would be entirely compatible with the # complete absence of correlations among the variables. # 2) Bootstrap: boot <- function(x) x[sample(1:nrow(x),repl=T),] plot(swiss.pca$eval, pch=16, type="b", ylab="Eigenvalues of Mktg Data", ylim=c(0,4)) for(i in 1:100) lines(pca(boot(swiss))$eval, col="gray") points(swiss.pca$eval, type="b") # Unfortunately, the band cannot be expected to contain the true # eigenvalue profile, only the expected values of the eigenvalues # for this sample size. The problem is that the largest eigenvalues # are biased upward and the smallest biased downward due to chance # capitalization. # Think about it! The mere fact that we sort random entities # makes the largest biased upward, the smallest biased downward. # # 3) Bias-correction for eigenvalues with bagging: # Eigenvalues suffer from upward bias on the upper end # and downward bias at the lower end, as we have seen in # the simulation of uncorrelated data. # This problem is the severer... # - the larger the number of variables # - the smaller the sample size, and # - the flatter the eigenvalue profile is. # We can use bootstrap for bias correction. # To this end we write an R function that averages eigenvalue # profiles over bootstrap samples, so-called 'bagging' (Breiman): pca.bag <- function(x, Nboot=1000) { x.pca.evals.boot <- rep(0,ncol(x)) for(i in 1:Nboot) x.pca.evals.boot <- x.pca.evals.boot + pca(boot(x), proj=F)$eval x.pca.evals.boot <- x.pca.evals.boot/Nboot return(x.pca.evals.boot) } # Let us illustrate the bias problem with a small subsample # of size 30 of the marketing data in order to make the # ratio p:N large: mktg.sub <- mktg[sample(1:nrow(mktg),30),-9] # Here are the raw and the bagged profiles: plot(pca(mktg.sub)$eval, pch=16, type="b", ylab="Eigenvalues of Reduced Mktg Data", ylim=c(0,3.5)) points(pca.bag(mktg.sub), type="b", col="blue") # We see that the bias increases in magnitude on both ends. # Here is once again the raw profile, this time accompanied by # a bias-corrected profile: plot(pca(mktg.sub)$eval, type="b", pch=16, ylab="Eigenvalues of Reduced Mktg Data", ylim=c(0,3.5)) points(2*pca(mktg.sub)$eval - pca.bag(mktg.sub), type="b", pch=16, col="blue") # We have a reversal from eigenvalue 3 to 4 after bias correction, hence # one should say that the eigenvalues 3 and 4 are virtually identical. # Proper eigenvalue profiles are strictly ordered, but # bias-correction can violate the order. # # Overall we should be aware of the bias problem # especially when there are many variables, and/or few observations, # and/or the eigenvalue profile is relatively flat. ################################################################ # # K-MEANS CLUSTER ANALYSIS # # The K-means clustering algorithm tries to divide the data # into a pre-specified number of groups (K) in such a way # that the groups are somehow homogeneous and separated # from each other if possible. # # The algorithm works by placing K tentative cluster centers # into data space. The algorithm then moves the centers # around till a certain criterion is minimized. # The criterion is the sum of squared distances to the # nearest cluster center: # # sum_{i=1..N} min_{j=1..K} || x_i - m_j ||^2 = min_{m_1,...,m_K} # # where m_1,..., m_K are the cluster centers in p-space # and x_1,..., x_N are the N data vectors/rows/cases/units/records. # Obviously, the k-means clustering algorithm assumes quantitative # variables. # # Example in R: function 'kmeans()', data: telecom marketing # We use a random selection of K=4 rows as initialization of # the cluster centers. dat <- scale(mktg[,-9]) N <- nrow(dat) p <- ncol(dat) K <- 4 # decide on 4 clusters/segments mktg.km <- kmeans(dat, iter.max=1000, centers = dat[sample(N, size=K),] ) # Note that the number of cluster is implicit in the dimensions of 'centers'; # no need for a special argument for K. # Examine the returned data structure: names(mktg.km) # The following lists the breakdown into mktg.km$size # The final centers are the following: mktg.km$centers # The assignment of the rows of 'mktg' to the clusters is in: mktg.km$cluster # In what follows we will use a repackaged function that # picks K random rows for initialization. It is in source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/segs.R") # and its call is 'k.means(x, K=4)'. # # We'll look at the clusters in the first four principal component # coordinates computed earlier: windows(10,10); par(mgp=c(1.8,.5,0), mar=c(3,3,2,1)) xlim <- ylim <- range(mktg.pca$proj) pairs(mktg.pca$proj[,1:4], pch=16, cex=.2, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[mktg.km$cluster]) # Now this looks different from the original clustering: windows(10,10); par(mgp=c(1.8,.5,0), mar=c(3,3,2,1)) pairs(mktg.pca$proj[,1:4], pch=16, cex=.2, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[mktg[,9]]) # # Now let us compare a sequence of clusterings: par(mfrow=c(4,4), mar=c(3,3,1,1), mgp=c(1.8,.5,0)) for(i in 1:16) { clust <- k.means(scale(mktg[,-9]), K=4)$cluster # plot(mktg.pca$proj[,1:2], pch=16, cex=.2, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[clust]) } # There is a distraction in that the clusters are not matched in colors. # This can be helped: our 'k.means()' function can be given a clustering # vector, and the newly created solution will be immediately aligned # with the given clustering. This is done by giving 'k.means()' the # argument 'K=clustering.to.be.aligned.with'. In fact, the full name # of this argument is 'K.or.align'. If this argument is given, then # K is inferred from it. # # Here are the original clustering with 15 randomly initialized clusterings # aligned with the original clustering: par(mfrow=c(4,4), mar=c(3,3,1,1), mgp=c(1.8,.5,0)) plot(mktg.pca$proj[,1:2], pch=16, cex=.5, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[mktg[,9]]) for(i in 1:15) { # alignment below... clust <- k.means(dat, K=mktg[,9])$cluster plot(mktg.pca$proj[,1:2], pch=16, cex=.5, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[clust]) } # So there seems to exist a degree of dependence on the initialization! # Comparing the 16 plots, however, it appears that there are several # clusterings that look identical. So maybe there are only a few discrete # local solutions for the kmeans criterion? # # If the k-means algorithm finds different solutions, how do they # perform in terms of the k-means criterion? Let's collect a # list of 500 clusterings and analyze the criterion values: mktg.km.lst <- list() # takes a few minutes: <2sec/100 kmeans for(i in (length(mktg.km.lst)+1):500) { if(i%%100==0) cat(i,"") mktg.km.lst[[i]] <- k.means(dat, K=mktg[,9]) } # (Among many calls to kmeans() it can happen that one of them crashes; # rerun, and it will continue from where it crashed.) # # Peel off the criterion values: mktg.km.crit <- unlist(lapply(mktg.km.lst, function(x) sum(x$withinss)) ) # (Check out 'lapply()' and 'unlist()': spares you a loop) # Look at them: windows(10,10) hist(mktg.km.crit, breaks=1000, col="gray", xlim=c(9100,10000)) # Oh, there are discrete sets of values that appear many times over!! # In fact, there are only 54 different values of the criterion: length(unique(mktg.km.crit)) [1] 53 # Let's collect them: mktg.km.crit.uniq <- sort(unique(mktg.km.crit)) # and tabulate them: table(mktg.km.crit) 9240.92733734854 9313.91364263915 9313.91740353386 9313.97250010717 2624 1122 347 733 9314.0971039678 9319.76071481234 9319.8238068561 9319.91076889382 1364 1574 303 11 9336.89070097278 9336.90739225535 9336.9346563692 9370.29454375176 546 185 289 30 .... # Could the values correspond to different geometric patterns of the # cluster centers? How could we confirm this hunch? Let's plot # in a 4x4 matrix layout again, but for one particular value # of the criterion at a time. Execute the following repeatedly, # but pick different values from the vector 'mktg.km.crit.uniq': crit <- mktg.km.crit.uniq[10] # (try for 1, 2, 3, ...) par(mfrow=c(4,4), mar=c(3,3,1,1), mgp=c(1.8,.5,0)) xlim <- ylim <- range(mktg.pca$proj[,1:2]) for(i in 1:16) { clust <- mktg.km.lst[mktg.km.crit==crit][[i]]$cluster plot(mktg.pca$proj[,1:2], pch=16, cex=.5, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[clust]) } # Check whether the configurations for the same criterion value differ at all: crit <- mktg.km.crit.uniq[2] # (try for 1, 2, 3, ...) sel