################################################################ ################################################################ # # TREE-BASED REGRESSION: # # We make an excursion into the very opposite of smoothing: # # CART -- Classification and Regression Trees # # It might as well be called "roughing". # In what follows we consider the regression version only. # Tree-based regression is simpler than multiple linear regression, # but more unusual for those who grew up with the latter. # # The idea: # # When data are heterogeneous, clustered, messy, # it might not be the best idea to fit smooth functions; # in particular, it might not be a good idea to fit linear functions, # as is done in linear regression. # Instead, one might consider breaking the data up into # subsets that are homogeneous. # Once we have homogeneous subsets, we might do the simplest possible thing: # fit a constant, that is, a mean. # Therefore, tree-based regression does local averaging, like smoothers, # but this time on non-overlapping subsets. # # Q: How do we arrive at homogeneous subsets? # A: By repeatedly breaking the data up along axis splits. # # But how would we choose such axis splits? # # DATA EXAMPLE: the Boston Housing data # Source: Harrison and Rubinfeld (1978) # ``Hedonic prices and the demand for clean air'' # Journal of Environmental Economics and Management # Vol. 5, pp. 81-102. # # Cases: 506 census tracts around Boston. # # Variable description # ---------------------------------------------------------------------------- # CRIM crime rate # ZN prop. of residential land zoned for lots over 25,000 sq. ft # INDUS prop. of non-retail business acres # CHAS Charles River dummy variable (=1 if tract bounds river; 0 else) # NOX nitric oxides concentration, pphm # RM average number of rooms per dwelling # AGE proportion of owner-occupied units built prior to 1940 # DIS weighted distances to five Boston employment centers # RAD index of accessibility to radial highways # TAX full-value property tax rate per $10,000 # PTRATIO pupil teacher ratio # B 1000*(Bk-0.63)^2 where Bk is the proportion of blacks # LSTAT percent lower status population # MEDV median value of owner occupied homes in $1000's # ---------------------------------------------------------------------------- # MEDV is the response, all others are predictors. # NOX is the predictor of main interest: proxy for air pollution # Download the data from the instructor's website: url.buja <- "http://stat.wharton.upenn.edu/~buja/STAT-541/" boston <- read.table(paste(url.buja,"boston.dat",sep=""), header=T) rownames(boston) <- paste(scan(paste(url.buja,"boston.geography",sep=""),w=""), 1:nrow(boston) ) dim(boston) # Size of the data frame names(boston) # Variable names # Function for scatterplot matrices: source(paste(url.buja,"pairs-plus.R",sep="")) pairs.plus(boston, pch=16, cex=.1, gap=0) # Scatterplot matrix summary(lm(MEDV ~ ., data=boston)) # Linear model # Back to splitting variables: # In the following plot, where would we reasonably place a split? plot(boston[,"LSTAT"], boston[,"MEDV"], pch=16, xlab="LSTAT", ylab="MEDV", cex=.8) # Here is a handchosen split at LSTAT=10: spl <- 10 sel.left <- boston[,"LSTAT"] < spl sel.right <- boston[,"LSTAT"] >= spl plot(boston[,"LSTAT"], boston[,"MEDV"], xlab="LSTAT", ylab="MEDV", col=c("red","green")[sel.left+1], pch=16, cex=.8) # We said we will fit only the simplest possible model, a # mean in each subset: lines(c(spl,spl), c(0,50)) lines(c(0,spl), rep(mean(boston[sel.left,"MEDV"]), 2), lwd=2) lines(c(spl,40), rep(mean(boston[sel.right,"MEDV"]), 2), lwd=2) # # How do we formalize a rule for selecting the axis splits? # That's the big question. # Here is a recipe that is used most often: # # For a given split on a variable "xj" (="LSTAT" in the plot) # and threshold "th" (=10 in the plot), # evaluate a criterion, such as the residual sum of squares, # after fitting a separate mean left and right of the threshold: sum((boston[sel.left,"MEDV"] - mean(boston[sel.left,"MEDV"]))^2) + sum((boston[sel.right,"MEDV"] - mean(boston[sel.right,"MEDV"]))^2) # which should be the same as var(boston[sel.left,"MEDV"]) * (sum(sel.left)-1) + var(boston[sel.right,"MEDV"]) * (sum(sel.right)-1) # Indeed! # # Then: # For all possible splits (505 for "boston") # of all x-variables ( 13 for "boston"), # evaluate the criterion, # and pick the split that has the lowest value of the criterion: lstat.vals <- sort(unique(boston[,"LSTAT"])) split.vals <- (lstat.vals[-1] + lstat.vals[-length(lstat.vals)])/2 crit.vals <- rep(NA, length(split.vals)) for(i in 1:(length(split.vals))) { spl <- split.vals[i] sel.left <- boston[,"LSTAT"] < spl; sel.right <- !sel.left crit.vals[i] <- sum((boston[sel.left,"MEDV"] - mean(boston[sel.left,"MEDV"]))^2) + sum((boston[sel.right,"MEDV"] - mean(boston[sel.right,"MEDV"]))^2) } plot(split.vals, crit.vals, pch=16, cex=.2) # Look at that: spl=10 was near the minimum of the criterion values! # Here is how you get the minimum location: split.vals[crit.vals==min(crit.vals)] [1] 9.725 # # Then: # Repeat on both sides of the split. # repeat on and on and on..., recursively. # Stop when the subsets get too small. # # There are many complications to the idea, but let's leave things simple. # We are going to use trees not for prediction but for interpretation. # [Trees for prediction would need sophisticated cross-validation. # See the book by Breiman, Friedman, Olshen, Stone (1982).] # There exists an implementation of tree-based regression and classification # authored by Clark and Pregibon, documented in the "White Book", # "Statistical Models in S", edited by Chambers and Hastie. # You may have to download and install it: # * To this end, make sure you are connected to the internet! # * In the Windows version of R you can do that with the pull-down # menu "packages". # * In the Linux and Emacs version of R you can do it with the # following function: install.packages("tree") # (Or you can use the GUI: Packages > Install Package(s) > ...) # In every session before using a library function you have to call library(tree) # # We will use another implementation, though, from the PhD thesis # of a student of mine. The implementation is slow because # it is all written in R, not C or Fortran, # but it has some unique ways of constructing trees that are # often more interpretable than standard trees. # The trick is in the choice of the splitting criterion. # [If you are curious about details, you can download # the Ph.D. thesis paper from the class web page.] # # Download first the file "trees.R" from the class web page # and source it: source(paste(url.buja,"trees.R",sep="")) # # Here is the conventional "CART" method: boston.CART <- regression.tree(boston, method="CART", minsize=23, title="Boston Housing Data") show.tree(boston.CART, type="u", cex=.8) # [Stretch this window to avoid much of the overplotting.] # # The tree looks interesting and we should stare at it for a while # to get a sense of what it might tell us: # The first split is on RM, the average number of rooms: # 85% with RM < 6.94, 15% with RM > 6.94 # [small/normal homes] [large homes] # The 85% of smaller homes is split on LSTAT, # 51% with LSTAT < 14.55, 15% with LSTAT > 14.55 # [^^few poor people^^^] [^^more poor people^^] # The 15% of larger homes is split on RM again, # 9.1% with RM < 7.42, 5.9% with RM > 7.42 # [medium large homes] [largest homes] # This makes some sense: # among large homes, size drives the price; # among small/normal homes, social environment drives the price. # The variable of interest was really NOX, but it is only of # low importance in the tree. # (Martin commented on the statistical stability of interpreting # splits and buckets of trees: they cause a "massive simultaneous # inference problem." True! Here two spontaneous responses: # # - You can always "shake up the trees" by running them on # bootstrap samples and comparing the plotted trees. # Interpret only those splits that are roughly the same in # all bootstrap samples. for(i in 1:5) { boston.CART <- regression.tree(boston[sample(1:nrow(boston), replace=T),], method="CART", minsize=23, title="Boston Housing Data") show.tree(boston.CART, type="u", cex=.8) } # # - Another "inferential hack": Add some "junk predictors", # and interpret only splits that are consistently above # splits on junk predictors: nj <- 10 # Number of junk predictors added for(i in 1:5) { boston.w.junk <- cbind(junk=matrix(rnorm(506*nj),ncol=nj),boston) boston.CART <- regression.tree(boston.w.junk, method="CART", minsize=23, title="Boston Housing Data") show.tree(boston.CART, type="u", cex=.8) } # Uhuh! Apparently the bucket three down on the left then right, # LSTAT>=9.71, should not be split because junk is used occasionally. # # Let's move on to one of our new methods and commit a heresy: # Traditional tree-growing uses criteria that compromise # between left and right buckets of splits: # |L| * Crit(L) + |R| * Crit(R) # where, e.g., Crit(B) = Var(y|B) # Our heresy is to take a data mining attitude and say: # "We are happy if one of the two buckets, L OR R, look interesting. # In other words, we don't want a compromise; # instead, we want the better of L and R." # As a consequence, we would use something along these lines: # min(Crit(L), Crit(R)) # This was heresy #1. Heresy #2 is to abandon the idea of "impurity". # This stems from the question of what we are really interested in: # Good fit, or extreme response values? # In data mining applications, one would rather seek extremes: # high returns, extreme spenders among customers, # very diabetic patients, highly likely work-at-home residences,... # As a consequence, we abandon things such as variances and # go straight for means: Crit(B) = mean(y|B) # The final proposal is, therefore: # Search for splits where there is on one side an # extremely low (or high) mean. # The proposal has two incantations, not equivalent: # Seeking very low means versus Seeking very high means. # # Let's look at the bostong housing data to see how this works out: # # The first version tries to repeatedly peel off subsets that have # HIGH response values: boston.XM <- regression.tree(boston, method="XM", minsize=23, title="Boston Housing Data") show.tree(boston.XM, type="u", cex=.7) # The story becomes clearer: LSTAT is the most important variable, but # to predict where the highest prices are, one has to look at RM. # And here is the opposite: repeatedly peel off subsets with # LOW response values: boston.NM <- regression.tree(boston, method="NM", minsize=23, title="Boston Housing Data") show.tree(boston.NM, type="u", cex=.7) # Another story emerges, complementary to the first story, and # more complete, more intuitive!! # In order to ascend housing prices, from lowest to highest, # peel off high crime, # then racially segregated areas, # then descend down the % of lower status people, # then climb up the number of rooms. # Among the three trees we constructed, this one makes the most sense. # ################ # # ANOTHER DATA EXAMPLE: the Swiss fertility data # 47 French speaking provinces of Switzerland around 1888, # from the era of industrialization. # # Fertility - standardized fertility measure for each province # Agriculture - population involved in agriculture # Examination - draftees receiving highest mark in army exam # Education - population educated beyond primary school # Catholic - population who are catholic # Infant.Mort - live births living less than 1 year # The interest is to relate Fertility (the response) to the other # variables (the predictors). # The hypothesis is that the predictors are good proxies for the true # causes of high and low fertility. # Note our cautious formulation: these variables are probably not # direct causes of fertility but part of a cultural mix that overall # determines fertility rates. data(swiss) # The dataset comes with R options(width=100) # I like a longer line for a wide window. dim(swiss) # The size of the data names(swiss) # The variables rownames(swiss) # The provinces swiss # A small dataset, so we can eye-ball it. # For my own interest, I wanted to know the names of some of these # provinces, especially those that are high in Education and # Fertility. # To this end, we obtain an index vector "ord" with the "order" # function to sort the data, first according to Education, then # Fertility: # - High Education: ord <- rev(order(swiss[,"Education"])) rownames(swiss)[ord][1:7] # 7 highest on education # ...includes Geneva, Neuchatel, Lausanne, Vevey, which are "larger" # cities in French speaking Switzerland. I believe Rive Gauche and # Rive Droite are suburbs of Geneva (left and right of the river # Rhone), but I'm not sure. # - High Fertility: ord <- rev(order(swiss[,"Fertility"])) rownames(swiss)[ord][1:10] # 10 highest on fertility # ...names that seem rural or alpine to me (I forgot a lot about # Switzerland. Look at a map of Switzerland.) # - Low Fertility: ord <- order(swiss[,"Fertility"]) rownames(swiss)[ord][1:10] # ...includes the larger cities. # Now some basic plots: response (Fertility versus predictors): windows(width=25, height=5) par(pch=16, mfrow=c(1,5), mar=c(2,2,1.5,0), mgp=c(1.5,.5,0), cex=1, cex.axis=.5, cex.main=.6) for(i in 2:6) plot(swiss[,i], swiss[,1], main=colnames(swiss)[i], xlab="", ylab="", cex=.5) # If these marginal plots are any indication, we can expect # Education and its proxy, Examination, to be negatively correlated # with Fertility. # On the other hand, Agriculture and Infant.Mort seem to be positively # correlated; which may make sense because these are generally # indicators of a low developmental state ("backwardness"). An # interesting connection exists with Catholic: highly catholic # provinces have higher fertililty than those that are highly # protestant, with the exception of three provinces that are highly # mixed, and probably more cosmopolitan: they are Geneva, Rive Gauche # and Rive Droite: windows() plot(swiss[,"Catholic"], swiss[,"Fertility"], pch=16, xlim=c(-20,120)) for(i in 1:20) identify(swiss[,c("Catholic","Fertility")], labels=rownames(swiss)) # Click the three bottom points in the plot... # # These marginal findings call for an investigation of # "collinearities", that is, dependencies among the predictors to # answer questions of whether Examination and Education are indeed # proxy for each other, for example. The easiest approach is with a # scatterplot matrix: pairs.plus(swiss, pch=16, cex.labels=1.5, gap=.1) # Right away we see that Examination and Education are indeed # positively correlated, that Agriculture negatively correlates with # both, although more so with Examination, and that Infant.Mort does # not correlate with anything very much, except for a positive # correlation with Catholic. # Finally, here is an application of the "tree()": swiss.Scart <- tree(Fertility ~ ., data=swiss, control=tree.control(47, mincut=3, minsize=6)) show.tree(swiss.Scart, type="u", cex=.8) # Observations: # - The top split breaks out a group of 7 provinces (=14.9% of 47) with # high education levels and low fertility. # - In the low Education bucket, the tree splits on Catholic with the # expected direction: the high Catholic bucket has higher fertility. # - The next level shows splits on Infant.Mort on both sides, with # the expected direction: higher Infant.Mort, higher Fertility. # (This latter relation may not be a causal left to right, # equally likely the opposite: right to left, or both ways.) # - The last split on Agriculture is not very believable as it has the # unexpected direction: higher Agriculture, lower Fertility; # But then the difference is the smallest of any split in the tree, # which adds evidence that the split is probably not "real". # Overall, though, the standard tree function gave us in this case a # very interpretable tree. This may be due to the small size of the # dataset. # Our own CART routine does not exactly reproduce the tree generated # by the standard library (there are differences in the code): swiss.CART <- regression.tree(swiss[,c(2:5,1)], method="CART", minsize=5, title="Swiss Fertility Data") show.tree(swiss.CART, type="u", cex=1) # The tree is interesting because it seems to suggest that # Agriculture does in fact have an unexpected correlation with # fertility after adjustment for Education and Catholic! # Now we use another option to look for high Fertility buckets: swiss.XM <- regression.tree(swiss[,c(2:5,1)], method="XM", minsize=5, title="Swiss Fertility Data") show.tree(swiss.XM, type="u", cex=1) # We see that for finding high Fertility, one should look for # low Examination above all and high Catholic next. # Recall that Examination is an education proxy, so it is surprising # that deeper down Education also appears. # Overall we learn that for high Fertility we should look for the # lowest levels of the education proxies Examination and Education. # Next we look for low fertility: swiss.NM <- regression.tree(swiss[,c(2:5,1)], method="NM", minsize=5, title="Swiss Fertility Data") show.tree(swiss.NM, type="u", cex=1) row.names(swiss)[swiss[,"Education"] > 23.2] # Again, at the top a high Education bucket is broken off, # followed by a high Examination bucket, # followed by two buckets with low Catholic. # # Interesting: # There are two splits on Agriculture, once again in the unexpected # direction: high Agriculture - low Fertility. # Do we have to conclude that all things being equa (educ., religion), # agriculture in 1888 was such a taxing environment that it # depressed fertility? # Note that we say "all things being equal", which means: # "after adjusting for other variables", # or equivalently for trees: # "after forming buckets with small variability in the other # variables". # Let's use a linear model fit to see whether the overall qualitative # behaviors are confirmed, in particular that of Agriculture: summary(lm(Fertility ~ ., data=swiss)) # (We'll say more about the linear model function "lm" later.) #------------------------ output begin ... Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 66.91518 10.70604 6.250 1.91e-07 *** Agriculture -0.17211 0.07030 -2.448 0.01873 * Examination -0.25801 0.25388 -1.016 0.31546 Education -0.87094 0.18303 -4.758 2.43e-05 *** Catholic 0.10412 0.03526 2.953 0.00519 ** Infant.Mortality 1.07705 0.38172 2.822 0.00734 ** ... #------------------------ output end # Yes, Agriculture has a significant negative slope! # # Note another feature of the linear model, though: # Examination is insignificant! # After looking at the trees and the plots we know better than that. # Let us remind ourselves of a couple of plots: windows(width=7, height=14) par(mfrow=c(2,1), mgp=c(1.5,.5,0), mar=c(2.5,2.5,1,1)) plot(swiss[,c("Examination","Fertility")], pch=16) plot(swiss[,c("Education","Fertility")], pch=16) # Education comes in so strong because of the 7 high Education # provinces. By comparison, Examination has a weaker but more robust # correlation with Fertility: no 7 provinces would account for the # correlation. ################ # # Back to the slides for class prob trees ################################################################ ################################################################