# ================================== STAT 926 NOTES ===================================

## Mechanics of this script:
## - Lots of R code, for you to replicate and cannibalize.
## - For syntax highlighting, the file has extension '.R'.
## - The instructor uses the Emacs editor, in which he runs R,
##   for ease of code execution.
##   You can use any environment of your choice.
##   Let the instructor know what the latest in environments is.  RStudio?
## - Much of the explanatory text is in R comments.
## - This is an evolving document.
##   Feel free to get the initial 2012 version from your older colleagues,
##   but there is no guarantee that this course will be identical
##   to 2012.  In this second run of the course, the instructor
##   hopes to cover more material than in 2012.
## - Plan: We start with some R basics,
##   then proceed with an overview of multivariate data visualization,
##   and finally we'll spend much of the semester with
##   concepts of multivariate analysis,
##   idiosyncratically selected by the instructor.
##   Suggestions for topics are welcome.

#================================================================

# R SHORTCUTS, PACKAGES, LIBRARIES AND INSTRUCTOR'S FUNCTIONS:

# Sheer laziness:
len <- length
su  <- function(x) sort(unique(x))
cn  <- colnames
rn  <- rownames
# Caution! Do you think this will work?
mat <- cbind(1:3,4:6);  cn(mat) <- c("A","B")

repos <- "http://cran.us.r-project.org/"      # For installing packages
install.packages("vcd", repos=repos)
install.packages("YaleTookit", repos=repos)   # Ooops, not available

source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/tab-all.R")
source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/mosaic.R")
source("http://stat.wharton.upenn.edu/~buja/STAT-541/plot-plus.R")
source("http://stat.wharton.upenn.edu/~buja/STAT-541/pairs-plus.R")

.First <- function() {      # Gets called on startup of R, ...
  library(MASS)             #
  library(lattice)          # ... so these libraries are ready to use.
  library(vcd)              # But you can do many other things in .First as well.
  options(width=200)
}
save.image()   # So the functions are available next time you start up R.

# Examine what packages you got:
installed.packages()

# Available packages:
pckgs.all <- available.packages("http://cran.r-project.org/bin/windows/contrib/3.2")
str(pckgs.all)   # 2012/09/11: 3956 packages;  2013/09/04: 4723;  2014/09/03: ...
grep("ace", rownames(pckgs.all), val=T)    # If you remember part but not all of a name.
grep("norm", rownames(pckgs.all), val=T)   # If you remember part but not all of a name.
install.packages("mnormt", repos=repos)
install.packages("mvtnorm", repos=repos)
install.packages("acepack", repos=repos)

# Examine what datasets you got:
data()   # Note some datasets come with packages.

# PS: Interesting recent NYTimes article
#     "For Big-Data Scientists, 'Janitor Work' Is Key Hurdle to Insights"
#     http://www.nytimes.com/2014/08/18/technology/for-big-data-scientists-hurdle-to-insights-is-janitor-work.html

#----------------------------------------------------------------

# TOY DATA:

# Laser data (courtesy of P.
Tukey, Bellcore 1987): laser <- read.table("http://www-stat.wharton.upenn.edu/~buja/DATA/laser.dat", header=T) str(laser) # laser data: 2 inputs (current 1 and 2), 2 outputs (energy, wavelength) tab.all(laser) # FTSE stock data, 1991-05-13 till 2006-06-11 (courtesy of Kartik Ghia): ftse <- read.csv("http://stat.wharton.upenn.edu/~buja/DATA/FTSE-financials.csv") str(ftse) # Autism phenotype datasets: not on webpage autism.cdv <- read.csv("http://stat.wharton.upenn.edu/~buja/DATA/proband_core_descriptive_variables.csv") str(autism.cdv) # "cdv" = "core descriptive variables" autism.cuv <- read.csv("http://stat.wharton.upenn.edu/~buja/DATA/proband_commonly_used_variables.csv") str(autism.cuv) # "cuv" = "commonly used variables" # Italian olive oil data (from various R packages): olive <- read.csv("http://www-stat.wharton.upenn.edu/~buja/DATA/olive.csv") str(olive) # olives: 8 fatty acid contents, 3 regions, 9 subregions of Italy scan("http://rss.acs.unt.edu/Rdoc/library/classifly/html/olives.html", w="", sep="\n") # docu # Places Rated data (1980s): places <- read.table("http://www-stat.wharton.upenn.edu/~buja/DATA/places.dat") scan("http://stat.wharton.upenn.edu/~buja/DATA/places.README", w="", sep="\n") # docu str(places) # Boston Housing data (Belseley-Kuh-Welsch, Regression Diagnostics): data(Boston); help(Boston) Boston <- read.table("http://www-stat.wharton.upenn.edu/~buja/DATA/boston.dat", header=T) # Geography of the census tracts: rownames(Boston) <- paste(scan("http://www-stat.wharton.upenn.edu/~buja/DATA/boston.geography", w=""), 1:nrow(Boston)) # Iris data: NOT Fisher's iris data", rather: Edgar Anderson's iris data, as reanalyzed by R.A. Fisher data(iris); help(iris) # as a dataframe data(iris3) # as a 3-d array save.image() #================================================================ BASIC R GRAPHICS: * Fundamental plots: - Univariate: . quantitative: histogram density plot (violin plots) boxplot (one box) . categorical: barplot/chart dotplot pie chart - Bivariate: . both quantitative: scatterplot . both categorical: divided or side-by-side barplot/chart mosaic plot . mixed, case 1: quantitative Y conditional on categorical X boxplot (one box per category) . mixed, case 2: categorical conditional on quantitative ???(running conditional proportions?) HW: Is there a convincing solution to the last case in the lit? - Trivariate: . heatmaps (2 X forming a rectangular grid, 1Y) . contour plots (2X, 1Y) (shows a 2-D smooth, really) . glyph (plotting character/symbol) coding of Y, plot X1,X2 . 3D rotations (is in JMP, e.g.) - Multivariate: . scatterplot matrices ('splom') . mosaic plots (multivariate) . parallel coordinate plots . lattice/trellis displays, coplots, conditional plots ==> Data visualization (as opposed to ... medical 3D vis, network vis, ...) * Adding to fundamental plots: lines() points() abline() rect() polygon() text() mtext() Extreme case of building up your own plot: plot(..., type="n") # Set up a plot panel with axes but plot nothing Then use the above functions to build up your plot. * Making up your own axis: plot(..., xaxt="n") # No x-axis axis(..., side=1) # Fill the arguments for tic locations and labels #---------------------------------------------------------------- # DEFAULTS: Much of R plots waste space with too much margin # ==> Move the axes labels closer and narrow the margin. par(mgp=c(1.8,.5,0), mar=c(3,3,2,1)) # Call everytime you create a new window !!!!!! 
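# An added minimal sketch (not from the original notes) that combines the margin defaults
# above with the 'build up your own plot' idea from the outline: set up an empty panel with
# type="n", suppress the x-axis with xaxt="n", draw decorations first, add the points,
# then hand-make the axis.  It uses the iris data loaded above; the tic locations from
# pretty() are an ad-hoc choice.
par(mgp=c(1.8,.5,0), mar=c(3,3,2,1))                 # tighter margins, as recommended
plot(iris[,"Petal.Length"], iris[,"Petal.Width"],
     type="n", xaxt="n", xlab="Petal Length", ylab="Petal Width")  # axes set up, nothing plotted
abline(h=pretty(iris[,"Petal.Width"]), col="gray")   # decorations first, so they sit behind
points(iris[,"Petal.Length"], iris[,"Petal.Width"], pch=16)        # main content on top
axis(side=1, at=pretty(iris[,"Petal.Length"]))       # hand-made x-axis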
#---------------------------------------------------------------- # TIPS DATA: # Tips data (Di Cook, Iowa Stat U): # http://stat.wharton.upenn.edu/~buja/STAT-541/tips.html tips <- read.table("http://www-stat.wharton.upenn.edu/~buja/DATA/tips.dat", header=T) # First glance: str(tips) tab.all(tips) # Draw conclusions from the displays below. The data are quite neat! # There is a 'theory of social behavior' regarding tipping that we all know. #---- 1D PLOTS # HISTOGRAMS: A curiosity about moneys: there are preferred values hist(tips[,"TIP"], col="gray") hist(tips[,"TIP"], col="gray", breaks=10000, xlab="Tip", main="Tips Data") # ==> ALWAYS make a histogram with narrow bins as well (breaks=10000) !!!! # You will see the values and their ties!!! # Out of curiosity: Is there a rounding effect in total bill? hist(tips[,"TOTBILL"], col="gray", breaks=10000, xlab="TotBill", main="Tips Data") # What about total paid? hist(tips[,"TOTBILL"]+tips[,"TIP"], col="gray", breaks=10000, xlab="Tot+Tip", main="Tips Data") abline(v=c(10,15,20,25,30,35)+.1, col="red") # Add a dime so as not to overplot... # Histogram function in 'lattice': histogram(tips[,"TIP"]) # Less wasteful margins than 'hist()', no need for par(mgp=..., mar=...) histogram(~ TOTBILL, data=tips) # Better labeling with formula language histogram(~ TOTBILL, data=tips, main="Tips Data") # Title histogram(~ TOTBILL, data=tips, main="Tips Data", breaks=1000) histogram(~ I(TOTBILL+TIP), data=tips, main="Tips Data", breaks=1000, panel=function(...) { panel.abline(v=c(10,15,20,25,30,35), col="red", lwd=3, alpha=.2) panel.histogram(...) } ) # ==> Panel function permits drawing of abline decorations before main content. # It also has an alpha-blending parameter (beween 0 and 1) for transparency. # 'Drawback': May seem more cumbersome to add to a plot; # However: You will see the necessity below for conditional plots/coplots/trellis plots. # OTHER 1D DISPLAYS, just so you know: bwplot(~ TIP, data=tips) # A single-box 'boxplot' or 'box-and-whiskers plot' densityplot(~ TIP, data=tips) # Kernel density plot densityplot(~ TIP, data=tips, plot.points="rug") # Aesthetics: Rug instead of jittered points densityplot(~ TIP, data=tips, plot.points=F, ref=T) # Aesthetics: No points but reference line bwplot(~ TIP, data=tips, panel=panel.violin) # Variation on density plots: violin plots # NOTE ON HISTOGRAM AND DENSITY PLOTS: # For theoreticians the choice of 'bin width'/'bandwidth' in density estimation is a big deal. # In practice you should play with many choices, in particular very small widths; # they might show different aspects of the data. #---- 2D PLOTS # SCATTERPLOT: Several ways of plotting two variables of primary interest plot(tips[,"TOTBILL"], tips[,"TIP"], xlab="Total Bill", ylab="Tip", pch=16) plot(tips[,c("TOTBILL","TIP")], pch=16) plot(TIP ~ TOTBILL, data=tips, pch=16) # model formula: Y ~ X abline(a=0, b=.15, lwd=2, col="blue") # Can add a 15% line to any of the above abline(h=seq(.5,6,by=.5), col="gray") xyplot(TIP ~ TOTBILL, data=tips) # library(lattice) xyplot(TIP ~ TOTBILL, data=tips, pch=16, col="black") # library(lattice) # Critique of the previous three lines: The decoration lines overplot the main info, the dots. # This can be most easily corrected in the 'lattice' version: xyplot(TIP ~ TOTBILL, data=tips, pch=16, col="black", panel=function(...) { panel.abline(h=seq(.5,6,by=.5), col="gray") panel.abline(a=0, b=.15, lwd=2, col="blue") panel.xyplot(...) } ) # Important concept in 'lattice': the panel function! 
# It can assume that something like coordinates and axes have been provided. # 2 variables, X categorical ('is.factor'), Y quantiative ('is.numeric'): # COMPARISON BOXPLOT plot(x=factor(tips[,"SEX"]), y=tips[,"TIP"], xlab="SEX", ylab="TIP") # 'base' plot(TIP ~ factor(SEX), data=tips) # 'base' with formulas xyplot(TIP ~ factor(SEX), data=tips) # 'lattice': nope, just a scatterplot bwplot(TIP ~ factor(SEX), data=tips) # Comparison boxplot bwplot(TIP ~ factor(SEX), data=tips, panel=panel.violin) # Comparison boxplot # Side note on 'generic functions': # - plot() in 'base' is a 'generic function'; # that is, it is really many functions ('methods') # and which are called depending on the type fo the arguments. # ==> Essential feature of 'object-oriented programming' ('OO'). # - You can find out what methods for a generic function exist # by asking with 'methods()': methods(plot) # Another generic function: print(), methods(print) #---- COPLOTS (HIGH-D SECTIONS) # COPLOTS/CONDITIONAL PLOTS/LATTICE PLOTS/TRELLIS PLOTS: # condition on the categories of one or more factors # We will condition on SEX and SMOKER: first the hard way with 'base' graphics ... windows(width=8, height=8) # Aesthetics of aspect ratios: roughly squares par(mfrow=c(2,2), mgp=c(1.5,0.5,0), mar=c(3,3,2,1)) # Aesthetics of axes & /margins for(sex in 0:1) for(smo in 0:1) { sel <- tips[,"SEX"]==sex & tips[,"SMOKER"]==smo plot(tips[sel,"TOTBILL"], tips[sel,"TIP"], xlim=c(0,55), ylim=c(0,10), # Force round numbers on the ranges xlab="TOTBILL", ylab="TIP", # Just to show you can overrule var labels main=paste(c("Male,","Female,")[sex+1], # Choose main titles for the frames. c("Non-Smokers","Smokers")[smo+1]), type="n") # Plot nothing, only set up the panel. abline(a=0, b=.15, col="blue") # The 15% tip line; quibble: it overplots... points(tips[sel,"TOTBILL"], tips[sel,"TIP"], # Plot main contain AFTER the line! pch=20, cex=1.2) # Aesthetics of plotting glyphs/symbols } # A high-level function for the same (in 'base'): coplot(TIP ~ TOTBILL | factor(SMOKER)*factor(SEX), data=tips, pch=16) coplot(TIP ~ TOTBILL | factor(SMOKER)*factor(SEX), data=tips, pch=16, panel=function(...) { abline(a=0, b=.15, col="gray"); points(...) }) # Finally coplots in 'lattice': xyplot(TIP ~ TOTBILL | SMOKER*SEX, data=tips, pch=16) # library(lattice) xyplot(TIP ~ TOTBILL | SMOKER*SEX, data=tips, pch=16, col="black", panel=function(...) { panel.abline(a=0,b=.15, col="gray"); panel.xyplot(...) } ) # Sorry, this calls for better labeling, which is achieved # by using factors instead of numeric dummy variables: tips.df <- tips # Is a dataframe tips.df[,"SEX"] <- factor(tips[,"SEX"], levels=c(0,1), labels=c("Male","Female")) tips.df[,"SMOKER"] <- factor(tips[,"SMOKER"], levels=c(0,1), labels=c("Non","Smoker")) tips.df[,"DAY"] <- factor(tips[,"DAY"], levels=3:6, labels=c("Wed","Thu","Fri","Sat")) coplot(TIP ~ TOTBILL | SMOKER*SEX, data=tips.df, pch=16) xyplot(TIP ~ TOTBILL | SMOKER*SEX, data=tips.df, pch=16, col="black", panel=function(...) { panel.abline(a=0,b=.15, col="gray"); panel.xyplot(...) } ) # Condition even more: layout problems # In the following first version multiple pages are drawn, which disappear till the last. xyplot(TIP ~ TOTBILL | SMOKER*SEX*DAY, data=tips.df, pch=16, col="black", panel=function(...) { panel.abline(a=0,b=.15, col="gray"); panel.xyplot(...) 
} ) # We need to squeeze the plot into one page with 'layout' and make smaller labels with 'cex' xyplot(TIP ~ TOTBILL | SMOKER*SEX*DAY, data=tips.df, pch=16, col="black", layout=c(4,4), panel=function(...) { panel.abline(a=0,b=.15, col="gray"); panel.xyplot(...) } ) xyplot(TIP ~ TOTBILL | SMOKER*SEX*DAY, data=tips.df, pch=16, col="black", layout=c(4,4), par.strip.text = list(cex = 0.75), aspect=1, panel=function(...) { panel.abline(a=0,b=.15, col="gray"); panel.xyplot(...) } ) # Aesthetics: # - 'layout=c(4,4)' produces 4 columns by 4 rows, filled row-major. # - 'par.strip.text' has parameters for sizing the label strips (see help for more). # - 'aspect=1' produces an aspect ratio of the panels so slopes are on average +1. # ==> Learn about Cleveland's 'banking rule' !!!! # Displays should have aspect ratios that show the slopes of interest # at roughly 45 degrees. # Conditioning can be applied to any of the 'lattice' plot types: bwplot(TIP ~ factor(DAY) | SEX, data=tips) # Comparison boxplot for SEX bwplot(TIP ~ factor(DAY) | SEX, data=tips, aspect=1) # Better aesthetics? Maybe not; why? bwplot(TIP ~ factor(DAY) | SEX*SMOKER, data=tips.df, main="TIP conditioned on DAY") # CONDITIONING CREATES HIGH-DIMENSIONAL DISPLAYS!!! # What is the dimensionality of the last coplot as a whole, # that is, taking all four panels together? #---- HIGH-D PROJECTIONS # SCATTERPLOT MATRICES/SPLOMS: pairs(tips) # Hm... plot(tips) # Same: 'tips' is a dataframe, so plot() => plot.data.frame() splom(tips) # 'lattice': messy, would need to learn sizing labels pairs.plus(tips) # Instructor's version with automatic jitters and more space around point pairs.plus(tips, jitter=0.1, gap=.5) # (To show how to control jitter and gap) # Use color: tips.col <- c("black","red","blue","green")[2*tips[,"SEX"]+tips[,"SMOKER"]+1] # ==> black = male non smoker # red = male smoker # blue = female non smoker # green = female smoker pairs.plus(tips.df, gap=.5, col=tips.col, jit=.2) # Add color # For sploms, create a new version with jittered discrete variables 3:6: tips.jit <- sapply(1:ncol(tips), function(j) if(j<3) tips[,j] else jitter(tips[,j], factor=.5)) colnames(tips.jit) <- colnames(tips) splom(tips.jit, pch=16, col=tips.col, cex=.5, alpha=.5) # Thanks, Josh! # Aesthetics of labels and axes: splom(tips.jit, pch=16, col=tips.col, varname.cex=.8, axis.text.cex=.5, axis.line.tck=.5, pscales=4) # Coplot version: splom(~cbind(tips[,c("TOTBILL","TIP")],DAY=jitter(tips[,"DAY"])) | SEX*SMOKER, data=tips.df) splom(~cbind(tips[,c("TOTBILL","TIP")],DAY=jitter(tips[,"DAY"])) | SEX*SMOKER, data=tips.df, pch=16, varname.cex=.8, axis.text.cex=.5, axis.line.tck=.5, pscales=4, xlab="") # Aethetics! # Q: How many data dimensions are you 'looking at' in each display? # PARALLEL COORDINATE PLOTS: par(mfrow=c(1,1)) parcoord(tips) # 'MASS' parallelplot(tips) # 'lattice', random colors by default parallelplot(tips, horizontal.axis=F) # left-to-right parallelplot(tips.jit, horizontal.axis=F, col=tips.col) # meaningful colors # Coplot version enabled by 'lattice': parallelplot(~ tips.jit[,-c(3,4)] | SEX*SMOKER, data=tips.df, horizontal.axis=F, varname.cex=.8, groups=DAY) # Selects default colors for 'groups' parallelplot(~ tips.jit[,-c(3,4)] | SEX*SMOKER, data=tips.df, horizontal.axis=F, varname.cex=.8, groups=DAY, col=c("red","purple","blue","green")) # Choose your own colors for groups. # COLORS in 'lattice': # - This library tries to provide good default colors. # You may or may not agree. 
# - Colors for cases are conveyed by using the argument # 'groups=...". Default colors will be chosen. # - If you want to use your own colors, use in addition the argument # 'col=...". Its values will be matched to 'groups=...'. # - If you are NOT CONDITIONING, then just using 'col=...' will work # just like in 'base' graphics. # In 'lattice' IT WILL NOT WORK PROPERLY IF YOU CONDITION!!!!!! # - Unfortunately 'lattice' allows only one grouping variable # in 'groups=...', so if you intend to color according to # SEX*SMOKER, you have to create this variable first. #---------------------------------------------------------------- # NOTE ON AESTHETICS: # - The instructor's principle is that the main info should stick out # the most, which is why he forces the glyphs to be black and solid. # - 'lattice' graphics force us to add plot elements in the panel function. # This may seem like a nuisance but has the advantage that # we can plot decorations such as ablines before the main plot. # The reason is that 'lattice' sets up a plotting panel before # the panel function is called, hence axes have been created. # - Meta-principles regarding aesthetics: # . You should rarely be satisfied with the defaults of plotting software. # . You don't have to have the instructor's aesthetics, # but do have SOME aesthetics, that is, principles by which you plot. # It means that you can answer questions as to why the details of # a plot look the way they look. #---------------------------------------------------------------- # HISTOGRAMS: CORPORATE EARNINGS DATA -- NARROW INS # When I taught this course in Spring of 2004, a student in # accounting, Joseph Gerakos, showed us this article: # D. Burgstahler, I. Dichev, # "Earnings management to avoid earnings decreases and losses," # Journal of Accounting and Economics 24 (1997) 99-126. # Joseph Gerakos prepared for us a dataset that replicates # the Burgstahler-Dichev effect: Check data... burg.dich <- read.csv("http://www-stat.wharton.upenn.edu/~buja/DATA/burg_dich.csv", header=T) # First glance: str(burg.dich) tab.all(burg.dich) summary(burg.dich) # Plot Earnings earnings <- burg.dich[,"earnings"] # We find some strangely heavy tails: par(mgp=c(1.5,.5,0), mar=c(3,3,2,1)) hist(earnings, col="gray", breaks=100) # Where are these extreme negative values? List them: sort(earnings)[1:20] # Lower end. rev(sort(earnings))[1:20] # Upper end. # Based on the extremes, let's limit earnings to +-10: sel <- (-10) < earnings & earnings < 10 hist(earnings[sel], col="gray", breaks=100) # Still, very heavy-tailed. Subselect the middle 80% of earnings: lower <- quantile(earnings, 0.1) # Lowest decile. upper <- quantile(earnings, 0.9) # Highest decile. earnings.sub <- earnings[lower < earnings & earnings < upper] hist(earnings.sub, col="gray") # Ok, we like to see this: earnings have a mode in the positive range. # But there is left-skewness, which means that there are many more # extreme negative earnings than extreme positive earnings: # Earnings can drop faster than rise, which makes sense. # To get back to our story: let's use unreasonably narrow bins. hist(earnings.sub, breaks=100, col="gray") # Or even more extreme: hist(earnings.sub, breaks=1000, col="gray") abline(v=0, col="red") # Here we go! Companies try to avoid reporting negative earnings # if at all possible. If the earnings are only slightly negative, # there seem to exist ways to 'manage' (cook?) the books somewhat. 
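# An added rough numerical companion to the narrow-bin histogram (a sketch, not part of the
# original analysis; the half-width 'h' of the window around zero is an ad-hoc choice):
h <- 0.01
sum(earnings.sub >= -h & earnings.sub < 0)   # earnings just below zero
sum(earnings.sub >=  0 & earnings.sub < h)   # earnings just above zero
# If the dip-and-spike seen in the histogram is real, the second count should be clearly larger.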
# According to Joseph, a small specialty in accounting has emerged # that examines countries according to whether they show signs of # earnings management. # # Or is this so? A year later the accounting students in 2005 said # that there is now some debate in the literature about the causes of # the dip and spike near zero. It could potentially be an artifact of # tax laws. What will the students of 20xx report??? # # More recently, Sarah Zechman, Ph.D. student in Accounting who took # Stat 541 in 2007, showed me a more recent article on this topic. # Aparently, the dip-and-spike disappear if one computes earnings # not for reporting periods but for periods that are off the reporting dates. # So, indeed, we see human action at work: when it comes to reporting, # "earnings management" is done, throughout the year it isn't. #================================================================ # TITANIC DATA: BARCHARTS AND MOSAIC PLOTS # Titanic data: # - The tabulated form: contingency table data(Titanic); help(Titanic) # - Raw form: dataframe titanic <- read.table("http://www-stat.wharton.upenn.edu/~buja/DATA/titanic.dat", header=T) str(titanic) # [Note: table(titanic) is not identical with 'Titanic' # due to different ordering of the categories.] dimnames(table(titanic)) dimnames(Titanic) #---- # Barplot/chart functions barplot # 'base' barchart # 'lattice' # First experiments with barplot(): barplot(table(titanic[,"Class"])) # From dataframe barplot(margin.table(Titanic,1)) # From contingency table barplot(table(titanic[,c("Sex")])) # From dataframe barplot(margin.table(Titanic,2)) # From contingency table; reverse ordering # Same with barchart: barchart(table(titanic[,"Class"])) # Oops, why this default? barchart(margin.table(Titanic, 1)) # Dito barchart(table(titanic[,"Class"]), horizontal=F) # The 'lattice' default for horizontal bars is visually not good for bar comparison: # We are better at vertical height comparisons than horizontal width comparisons. # Reason: Our vision is trained for detail on a horizontal horizon. # The only reason for horizontal bars would be legibility of horizontal labels. # Horizontal labels on the x-axis get crowded if there are many, # and vertical labels cause our heads to tilt for reading. #---- # 2D BARCHARTS: DIVIDED AND NESTED BARCHARTS: # - DIVIDED BARCHARTS: barplot() accepts matrices # and uses their columns for 'stacked/divided barcharts': barplot(table(titanic[,c("Survived","Class")])) # From dataframe barplot(margin.table(Titanic,c(4,1))) # From contingency table # Need a legend to make sense of what is being plotted: barplot(table(titanic[,c("Survived","Class")]), legend.text=T) # Ooops # Aesthetics: par(mar=c(3,3,5,2)) barplot(table(titanic[,c("Survived","Class")]), col=c("gray70","gray90"), main="Survival on the Titanic by Class", legend.text=c("perished","survived"), args.legend=list(x="topleft") ) # # - NESTED BARCHARTS: barplot(table(titanic[,c("Survived","Class")]), beside=T) barplot(table(titanic[,c("Survived","Sex")]), beside=T) # - Comparison of divided and nested: par(mfrow=c(2,2), mgp=c(1.5,.5,0), mar=c(3,3,2,2), oma=c(0,0,3,0)) # Outer margin for title!!! barplot(table(titanic[,c("Survived","Class")]), legend=T, args.legend=list(x="topleft")) barplot(table(titanic[,c("Survived","Sex")])) barplot(table(titanic[,c("Survived","Class")]), beside=T) barplot(table(titanic[,c("Survived","Sex")]), beside=T) title("Divided and Nested Barcharts", outer=T, cex.main=1.5) # Title in outer margin # Divided barcharts: # . 
Facilitated comparisons: # + overall category counts # + first subcategory counts (here: perished) # + NOT: second and other subcategories (because they are shifted) # . Aesthetics are simple, fewer plotted bars # Nested barcharts: # . Faciliated comparisons: # + all subcategory counts # + NOT: overall category counts # . Aesthetics are more complex, more plotted bars #---- # COPLOTS WITH BARCHARTS: # barchart() in 'lattice' gives you much flexibility for conditionaing and layout: # refine... barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic)) # The instructor has no idea what the bar dividers are... Anyone? # Another problem: baseline is negative .... !!!! BUG !!! # This may be fixed in your possibly newer version of R. # Nested/grouped barcharts: nesting is by 'groups=...' barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic), groups= Survived) # default: 'stack=F' # Divided/stacked barcharts: dividing is by 'groups=...' barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic), groups= Survived, stack=T) # With legend: Justin the stickler -- No/Yes vs Yes/No ... how invert? barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic), groups= Survived, stack=T, auto.key=list(title="Survived") ) # Legend horizontal in 2 columns: barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic), groups= Survived, stack=T, auto.key=list(title="Survived", columns=2) ) # Force four plots into a row: barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic), groups= Survived, stack=T, auto.key=list(title="Survived", columns=2), layout=c(4,1)) # Play with aspect ratio: barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic), groups= Survived, stack=T, auto.key=list(title="Survived", columns=2), layout=c(4,1), aspect=1.5) # Allow separate scales on the vertical axes: barchart(Freq ~ Class| Sex + Age, data=as.data.frame(Titanic), groups= Survived, stack=T, auto.key=list(title="Survived", columns=2), layout=c(4,1), scales=list(y="free") ) # Final question: How-Many-D are these plots: #================================================================ # MOSAIC PLOTS: # - The standard R function (Scott Emerson at Yale): par(mfrow=c(1,1)) mosaicplot(Titanic) # How on earth do you decipher this? mosaicplot(~ Sex + Class + Survived, dir=c("v","v","h"), data=Titanic, off=1) help(mosaicplot) # - Rule: The most interpretable mosaic plot has one vertical variable and # one or more nested horizontal variables # ==> 'doubledecker plot' (Heike Hofmann) # - The 'vcd' package has a doubledecker plot: library(vcd) doubledecker(Survived ~ Sex + Class, data=Titanic) # This is a mosaic plot with one y variable (vertical), # and possibly several nested x variables (horizontal). # This is the mosaic plot that makes the most sense. # - The instructor's mosaic plot: detach("package:vcd") # conflict with package 'vcd' source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/mosaic.R") mosaic(titanic[,c("Sex","Class","Survived")]) # from dataframe mosaic.plot(Titanic, dims=c("Sex","Class","Survived")) # from contingency table (array) # - 'lattice': No mosaic plots apparently... # - Compare barcharts with mosaic plots: # . Barcharts compare conditional counts # . Mosaic plots compare conditional proportions # - Mosaic matrices: # If all variables are categorical, # plot all pairwise mosaic plots. # (Heike Hofmann cannibalized the instructor's mosaic function.) 
mosaic.matrix(titanic) # dataframe mosaic.matrix(Titanic) # contingency table # Q: Are symmetric cells the 45 degree reflections of each other? #---------------------------------------------------------------- # MARKETING DATA: PROFILING CUSTOMER SEGMENTS # Telecom marketing data (instructor's source): mktg <- read.table("http://www-stat.wharton.upenn.edu/~buja/DATA/mktg.dat", header=T) # First glance: str(mktg) # 3 quantitative variables for customer intensity; 5(6) categorical covariates tab.all(mktg) # Will need a version with categorials coded as factors: mktg.df <- mktg mktg.df[,"biz"] <- factor(mktg[,"biz"], levels=c(0,1), labels=c("biz.No", "biz.Yes")) mktg.df[,"reach"] <- factor(mktg[,"reach"], levels=c(0,1), labels=c("reach.No","reach.Yes")) mktg.df[,"kids"] <- factor(mktg[,"kids"], levels=c(0,1), labels=c("kids.No", "kids.Yes")) tmp <- su(mktg.df[,"age"]) # su <- function(x) sort(unique(x)) mktg.df[,"age"] <- factor(mktg[,"age"], levels=tmp, labels=paste("age",tmp,sep=".")) tmp <- su(mktg.df[,"edu"]) mktg.df[,"edu"] <- factor(mktg[,"edu"], levels=tmp, labels=paste("edu",tmp,sep=".")) tmp <- su(mktg.df[,"seg"]) mktg.df[,"seg"] <- factor(mktg[,"seg"], levels=tmp, labels=paste("seg",tmp,sep=".")) str(mktg.df) # HISTOGRAMS: windows(width=3, height=10) par(mfrow=c(ncol(mktg),1), mgp=c(1.5,0.5,0), mar=c(3,3,2,1)) # Shrink waste... for(i in 1:ncol(mktg)) hist(mktg[,i]) # Looks nasty. # Refine and improve: 2 columns, one with normal bins, one with narrow bins. windows(width=8, height=10) par(mfrow=c(ncol(mktg),2), mgp=c(1.5,.5,0), mar=c(2,3,1,1)) for(i in 1:ncol(mktg)) { hist(mktg[,i], main=names(mktg)[i], # default bin widths xlab="", ylab="", col="gray", cex.main=1, cex.axis=.8) hist(mktg[,i], main=names(mktg)[i], breaks=1000, # narrow bin widths xlab="", ylab="", col="gray", cex.main=1, cex.axis=.8) } # The display: # Can you find a more high-level way of doing this with 'lattice', say? # The data: # The narrow bin widths indicate how 'in.' and 'out' may have # been generated (transformations of counts). # SCATTERPLOT MATRIX: Grand overview pairs(mktg) pairs(mktg, gap=.1, pch=16, cex=.5) plot(as.data.frame(mktg), gap=.1, pch=".", cex=.5) # Instructor's function: jitter variables with few values, adds space around points pairs.plus(mktg) # source(....) # see above pairs.plus(mktg, gap=.1, pch=".") #---- # MOSAIC PLOTS: # While profiling the segments was the primary practical concern # for this dataset, one should also be interested in the relationship # between the observed variables ('in.',...,'edu'). # A variable of primary interest to a business is the revenue from each # custormer, hence we may want to see whether there exist differences in # revenue between groups defined in terms of 'bus','reach','kids'. # Here is a histogram-based coplot of the 8 groups defined by all # possible combinations of levels of these three categorical variables: # Marketers should keep in mind how little bottom line differences # there are. #---- # - Sometimes it is worthwhile to discretize a continuous variable # to a categorical variable, if for no other reason than making # it accessible to the power of mosaic plots. # # (If discretizing turns continuous into categorical variables, # recall that jittering did somewhat the opposite: turning # categorical/ordered variables into continuous ones.) # # In the marketing data, we will do the following: # * We will treat "revenue" as a response and try to describe its # association with other variables: "reach", "kids", "bus", "age", and "edu". 
# To this end, we will break "revenue" into 3 equal brackets. # * We will then use mosaic plots to profile the segments in terms of the # other categorical variables, including discretized "revenue". # # - We ignore incoming/outgoing calls and discretize "revenue" # to a categorical variable. # Hand-craft a contingency table with self-explanatory labels: # # 1) Break "revenue" into 3 categories according to terciles: mktg.rev <- cut(mktg[,3], breaks=c(-Inf, quantile(mktg[,3],c(1/3,2/3)), Inf), labels=c("-rev","~rev","+rev")) # The function 'cut()' is meant for such purposes. One gives it the # vectors of breaks (here computed as terciles # 2) Convert the categorical variables (coded by positive integers # in columns 4 through 9 in 'mktg') to factors with self-explanatory # labels for the categories. For example mktg.bus <- factor(mktg[,4], levels=0:1, labels=c("-bus","+bus")) mktg.reach <- factor(mktg[,5], levels=0:1, labels=c("-reach","+reach")) mktg.kids <- factor(mktg[,6], levels=0:1, labels=c("-kids","+kids")) mktg.age <- factor(mktg[,7], levels=0:4, labels=paste("age",1:5,sep="")) mktg.edu <- factor(mktg[,8], levels=0:6, labels=paste("edu",1:7,sep="")) mktg.seg <- factor(mktg[,9], levels=1:4, labels=paste("seg",1:4,sep="")) # Now throw all these factors into the tabulation: mktg.tab <- table(rev=mktg.rev, bus=mktg.bus, reach=mktg.reach, kids=mktg.kids, age=mktg.age, edu=mktg.edu, seg=mktg.seg) # This table is quite high-dimensional: dim(mktg.tab) # This shows that we have 7 dimensions with the following number of # groups in each dimension: 3, 2, 2, 2, 5, 7, 4, # The following shows the names of the levels/groups/categories for # each dimension: dimnames(mktg.tab) # # For a table this size, plotting all dimensions doesn't work. # Here is more clarity: # - Look at the 'rev' as a function of 'kids' within 'reach' within 'bus': mosaic.plot(mktg.tab, dims=c("bus","reach","kids","rev")) # - It may make sense to a few different nestings to see potential patterns # in how 'rev' depends on 'bus', 'reach', 'kids': windows(15,10) par(mfrow=c(2,3)) mosaic.plot(mktg.tab, dims=c("bus","reach","kids","rev"), cex=.8) mosaic.plot(mktg.tab, dims=c("reach","bus","kids","rev"), cex=.8) mosaic.plot(mktg.tab, dims=c("bus","kids","reach","rev"), cex=.8) mosaic.plot(mktg.tab, dims=c("kids","bus","reach","rev"), cex=.8) mosaic.plot(mktg.tab, dims=c("reach","kids","bus","rev"), cex=.8) mosaic.plot(mktg.tab, dims=c("kids","reach","bus","rev"), cex=.8, mar=c(0,0,2,0),main="gaga") # The most patterned plot may be the last: it shows that # the fraction of high-revenue households (blue bars) increases and # the fraction of low-revenue households (brown bars) decreases # . from '-bus' to '+bus', that is, with business needs, # . from '-reach' to '+reach', that is, with reach needs, # . both for '-kids' and '+kids', that is, households with and without children. # It seems that business and reach needs are slight drivers of high revenue. # # The Hartigan-Kleiner mosaic plots: turns out to be useful! windows() mosaic.plot(mktg.tab, dims=c("bus","reach","kids","rev"), hor.vert=c("h","v","h","v"), cex=.8, lab.dist.vert=.2, border=T) # Interesting observation: # - The four large blocks are broken almost cross-wise. # - This means that all proportions are roughly equal across the # 'reach' and 'bus' conditions. # - This in turn means that 'rev' and 'kids' are conditionally independent # given 'reach' and 'bus'. 
# # However, there are slight differences in proportions of # high/low revenue between '-kids' and '+kids' when one compares # the bottom left quarter with the top right quarter: # - for '-reach' and '-bus', adding kids seems to add to revenue, but # - for '+reach' and '+bus', adding kids seems to take away from revenuel # # (You might wonder whether we aren't over-interpreting the data by # reading meaning into such small differences in proportions. # Indeed, this kind of situation calls for inferential statistics # to tell us whether the differences are more likely to be due to randomness.) # # Next, how do age and education affect (better: relate to) revenue? # Education: mosaic.plot(mktg.tab, dims=c("edu","rev")) # The lowest two education groups are almost empty, so we remove them: mosaic.plot(mktg.tab[,,,,,-c(1,2),], dims=c("edu","rev")) # Marginally: the more education, the more high revenue, the less low revenue. # Age: mosaic.plot(mktg.tab, dims=c("age","rev")) # It would help to know the age brackets, but I don't. # Apparently, very young ('age1') and young mature ('age3') are the # strongest revenue producers for the company. # # Stratify by edu nested within age: Remove the lowest 3 education groups first. mosaic.plot(mktg.tab[,,,,,-(1:3),], dims=c("age","edu","rev")) # Education shows an effect on revenue for all age groups except 'age3'. # Profiling the segments: windows(width=15, height=10) par(mfrow=c(2,3), cex=.6) mosaic.plot(mktg.tab, dims=c("seg","bus"), lab.dist.hor=.2, lab.dist.vert=.2) mosaic.plot(mktg.tab, dims=c("seg","reach"), lab.dist.hor=.2, lab.dist.vert=.2) mosaic.plot(mktg.tab, dims=c("seg","kids"), lab.dist.hor=.2, lab.dist.vert=.2) mosaic.plot(mktg.tab, dims=c("seg","age"), lab.dist.hor=.2, lab.dist.vert=.2) mosaic.plot(mktg.tab[,,,,,-(1:3),], dims=c("seg","edu"), lab.dist.hor=.2, lab.dist.vert=.2) mosaic.plot(mktg.tab, dims=c("seg","rev"), lab.dist.hor=.2, lab.dist.vert=.2) # We knew all these things, didn't we? # Segments 2 and 4 are low revenue. # # More broken down, but segments vertical for colors: windows(width=15, height=15) mosaic.plot(mktg.tab, dims=c("bus","reach","kids","rev","seg")) # What do you see? Note that segments 1,2,3,4 have assigned # colors brown, green, blue, purple in this order # #---- # A couple of thought about this type of survey data: # - Who is willing to fill out questionnaires for a small sum of money? # => ... bias # - Which of the four managers who marketed to the four segments # performed best in the subsequent 12 months? # => "... to the ..." ################################################################ # # * SIMULATED DATA: a demonstration of the power of mosaic plots # applied to discretized continuous variables: # # We construct a pair of normally distributed data # that have a very weak (true) correlation, 0.05: for(i in 1:100) { cor.xy <- .05; N <- 10000 x <- rnorm(N) y <- cor.xy * x + sqrt(1-cor.xy^2) * rnorm(N) # [Justification of this construction: # If X and V are independent standard normal, # then Y = c*X + sqrt(1-c^2)*V is also standard normal # and X and Y have correlation c. # This remark is for students with sufficient stats background.] 
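# [Added check of the claim above, spelled out for those who want the two-line argument:
#    Cov(X,Y) = Cov(X, c*X + sqrt(1-c^2)*V) = c*Var(X) + sqrt(1-c^2)*Cov(X,V) = c*1 + 0 = c
#    Var(Y)   = c^2*Var(X) + (1-c^2)*Var(V) = c^2 + (1-c^2) = 1
#  so Cor(X,Y) = Cov(X,Y)/sqrt(Var(X)*Var(Y)) = c, and Y is standard normal because it is a
#  sum of independent normals with variances c^2 and 1-c^2.]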
# # Of course the data we generated do not have a correlation # of exactly 0.05: cor(x,y) # # Next discretize x and y: x.categ <- cut(x, breaks=c(-Inf,quantile(x,c(1/3,2/3)),Inf), labels=c("low.x","mid.x","high.x")) y.categ <- cut(y, breaks=c(-Inf,quantile(y,c(1/3,2/3)),Inf), labels=c("low.y","mid.y","high.y")) # # Finally compare a scatterplot of y versus x with a mosaic plot # of y.categ versus x.categ: # windows(15,7.5) # windows(w=16, h=8) par(mfrow=c(1,2)) plot(x,y, pch=16, cex=.2, xlim=c(-4,4), ylim=c(-4,4)) mosaic.plot(table(x.categ, y.categ), lab.dist.hor=.2, lab.dist.vert=.2, cex=.8) # Can you see the correlation in the scatterplot? The mosaic plot? # [If the trend in the mosaic plot seems wrong, think about it. # It does reflect a positive correlation.] # # [ # Animation: # You can wrap a for-loop around the statements of this section # and execute them a 100 or so times. # # ... dev.flush(); Sys.sleep(.5) } # But comment out any 'windows(...)' statement to avoid creating # 100 windows!!! This will make sure that we re-use the existing # one instead. # This loop will create an animation that shows you the # # SAMPLING VARIABILITY # or # DATASET-TO-DATASET VARIABILITY # # of standard normal data of size N and with true correlation 0.05. # You can now ask how often you see a correlation in the scatterplot # and how often a corresponding trend in the mosaic plot. # For the overwhelming majority of samples you will # NOT SEE the correlation in the scatterplot, but you will # SEE it in the mosaic plot. # ] # Raising the question of sampling variability brings us # to problems of statistical inference: # # IS WHAT WE SEE REAL? # # Here is an inferential version that provides an independence test: mosaic.plot.overallnull(data.frame(x=x.categ, y=y.categ), nrand=10, lab.dist.hor=.2, lab.dist.vert=.2, cex=.8) # Its not very successful, though. #---------------------------------------------------------------- # INTERACTIVE DATA VISUALIZATION SOFTWARE: # - Some commercial software packages contain such features: # . JMP (from SAS, John Sall''s project) # . Datadesk (Velleman) # - Fundamental operations: # . painting/brushing in linked plots # . dynamic projections, in particular 3D rotations, # but also grand tours, correlation tours, regression tours # . scaling # . others: subsampling, transforming, ... # - The modern challenge: MANY VARIABLES (large p versus large N) # . Example: Association Navigator # - Other areas for dynamic visualization: # . maps # . analytic graphs # - What does it take to program your own dynamic viz? # # . R: Simple identifications can be done with 'identify()' cars <- read.table("http://stat.wharton.upenn.edu/~buja/STAT-541/CarModels2003-4-NA.TXT", header=T, sep=",", na=".") # Use instrustor's fct for more space between borders and points: plot.plus(cars[,c("Horsepower","MPG.Highway")], extra=.2) identify(cars[,c("Horsepower","MPG.Highway")], labels=cars[,"Model"]) # Hit ESC to stop # Another function returns coordinates at click location, NOT neares point: for(i in 1:5) { xy <- locator(n=1); points(xy, pch=16, cex=2, col="red") } # Hit ESC to stop sooner for(i in 1:5) { xy <- locator(n=1); abline(v=xy$x, col="red"); abline(h=xy$y, col="red") } # # . Need also 'widgets', provided in R by libraries 'tcltk2', 'RGtk2' pckgs.all <- available.packages("http://cran.r-project.org/bin/windows/contrib/3.2") # update version 3.2! str(pckgs.all) # 2012/09/11: 3956 packages; 2014/09/10: 5698 grep("tk", pckgs.all[,1], val=T) # Prints packages related to graphics toolkits. 
# # . R: getGraphicsEvent() # Provides direct access to mouse clicks and mouse moves # and dispatch to user-programmed functions on these events. # (See the instructor's demo of two programs for dynamic visualization # - of commercial tuna fishing vessels in the eastern Pacific Ocean, and # - of large correlation tables of size up to 1000x1000. # To learn about this function, see help(getGraphicsEvent) # Here is a simple non-statistical demo of the getGraphicsEvent() function: source("http://stat.wharton.upenn.edu/~buja/STAT-926/etch-a-sketch.R") # Examine the code and cannibalize it to do something useful. # # . A program with a fixed set interactive dynamic capabilities: # ggobi from http://www.ggobi.org/ # # . For truly dynamic graphics with web capability learn javascript (which is not java). #---------------------------------------------------------------- # DATA VISUALIZATION AND STATISTICAL INFERENCE: # - Add # . confidence intervals/regions/bands or # . null intervals/regions/bands # to your plots whenever possible and meaninful. # - General principles that often apply (recall Stat 541) # . for confidence: ... # . for null: ... # - Issue: Simultaneity # - Examples: # . normal quantile plots # . empirical quantile plots # #---------------- # - EXAMPLE 1: Normal quantile plots with null and confidence bands places <- read.table("http://www-stat.wharton.upenn.edu/~buja/STAT-541/places.dat") places <- read.table("../A-STAT-541/Data/places.dat") str(places) # Housing costs: qqnorm(places[,"Housing"]) source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/qqplot.R") source("../A-STAT-541/Functions/qqplot.R") qqnorm.null.conf(places[,"Housing"]) # Some fun: search Box-Cox transformations to achieve normality by the lights # the null bands and CI bands: qqnorm.null.conf(places[,"Housing"], box.cox.par=0.5, new.window=F) qqnorm.null.conf(places[,"Housing"], box.cox.par=0, new.window=F) qqnorm.null.conf(places[,"Housing"], box.cox.par=-1, new.window=F) qqnorm.null.conf(places[,"Housing"], box.cox.par=-1.5, new.window=F) # - Problem with inference for quantiles: # We don't know the location and scale parameters: # y = mu + sig*rnorm where mu = E[y], sig=SD[y] # mu = intercept, sig = slope of line if y ~ N(mu,sig^2). # They are NOT of interest, i.e., we are not testing them. # => mu, sig are nuisance parameters. # Q: What is of interest in this context? # A: The SHAPE of the distribution, not its LOCATION and SCALE !!! # - General principle for eliminating nuisance parameters: # Find PIVOTAL test statistics. # DEF: A test statistic is PIVOTAL if its NULL DISTRIBUTION # does not depend on the nuisance parameters. # Example: The t-statistic for H0: mu=0 is pivotal wrt # the nuisance parameters sig in N(mu,sig^2). # - Q: How can we make the normal quantile plot 'pivotal' # wrt to the nuisance parameters mu and sig if # H0: y ~ N(mu,sig), and the alternative is y !~ N(,) ? # Note that this is a nonparametric alternative! # Rejection is desired for any other distributional shape. # # A: Transform the empirical quantiles (sorted values) y(i) # by standardizing them: (y(i) - mean(y))/sd(y) # Make up other examples of standardization! # # - Correct testing procedure: Generate NULL SAMPLES # by sampling ynull ~ rnorm(N,mu=0,s=1) # and form the pivotal sorted values: # (ynull(i) - mean(ynull))/sd(ynull) # ==> Overplot many NULL SAMPLES (so-called 'spaghetti plot') # in pivotized form. # Overlay the OBSERVED SAMPLE in pivotized form. # ==> Reject if 'observed' reaches outside the 'null band'. 
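# An added minimal base-R sketch of the spaghetti null band just described -- this is NOT the
# instructor's qqnorm.null.conf(), only an illustration; the number of null samples (100) is
# an arbitrary choice, and the 'Housing' variable of the places data serves as the example:
y    <- places[,"Housing"]
N    <- length(y)
q.th <- qnorm(ppoints(N))                        # theoretical normal quantiles
piv  <- function(z) (sort(z) - mean(z))/sd(z)    # pivotize: sort, then standardize
plot(q.th, piv(y), type="n",
     xlab="Normal Quantiles", ylab="Pivotized Sorted Values")
for(r in 1:100) lines(q.th, piv(rnorm(N)), col="gray")   # null 'spaghetti' first ...
points(q.th, piv(y), pch=16, cex=.6)                     # ... observed sample on top
abline(a=0, b=1, col="blue")                             # the 45 degree line
# Informally reject normality if the observed points escape the gray null band.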
# Check the null band code of the instructor's function: qqnorm.null.conf # - Confidence bands follow a different logic. # For iid data, the nonparametric bootstrap (BS) is often applicable. # Here the need for pivotal transformation is less, # because bootstrap resampling (NOT null sampling) # requires no knowledge of the nuisance parameters. # Intuitive use of a BS band: # If no straight line fits inside the band, # reject the null hypothesis of a normal distribution. # However, for an asymptotically valid test, you should # also use pivotized data. # Reason: In pivotized form you know WHICH LINE # the confidence band should capture -- # the vert=hor 45 degreed line. #---------------- # - EXAMPLE 2: Two-sample comparisons with permutation null bands # Mktg data -- compare 'rev' for the 4 segments # This is an alterantive to HW 1 where we use density comparison. # Here we use an emprirical QQ-plot for group comparison (x) # wrt to a quantitative variable (y). windows(9,9) x <- as.character(mktg.df[,'seg']) # factor = pain in the ... y <- mktg.df[,'rev'] grps <- su(x) # 4 segments ngrps <- len(grps) par(mfrow=c(ngrps, ngrps), mgp=c(1.5,.5,0), mar=rep(0,4)) for(ig in grps) for(jg in grps) { if(ig==jg) { hist(y[x==ig], breaks=20, col="gray", xlab="", ylab="", main="", xaxt="n", yaxt="n", xlim=range(y)) text(x=par()$usr[1], y=par()$usr[4], lab=ig, adj=c(-.1,1.1), cex=1.2) } else { qqplot.null(y[x==jg], y[x==ig], xlab="", ylab="", xaxt="n", yaxt="n", mgp=c(1.5,.5,0), mar=rep(0,4), nrep=1000) } } # # [Another data example: Autism and fathers' ages] # Q: Why do we NOT have a pivotality problem here? # Or do we? #---------------- # - Issues with bands: # . Spaghetti method is easy, universal, but ... # . Convert the spaghetti bands into actual bands # with controlled simultaneous null coverage 1-alpha # along the lines of Homework 1 for densities. # -------------- End of Data Viz ------------------- #================================================================ NEW CHAPTER: CLASSICAL EIGENVALUE-BASED MULTIVARIATE ANALYSIS * ROADMAP: - Review of principal component analysis (PCA) - Canonical correlation analysis - Generalized canonical analysis - Application of the above to Fisher''s optimal scoring and correspondence analysis #---------------------------------------------------------------- * PRINCIPLE OF MULTIVARIATE ANALYSIS (MVA): - Putting it negatively: Do not single out a response variable (that would be regression) - Putting it positively: Perform symmetric analyses of the variables - Goal: Dimension reduction, capturing many dimensions in few ================================================================ * PRINCIPAL COMPONENT ANALYSIS -- PCA ('review') - Setup: [We use pseudo-R like c(), cbind(), rbind()] . Population: Random variables Xj collected in a random vector: X.vec = (X1,...,Xp)^T (column vector px1) . Multivariate data in columns xj collected in data matrix: X.mat = cbind(x1,...,xp) = rbind(x.row1,...,x.rowN) (Nxp) X.mat <- as.matrix(mktg[,1:8]) # R; toy data from data frame 'mktg' - Connection: . We think of the rows x.rowi of X.mat as iid samples from the random vector X.vec. . The data matrix X.mat is used to estimate parameters of the distribution of X.vec. [We write a row x.rowi as a px1 COLUMN to correspond to X.vec.] [Our toy data are actually the result of sampling, unlike the Boston Housing data.] - First and second order properties: . 
Population parameters: mu.vec := E[X.vec] (px1) Sigma := V[X.vec] = E[(X.vec - mu.vec)(X.vec - mu.vec)^T] (pxp) Formula reminder: Sigma = E[X.vec X.vec^T] - E[X.vec] E[X.vec]^T . Sample estimates: mu.est <- colMeans(X.mat) # R Sig.est <- var(X.mat) # R Centered matrix: X.mat.c <- scale(X.mat, center=T, scale=F) - What object is being optimized in PCA, and what is the criterion? . The object: linear combinations of the variables a^T X.vec in the population X.mat %*% a in the data (R) . The criterion: variance of the linear combination V[a^T X.vec] in the population var(X.mat %*% a) in the data (R) - What is the variance of a linear combination? . Population: V[ a^T X.vec ] = a^T V[X.vec] a = a^T Sigma a (butterfly formula) . Estimated from data: a <- c(1,1,1,2,2,2,1,1) # R; an arbitrary coefficient vector for a lin comb... var(X.mat %*% a) # R t(a) %*% Sig.est %*% a # R - Constructing the PCA problem: . What is the intuition behind PCA in terms of the above concepts? ... Large variance direction have a chance of showing interesting structure. . What is the problem with this intuition? ... max Var(lincomb) = Inf . What is the conventional solution? ... Restrict the coefficient vector 'a' somehow. |a|^2 = 1 . What is the problem with the conventional solution? ... Apples and oranges wrt units (except in functional data...) . What is it that is conventionally done to 'solve' this problem? ... One scales the variables to unit variance (sd) . What does this imply about the covariance matrix that is actually used? ... Correlation rather than covariance . Think of contexts in which this problem does not arise, that is, in which the plain covariance matrix can be used. ... Functional data where all variables have the same units . Comments: + You will find some authors argue against the use of the correlation matrix in PCA on grounds of difficult or disadvantageous asymptotics. Ask them how they would solve the apples-and-oranges problem when the variables have heterogeneous units. + Compare: In regression, explanatory power of a predictor s measured by a t-statistic, which is free of units. In PCA we have no such tool and must decide a priori how to make the units of the variables comparable. Choosing the units to be the standard deviation across all variables, i.e., z-scoring/standardizing the variables, seems commonsensical. + Note that there are many ways to standardize. For example, one might choose median and MAD instead of mean and sd if robustness to heavy tails is an issue. But then you might also revamp the PCA criterion by optimizing a robust variance rather than the classical variance. However, you will no longer be performing eigenanalysis but successive extraction of eigenvectors with actual optimization. + Yet another possibility is nonlinear transformation of the variables so as to tame skewness and/or kurtosis. - Reformulate the PCA problem as an eigenvalue/vector problem: . Population: ... . Data: ... - Perform the PCA eigenanalysis for the mktg data in X.mat: + Two ways to get the correlation matrix: X.std <- scale(X.mat) # standardize to mean=0, sd=1 Sig.est <- var(X.std) # now this is the correlation matrix Sig.est <- cor(X.mat) # same as previous two lines: cor() rather than var(scale()) !! 
+ Eigenanalysis of the correlation matrix: Sig.est.eig <- eigen(Sig.est) str(Sig.est.eig) # components: ...$vectors, ...$values + Check reconstruction of Sig.est as V D V^T: max(abs( Sig.est - Sig.est.eig$vec %*% diag(Sig.est.eig$val) %*% t(Sig.est.eig$vec) )) - Interpretation of eigenvalues as variances: . Population: let v.j be the j''th eigenvector of Sigma Sigma v.j = lambda v.j v.j^T Sigma v.j = lambda |v.j|^2 = lambda.j (j''th eigenvalue) v.j^T Sigma v.j = V[v.j^T X.vec] ==> lambda.j = V[v.j^T X.vec] . Data: dito with Sig.est instead of Sigma Check the interpretation numerically: apply(X.std %*% Sig.est.eig$vec, 2, var) # var() of lin combs from e'vecs Sig.est.eig$val # eigenvalues ==> We call the PCA eigenvalues also 'PC variances'. Their roots are also called 'PC standard deviations' or 'PC sdevs'. . Reminder of variance 'budgets': tr <- function(mat) sum(diag(mat)) tr(Sig.est) # trace of covariance/correlation matrix sum(Sig.est.eig$val) # sum of eigenvalues Trace is invariant under basis change, hence sum of eigenvalues == sum of variable variances ==> If PCA is correlation-based (equiv: X.mat is standardized), then the eigenvalues sum up to p, the number of variables. - PC scores: Projecting the cases onto the PC dimensions "Linear Dimension Reduction" . Recall: The cases correspond to the rows in X.std. . The projection score of the i''th case x = X.std[i,] (row i) on a unit vector (direction) v is = x^T v. [Mathematically, a projection is IR^p --> IR^p, so a p-vector x projects to the p-vector (v v^T) x = v (v^T x), because v v^T is the rank-one or one-dimensional projection onto the direction v. What matters, of course, is the 'projection score' (v^T x).] . If the projection is onto the PC eigenvectors, the projection scores are called 'PC scores'. Example: case 5 projected onto the first eigen direction i <- 5; j <- 1 # case 5, PC 1 t(X.std[i,]) %*% Sig.est.eig$vec[,j] # x^T v sum( X.std[i,] * Sig.est.eig$vec[,j] ) # same . The N-vector of scores for the j''th PC direction is: X.std %*% Sig.est.eig$vec[,j] # all inner products ... The matrix of PC scores for all PC directions is: X.PCscores <- X.std %*% Sig.est.eig$vec Its size is ... dim(X.PCscores) . From the PC scores you can easily reconstruct the (standardized) data: max(abs( X.std - X.PCscores %*% t(Sig.est.eig$vec) )) The reason is that t(Sig.est.eig$vec) is the inverse of Sig.est.eig$vec because this is an orthogonal matrix. . Interpretation of PC scores: + re-expression of the (standardized) data matrix X.std in a new coordinate system where basis vectors = PC eigenvectors + The basis change creates new variables (the PC scores) that are 'DECORRELATED': round(var(X.PCscores), 5) # Off-diagonal==0 Sig.est.eig$val # Diagonal and centered if X.std was centered: colMeans(X.PCscores) + The variances of the PC score variables are the PC eigenvalues. [This better be so because we showed it earlier: The PC scores are the optimized linear combinations of the original variables.] + PC scores lend themselves for plotting: Think of the data cloud in p-space as forming a 'pizza'. plot(X.PCscores[,1:2]) # What's wrong with this? Scale!! lims <- c(-5,5) plot(X.PCscores[,1:2], xlim=lims, ylim=lims) abline(h=0); abline(v=0) ==> View of the 'pizza' from above. It is an oval pizza. Q: Why is scale of the essence? A: Projections have their individual variances which should be reflected by the plot. Here is another view: plot(X.PCscores[,c(1,ncol(X.PCscores))], xlim=lims, ylim=lims) ==> View of the 'pizza' edge-on. 
Here is the full view:
       pairs.plus(X.PCscores, pch=".")
     . How many points are plotted?
       ==> Don't be surprised if reshaping the plot is slow.
     . How do you explain the striations?
     . What's wrong with the scales?
       pairs.plus(X.PCscores, pch=".", xlim=lims, ylim=lims)
       ==> Proper size of the 8-dim point cloud in the PC directions.
     . Do you perceive correlations in any of the plots?
       ==> If so, it's a visual illusion.  We know there is no correlation:
       round(cor(X.PCscores), dig=7)

   - Properties of PC scores:
     . Population version of PC scores:
       For correlation-based PCA, use standardized variables:
         Xvec.std = (Xvec - E[Xvec])/SD[Xvec]   # SD and division componentwise
         Sigma = E[ Xvec.std %*% Xvec.std^T ]   # Population correlation matrix
       Evals (p) in pxp diagonal Lambda, Evecs in pxp o.n. V:
         Sigma = V Lambda V^T
       PC score variables: random vector
         Xvec.PCscores = V^T Xvec.std   (px1)
       1st order properties:
         E[Xvec.PCscores] = 0  if E[Xvec.std] = 0   (px1)
       2nd order properties:
         V[Xvec.PCscores] = V[V^T Xvec.std] = V^T V[Xvec.std] V = V^T Sigma V = Lambda
         ==> diagonal, hence decorrelated.
     . The same holds in samples with mean() instead of E[] and cov() instead of V[]:
         X.PCscores = X.std V.est
       where V.est = estimated matrix of eigenvectors (Sig.est.eig$vec).
       Note the transposition compared to the population version
       because the rows of X.std correspond to the column vector Xvec.std.
       + 1st order properties:
         If the columns of X.std have zero means,
         so do the columns of the linear transformation X.std V.est.
       + 2nd order properties:
         cov( X.PCscores ) = X.PCscores^T X.PCscores / (N-1)
                           = V.est^T (X.std^T X.std/(N-1)) V.est
                           = V.est^T Sig.est V.est
                           = diagonal matrix of estimated eigenvalues
         ==> decorrelated columns in X.PCscores
     . Under the hood: Eigendecompositions simultaneously diagonalize
       the quadratic forms of the Rayleigh numerator and denominator.
       ==> PC eigenvectors vj are simultaneously orthogonal wrt
           + the covariance:                 vj^T Sigma vk = 0   for j!=k
           + the Euclidean squared distance: vj^T vk = 0         for j!=k

   - Casting the PCA problem for data as an SVD problem:
     . Reminder: the 'singular value decomposition' or 'SVD'
       If X is Nxp full rank, then there exist U, D, V such that
         X = U D V^T
         U is Nxp, columns = left singular vectors
         V is pxp, columns = right singular vectors
         U^T U = I (pxp)   (U has o.n. columns)
         V^T V = I (pxp)   (V has o.n. columns)
         D diagonal (pxp)
     . SVD as optimization/stationarity of a bi-Rayleigh quotient:
                     (u^T X v)^2
         R(u,v)  =  -------------     (or its root)
                     |u|^2 |v|^2
       Write down the stationarity conditions:
         X (v/|v|) = R(u,v)^(1/2) u/|u| ,    X^T (u/|u|) = R(u,v)^(1/2) v/|v|
       As for eigen decomps, there exists a whole o.n. basis of such u and v,
       which we can collect in matrices U and V (U^T U = I_p = V^T V), so that:
         X V = U D,  where D is diagonal with the stationary values R(u,v)^(1/2) in it.
         X = U D V^T    (QED)
     . For correlation-based PCA, standardize X.mat first:
         X.std <- scale(X.mat)            # did this earlier already, just a reminder
       For covariance-based PCA, center X.mat first:
         X.std <- scale(X.mat, scale=F)
     . Perform an SVD of X.std:
         X.svd <- svd(X.std)
         str(X.svd)   # components ...$d, ...$u, ...$v
       Check the reconstruction of X.std:
         max(abs( X.std - X.svd$u %*% diag(X.svd$d) %*% t(X.svd$v) ))
     . The connection with PCA is obtained by recalling the correlation matrix:
         max(abs( Sig.est - t(X.std) %*% X.std / (nrow(X.std)-1) ))
       Hence, in math rather than R notation:
         Sig.est = X.std^T X.std / (N-1) = V D U^T U D V^T / (N-1) = V D^2 V^T / (N-1)
       ==> D^2/(N-1) are the eigenvalues or PC variances.
           D/sqrt(N-1) are the PC sdevs.
           V from svd(X.std) contains the PC eigenvectors.
     .
Numeric check of agreement between svd(X.std) and eigen(Sig.est): Sig.est.eig$val; X.svd$d^2 / (nrow(X.std)-1) # eigenvalue agreement max(abs( Sig.est.eig$vec - X.svd$v )) # eigenvector agreement -- oops!?!? plot(Sig.est.eig$vec, X.svd$v, pch=16) ==> Eigenvectors and singular vectors are determined only up to a sign change! Here is a better test (try to understand it): round(t(Sig.est.eig$vec) %*% X.svd$v, 13) . PC scores from the svd: X.std = U D V^T where V = eigenvectors X.std V = U D ==> U D is the PC score matrix. Check agreement with previous scores, but beware of sign flips: plot(X.svd$u %*% diag(X.svd$d), X.PCscores, pch=16) abline(a=0, b=1, col="red"); abline(a=0, b=-1, col="red"); - PC loadings: PC eigenvectors rescaled . In principle, the meaning of a PC can be recognized by comparing the entries of its eigenvector: an entry large in magnitude implies that this variables 'loads' strongly on this PC. . Interpretability of eigenvectors can be enhanced by rescaling them: X.std = U D V^T U D = PC score matrix U = PC score matrix with columns scaled to unit length sqrt(N-1)*U = PC score matrix with columns scaled to unit sdev ==> Correlation between X.std and PC scores can be obtained by cor(X.PCscores, X.std) = sqrt(N-1) * U^T X.std / (N-1) = U^T X.std / sqrt(N-1) = D V^T / sqrt(N-1) cor(X.std, X.PCscores) = (D V^T)^T / sqrt(N-1) = V D / sqrt(N-1) Recall that D/sqrt(N-1) are the PC sdevs. Hence V D / sqrt(N-1) are the eigenvectors rescaled by the PC sdevs. They are identical with the correlations between PC scores and the (stdized) data. - R functions: . Canned: prcomp(X.mat) # defaults are center=T, scale=F summary(prcomp(X.mat)) princomp(X.mat) # default: cov-based, not cor-based summary(princomp(X.mat)) . From scratch: much work for good labeling pca <- function(X, center=T, scale=T) { N <- nrow(X) p <- ncol(X) X.std <- scale(X, center=center, scale=scale) X.svd <- svd(X.std) # Eigenvalues and their siblings: evals.out <- rbind("variance" = X.svd$d^2 / (N-1), "sdev" = X.svd$d / sqrt(N-1), "% variance" = X.svd$d^2 / sum(X.svd$d^2) * 100, "% cumulative" = cumsum(X.svd$d^2 / sum(X.svd$d^2)) * 100, "% cum. wrt rest"= X.svd$d^2 / rev(cumsum(rev(X.svd$d^2))) * 100 ) colnames(evals.out) <- paste("PC",1:p,sep="") # Eigenvectors: evecs.out <- X.svd$v colnames(evecs.out) <- paste("PC",1:p,sep="") rownames(evecs.out) <- colnames(X) # Loadings: loads.out <- evecs.out %*% diag(evals.out["sdev",]) colnames(loads.out) <- paste("PC",1:p,sep="") # Scores: scores.out <- X.svd$u %*% diag(X.svd$d) colnames(scores.out) <- paste("PC",1:p,sep="") rownames(scores.out) <- rownames(X) # Return the goods: list(values=evals.out, vectors=evecs.out, loadings=loads.out, scores=scores.out ) } X.pca <- pca(X.mat) # Example - Worked example: . PC eigenvalues: round(X.pca$val, 2) ## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 ## variance 2.64 1.69 1.03 0.81 0.67 0.53 0.42 0.20 ## sdev 1.62 1.30 1.02 0.90 0.82 0.73 0.65 0.45 ## % variance 33.00 21.11 12.93 10.15 8.41 6.59 5.27 2.54 ## % cumulative 33.00 54.11 67.04 77.19 85.60 92.19 97.46 100.00 ## % cum. wrt rest 33.00 31.51 28.17 30.80 36.86 45.77 67.49 100.00 Trace plot: plot(X.pca$val["sdev",], pch=16, type="b", ylim=c(0,max(X.pca$val["sdev",])), ylab="PC sdev", xlab="PC") abline(h=c(0,1), col="blue") . PC loadings for meaning: round(X.pca$load,3) ## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 ## in. 
0.713 -0.429 0.090 -0.003 0.133 0.162 -0.503 -0.053 ## out 0.850 -0.337 0.102 0.001 0.020 -0.084 0.179 0.337 ## rev 0.812 -0.378 0.076 -0.053 -0.072 -0.120 0.291 -0.292 ## biz 0.401 0.404 -0.457 0.497 0.451 -0.125 0.019 -0.028 ## reach 0.473 0.482 0.039 0.424 -0.574 0.180 -0.043 0.002 ## kids 0.364 0.628 0.342 -0.261 0.296 0.426 0.136 -0.014 ## age -0.378 -0.672 -0.184 0.330 0.060 0.478 0.175 0.004 ## edu 0.301 0.088 -0.805 -0.454 -0.161 0.147 -0.001 0.018 Note: - read a story from PC1, PC2, PC8 - note the 'importance weighting' of the loadings according to PC sdevs - note the sign patterns imposed by orthogonalities - typical: the first PC is a weighted average of positively associated variables Profile plots of loadings: parallel coordinate plots p <- ncol(X.pca$load) windows(h=5, w=10) parcoord(cbind(p:1,t(X.pca$load)), lwd=3, ## col=paste("gray",round(seq(10,90,len=ncol(X.pca$load))),sep="") ) col=sample(colors()[25:60],size=p)) mtext(text=colnames(X.pca$load), side=2, at=seq(1,0,len=p), las=2, adj=0) abline(h=0, col="gray", lty=2) ==> Problem is that all axes are independently scaled. Hand-made parallel coordinate traces with identical scales: plot(c(-.5,p), c(-1,1), type="n", ylab="Loadings", xlab="Variables", xaxt="n") axis(side=1, at=0:p, lab=c("PC Order",rownames(X.pca$load)), las=2) text(lab=colnames(X.pca$load), y=seq(1,-1,len=p), x=0, adj=1.1, cex=.8) abline(h=0, col="gray") clrs <- colors()[c(70,76,91,97,119,134,254,461,653)] # Some strong colors for(j in 1:p) lines(x=0:p, rbind(seq(1,-1,len=p),X.pca$load)[,j], lwd=3, col=clrs[j]) . PC scores, plotted: We use segmentation for coloring windows() # New square window rg <- c(-1,1)*max(abs(X.pca$sco)) # Impose same ranges on all plots in the PC score splom pairs.plus(X.pca$sco, xlim=rg, ylim=rg, cex=.1, col=c("red","blue","green","brown")[mktg[,"seg"]]) # color by segment Focus on first (dimension reduction to 2 dims): plot.plus(X.pca$sco[,1:2], xlim=rg, ylim=rg, cex=1, # Just the first two PCs col=c("red","blue","green","brown")[mktg[,"seg"]]) Original standardized data, plotted: windows() rg <- c(-1,1)*max(abs(X.std)) # Impose same ranges on all plots in the PC score splom pairs.plus(X.std, xlim=rg, ylim=rg, cex=.1, col=c("red","blue","green","brown")[mktg[,"seg"]]) # color by segment . Compare: prcomp(X.mat, scale=T) princomp(X.mat, cor=T) summary(princomp(X.mat, cor=T)) - Inference for PCA: . Try the universal hammers: + permutation tests + nonparametric bootstrap But will they work? . Permutation testing: + H0: All variables are independent (only game for permutation approach) Q: Is global independence a meaningful null hypothesis? ... + Test statistics: estimated eigenvalues Try the LARGEST eigenvalue as test statistic. Q: Valid test of independence? A: ... + Try the SECOND LARGEST eigenvalue as test statistic. What are H0 and H1 now? (Recall: test stat <--> H1) ... What would the intuitively desired H0 and H1 be? ... Intuitively one would like to test whether the 2nd eval is significant -- never mind what the 1st was. However: If the 1st eval was powerful, the 2nd eval has a lesser chance due to the budget constraint: sum evals = p. + One is tempted to look for a conditional test for lambda2: H0: "Conditional on lambda1.est, the variables are independent." But the conditional independence distribution of X.mat given lambda1.est is not a permutation distribution! How would you sample from a conditional distribution of X.mat given lambda1.est? ... + Another idea: Consider lambda1 as a nuisance parameter when testing lambda2. 
==> Try to pivotize lambda1 out of lambda2.est. Any proposals? ... How about this: lambda2.est as proportion of remaining variance lambda2.est / sum_{j=2...p} lambdaj.est = lambda2.est / (p-lambda1.est) # correlation-based PCA But is the distribution of this statistic free of lambda1? ... Should it be conditional on the first eigenvector v1.est as well? ... Would it work at least for multivariate normal Xvec? ... How would you 'pull out' the first component? ... ('residualizing') max(abs(X.std - X.svd$u %*% diag(X.svd$d) %*% t(X.svd$v))) # Ok, we understand the decomposition of X.std. X.adj1 <- X.std - X.svd$u[,1] %*% t(X.svd$v[,1]) * X.svd$d[1] # Should we use correlation-based or covariance-based PCA on X.adj1? # Correlation-based would restandardize the adjusted variables. # Maybe we don't want this because it changes the eigenstructure, # whereas we want to continue testing the previous eigenstructure. X.adj1.pca <- pca(X.adj1, scale=F) rbind(X.adj1.pca$val, X.pca$val) # Good, we seem to have removed PC1 from X.pca and replaced it with a zero PC. # Next question: Is it possible that X.adj1 has independent columns? # Ooops, we introduced an eigenvalue zero, which violates the assumption # of independence! Here is another idea: ## If random permutations do not work, how about random rotations? ## The point is that we can constrain the random rotations to the ## space orthogonal to the first eigenvector (and in generality to ## spaces orthogonal to any sequence or leading eigenvectors). ## The null hypothesis is therefore not independence but ## rotational invariance of the remaining variability. ## Multivariate normal distributions would have this property ## of rotational invariance in subspaces of identical eigenvalues. ## The test would still not be exact because it is orthogonal to ## estimated eivenvectors, not the true eigenvectors. ## Hence we still need N large compared to p. + Social science interpretation of the PC testing problem: Only correlation-based PCA is used. Of interest is not H1: lambda2 > lambda3, but H1: lambda2 > 1. Reason: In your typical questionnaire dataset, a PC that has variance<=1 does not capture more variation than one variable/item (variance of a colum = 1 due to standardization). ==> Testing H0: lambda2=1 is reasonable. ==> Permutation testing of lambda2.est does approximately the right thing. (There is still the issue that a large lambda1 depresses lambda2.) ==> Augment the eigenvalue profile with a permutation null band. plot(X.pca$val["sdev",], type="n", ylim=c(0,max(X.pca$val["sdev",])), ylab="PC sdev", xlab="PC") # roots of eigenvalues for(j in 1:100) lines(sqrt(eigen(cor(apply(X.std,2,sample)))$val), col="gray") abline(h=c(0,1), col="blue") points(X.pca$val["sdev",], pch=16, type="b") Q: Is lambda3>1 significant? ... + If conditional null hypotheses do not work, what about changing the statistic for power against desirable alternatives? We may ask whether lambda2 is large compared to sum{j>=2} lambdaj. ==> Use percentage of variance wrt to remaining variance, as shown in row 5 of round(X.pca$val,2): ## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 ## variance 2.64 1.69 1.03 0.81 0.67 0.53 0.42 0.20 ## sdev 1.62 1.30 1.02 0.90 0.82 0.73 0.65 0.45 ## % variance 33.00 21.11 12.93 10.15 8.41 6.59 5.27 2.54 ## % cumulative 33.00 54.11 67.04 77.19 85.60 92.19 97.46 100.00 ## % cum. 
wrt rest 33.00 31.51 28.17 30.80 36.86 45.77 67.49 100.00 plot(X.pca$val[5,], type="n", ylim=c(0,max(X.pca$val[5,])), ylab="% var wrt rest", xlab="PC") # roots of eigenvalues for(j in 1:100) lines(pca(apply(X.std,2,sample))$val[5,], col="gray") lines(1/(p:1)*100, col="blue") # True profile under independence points(X.pca$val[5,], pch=16, type="b") ==> The eigenvalues are uniformly and significantly steeper than under independence. . Nonparametric confidence intervals/bands: + What could be simpler than nonparametric bootstrap, right? ==> obtain SEs for all eigenvalues PS: To make the inference problem harder, we reduce the sample size to 50. There is almost no inference problem for the full N=1926, so let''s select a small subset, more typical of social siences: N <- 30 X <- mktg[sample(nrow(mktg),size=N),1:8] X.pca <- pca(X) Nboot <- 10000 X.evals.bs <- replicate(Nboot, eigen(cor(X[sample(nrow(X),replace=T),]))$val) dim(X.evals.bs) rownames(X.evals.bs) <- paste("PC",1:nrow(X.evals.bs),sep="") str(X.evals.bs) X.evals.bs.SE <- apply(X.evals.bs, 1, sd) # SEs !!!! X.evals.bs.SE + Decorate/enhance the profile plot: plot(X.pca$val["variance",], type="b", ylim=c(0,max(X.pca$val["variance",])), ylab="PC Variance", xlab="PC", pch=16, cex=1.5, lwd=2) # roots of evals abline(h=c(0,1), col="blue") lines(c(rbind(1:p,1:p,NA)), c(rbind(X.pca$val["variance",]+2*X.evals.bs.SE, X.pca$val["variance",]-2*X.evals.bs.SE, NA)), lwd=2, col="red") + Q: Do you think it likely that these CIs cover the true eigenvalues? Objection: Eigenvalue estimates are biased for the true eigenvalues!!! Why? lambda1 = max_a E((a^T Xvec)^2) / |a|^2 (assuming E[Xvec]=0 wlog) Proof? Use E[max_a f(X,a)] >= max_a E[f(X,a)] lambda1.est ~ max_a mean((X.mat a)^2) / |a|^2 E[lambda1.est] ~ E[max_a mean((X.mat a)^2) / |a|^2] >= max_a E[ sum_i (X.mat[i,]^T a)^2 / N / |a|^2 ] = max_a sum_i E[ (X.mat[i,]]^T a)^2 / N / |a|^2 = max_a N E[(X.vec^T a)^2 ] / N / |a|^2 = max_a E[(X.vec^T a)^2 ] / |a|^2 = lambda1 Do you still think lambda1.est +- 2.SE[lambda1.est] covers the true lambda1 for 95% of datasets? + Do you remember from Stat 541 how to obtain bias-corrected BS CIs? ==> BS quantile method !!! Here is the model calculation -- start with a coverage statement as if you knew the true sampling distribution of lambdaj.est: P[ q.lo <= lambdaj.est <= q.hi ] ~ 0.95 P[ q.lo-lambdaj <= lambdaj.est-lambdaj <= q.hi-lambdaj ] ~ 0.95 P[ -c.lo <= lambdaj.est-lambdaj <= c.hi ] ~ 0.95 Invert the coverage statement to a CI statement: invert by multiplying with -1 P[ c.lo >= lambdaj-lambdaj.est >= -c.hi ] ~ 0.95 P[ lambdaj.est+c.lo >= lambdaj >= lambdaj.est-c.hi ] ~ 0.95 Finally reverse order: P[ lambdaj.est-c.hi <= lambdaj <= lambdaj.est+c.lo ] ~ 0.95 + Transfer the above calculation to the bootstrap simulation: BS principle: In a BS simulation, the sample becomes the population. 
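     Aside -- before spelling out the correspondence below, a quick simulation confirming the
     upward bias of lambda1.est derived above (a sketch: independent normal data, so the true
     correlation matrix is the identity and all true eigenvalues equal 1; N=30, p=8 as in the subset):
       set.seed(2)
       lambda1.sim <- replicate(2000, eigen(cor(matrix(rnorm(30*8), nrow=30)))$values[1])
       mean(lambda1.sim)    # clearly > 1, the true largest eigenvalue ==> upward bias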
Correspondence: lambdaj <-- lambdaj.est lambdaj.est <-- lambdaj.est* (* = from BS samples) BS Recipe: 1) obtain q.lo* and q.hi* as upper and lower 2.5% quantiles of the BS distribution of lambdaj.est* 2) obtain c.lo* <- lambdaj.est - q.lo* c.hi* <- q.hi - lambdaj.est 3) BS CI: [lambdaj.est - c.hi, lambdaj.est + c.lo] + Simplified formulas: [lambdaj.est - c.hi, lambdaj.est + c.lo] = [2*lambdaj.est - q.hi, 2*lambdaj.est + q.lo] ^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^ characteristic bias correction + R implementation: obtain X.evals.bs as above Nboot <- 10000 X.evals.bs <- replicate(Nboot, eigen(cor(X[sample(nrow(X),replace=T),]))$val) rownames(X.evals.bs) <- paste("PC",1:nrow(X.evals.bs),sep="") dim(X.evals.bs) X.evals.bs.q.lo.hi <- apply(X.evals.bs, 1, quantile, prob=c(.025,.975)) X.evals <- eigen(cor(X))$val # observed e'vals rbind(X.evals.bs.q.lo.hi, X.evals) # compare BS quantiles with observed e'vals Graphical presentation of inverted CIs for bias/skewness correction: plot(X.pca$val["variance",], type="b", ylim=c(0,.5+max(X.pca$val["variance",])), ylab="PC Variance", xlab="PC", pch=16, cex=2, lwd=2) # roots of eigenvalues abline(h=c(0,1), col="blue") lines(c(rbind(1:p,1:p,NA)), c(rbind(2*X.pca$val["variance",]-X.evals.bs.q.lo.hi[1,], 2*X.pca$val["variance",]-X.evals.bs.q.lo.hi[2,], NA)), lwd=2, col="red") # The following is WRONG: using the BS quantiles as boundaries of CIs plot(X.pca$val["variance",], type="b", ylim=c(0,.5+max(X.pca$val["variance",])), ylab="PC Variance", xlab="PC", pch=16, cex=2, lwd=2) # roots of eigenvalues abline(h=c(0,1), col="blue") lines(c(rbind(1:p,1:p,NA)), c(rbind(X.evals.bs.q.lo.hi[1,], X.evals.bs.q.lo.hi[2,], NA)), lwd=2, col="brown") # ==> These intervals exacerbate the bias. + Alternative: Perform bias correction, then use symmetric normal intervals +-2SE: X.evals.biascorr <- X.evals - (apply(X.evals.bs, 1, mean) - X.evals) # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ BS bias estimate X.evals.biascorr <- 2*X.evals - apply(X.evals.bs, 1, mean) # same!!! plot(X.pca$val["variance",], type="b", ylim=c(0,.5+max(X.pca$val["variance",])), ylab="PC Variance", xlab="PC", pch=16, cex=2, lwd=2) # roots of eigenvalues abline(h=c(0,1), col="blue") lines(c(rbind(1:p,1:p,NA)), # Show Bias-corrected symmetric CIs in red c(rbind(X.evals.biascorr - 2*X.evals.bs.SE, X.evals.biascorr + 2*X.evals.bs.SE, NA)), lwd=2, col="red") lines(c(rbind(1:p,1:p,NA))+.05, # Quantile-based CIs for comparison in green c(rbind(2*X.pca$val["variance",]-X.evals.bs.q.lo.hi[1,], 2*X.pca$val["variance",]-X.evals.bs.q.lo.hi[2,], NA)), lwd=2, col="green") points(X.evals.biascorr, pch=16, cex=1.5, col="red") # Bias-corrected e'vals title(main="Red: bias-corrected symmetric, Green: quantile inversion") # ==> Very similar intervals + Warning: There is no free lunch. ~ Bootstrap quantile methods are more noisy than symmetric intervals with BS SEs. ~ Bias correction adds noise also. However, the bias problem for large eigenvalues can be such that it must be taken care of. The same may hold for the non-normality of the sampling (dataset-to-dataset) distribution of the eigenvalue estimates. + Do you remember a diagnostic from Stat 541 for 'asymptotic normality'? 
windows(10,5) par(mfrow=c(2,4), mgp=c(1.8, .5, 0), mar=c(3,3,1,1)) for(j in 1:nrow(X.evals.bs)) { qqnorm(X.evals.bs[j,], main="", xlab="", ylab="", pch=16) abline(a=mean(X.evals.bs[j,]), b=sd(X.evals.bs[j,]), col="green", lwd=2) abline(v=c(-2,+2), col="red", lwd=2) } Depending on the random subsample of 'mktg' you are getting, the deviation from normality can be substantial. + Remaining issue: +-2SE is a pointwise benchmark. ==> Calibrate for simultaneity. Recipe: Examine the 8 intervals as to how many BS profiles they entirely contain. This is simple for the centered SE-based intervals: Do the bean counting for various multiples K*SE. You have to simulate nature, though, so you should check coverage from the mean profile of the eigenvalues. R code: for(K in seq(2,3.5,by=.01)) # A ladder of multiples cat(K,": ", mean(apply(abs(X.evals.bs - apply(X.evals.bs,1,mean)) <= K*X.evals.bs.SE, 2, all)), # We need the inequalities for all 8 e'vals; mean() makes proportions "\n") ==> K~2.76 seems to be required for 0.95 simultaneous coverage. (If you redo the exercise with different subsets of 'mktg', you will get different results. The instructor has seen required multipliers as high as K=3.1.) Compare with Bonferroni, assuming normality: -qnorm(.05/2/length(X.evals)) Funny, the multiplier for simultaneity is almost the same. Do not be surprised, though, if the simulation-based K is larger than the Bonferroni-based K. The simulation adjusts for non-normality, for example, whereas our Bonferroni multiplier is based on a normality assumption. - Outlook on past theory: There is some finite-sample and asymptotic theory for PCA under the assumption of normality. Typically it does not get used. - Outlook on modern theory: Random matrix theory There are some neat results that show how strong 'low-rank signals' should be to be correctly detectable at all. + For PCA: Lu 2002, Baik&Silverstein 2006, Paul (2007), Nadler (2008), ... + For a low-rank-mean+noise model: Shabalin&Nobel (2010) (See a slide from Dan Yang''s 2012 talk.) ------------------------- - Odds and Ends: . Background about eigendecomps: simultaneous diagonalization Given two quadratic forms Q1(x) = x^T S1 x and Q2(x) = x^T S2 x, where S1 and S2 are symmetric matrices, S2 positive definite, there exists a generalized eigendecomp for S1 wrt S2 based on the Rayleigh quotient R(x) = x^T S1 x / x^T S2 x = Q1(x) / Q2(x) The generalized eigendecomp of S1 wrt S2 diagonalizes S1 and S2 simultaneously. . The LS interpretation of the SVD of X (actually: X.std): + Frobenius squared norm of an arbitrary matrix X: |X|^2 = sum_nj X_nj^2 = tr(X X^T) = tr(X^T X) = Euclidean squared norm if X is strung out as a vector + LS criterion for rank-one approximation: |X - d1 U1 V1^T|^2 = min_{d1,U1,V1} ==> svd provides the minimizing d1,U1,V1 ==> Generalizes to arbitrary low-rank approximation. ==> Reduction of svd to two alternating regressions . SVD and Rayleigh quotient of an arbitrary matrix X: + R(U1,V1) = (U1^T X V1) / (|U1| |V1|) + The pairs of singular vectors are the stationary solutions for R(U1,U2). + The largest singular value is d1 = max_{U1,V1} R(U1,V1) + The 2nd largest singular value is d2 = max_{U2,V2} R(U2,V2) subject to U2 _|_ U1, V2 _|_ V1 ... . Power algorithms for eigenvectors of a symmetric matrix S: + Simple power iterations: Initialize V1 randomly !=0, normalize V1 <- V1/|V1| Repeat till convergence: V1 <- S V1 V1 <- V1/|V1| Convergence will be to an eigenvalue for the largest eigenvalue. 
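     A minimal sketch of these simple power iterations, run on Sig.est from above
     (assumes Sig.est and Sig.est.eig are still available for comparison):
       set.seed(3)
       v1 <- rnorm(ncol(Sig.est));  v1 <- v1/sqrt(sum(v1^2))        # random start, normalized
       for(it in 1:200) { v1 <- Sig.est %*% v1;  v1 <- v1/sqrt(sum(v1^2)) }
       c(crossprod(v1, Sig.est %*% v1))                 # Rayleigh quotient ~ largest eigenvalue
       Sig.est.eig$val[1]                               # compare
       round(abs(v1) - abs(Sig.est.eig$vec[,1]), 8)     # eigenvector agreement, up to sign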
+ 2nd eigenvector: Repeat power algorithm with V2 subject to _|_ V1. ... + Simultaneous power iterations: Initialize V=(V1,V2,...Vq), orthonormalize columns: V^T V = I Repeat till convergence: V <- S V Orthonormalize columns: V^T V = I Convergence will be to a q-tuple of eigenvectors for the largest q eigenvalues. Permitted: q=p, in which case one gets a basis of eigenvectors with eigenvalues sorted in descending magnitude. (Caution for negative eigenvalues: convergence of even or odd iterates.). . Power algorithms for SVDs of an arbitrary matrix X: + Simple power iterations: Initialize V1 randomly !=0, normalize V1 <- V1/|V1| Repeat till convergence: U1 <- S V1 U1 <- U1/|U1| V1 <- S^T U1 V1 <- V1/|V1| Convergence will be to ... + Simultaneous iterations obvious ... ================================================================ * CANONICAL CORRELATION ANALYSIS -- CCA: - Idea of CCA: ... Divide the variables into two blocks, called X, Y: sizes: Nxp, Nxq; this division is driven by subject matter. Ask for linear combinations X a, Y b such that --------------------------------------------- | | | (a,b) = argmax_{a,b != 0} cor(X a, Y b) | X, Y full rank | | --------------------------------------------- - Again, there is a population and a sample/data version: . Population: random vectors Xvec (px1), Yvec (qx1) with a joint distribution P(dx,dy) . Data: X (Nxp), Y (Nxq) = row-matched random matrices . Relation: + The rows Xvecn of X are iid draws from Xvec. + The rows Yvecn of Y are iid draws from Yvec. - Simple data example: 'mktg', volume variables versus needs&demographics X <- as.matrix(mktg[,4:8]) Y <- as.matrix(mktg[,1:3]) round(cor(X,Y), 2) XY.cca <- cancor(X,Y) XY.cca - Terminology: . Stationary solutions of the CCA problem are called 'canonical variates' or CVs . The X part is the 'left CV'; the Y part is the 'right CV'. - Basics: . How many CVs (Canonical Variates) are there for p=5, q=3? ... . Are the CVs uniquely determined when the canonical corrs are different? ... No, any POSITIVE multiple of a and b is also a solution, and simultaneous sign flips in a and b also produces a solution. . Do you expect CV coefficient vectors to be orthogonal to each other? ... 'orthogonal' in the Euclidean sense ... So what are they? ... 'orthogonal = uncorrelated linear combinations' makes sense . Can you immediately tell from the coefficients which variables are strong? ... What could you do for better comparison? ... ==> Standardize before running CCA! X.std <- scale(X) Y.std <- scale(Y) XY.std.cca <- cancor(X.std,Y.std) # Compare canonical correlations: rbind(XY.std.cca$cor, XY.cca$cor) # Why are the above the same? # Compare coefficients: XY.std.cca$xcoef; XY.cca$xcoef # Not the same XY.std.cca$ycoef; XY.cca$ycoef # Nearly the same because # mktg[,1:3] are nearly stdized. Read stories, if any, from these coefficients. XY.std.cca$xcoef; XY.std.cca$ycoef . Doesn''t it feel like cancorr is somewhere between regression and PCA? It is not a popular method, yet one needs to know about it because of other methods that derive from it. - Algebra of CCA: . Optimization problem: cor(X a, Y b) = max/min/stationary_{a,b} ??? cor(X a, Y b) = cov(X a, Y b)/sqrt(var(X a) var(Y b)) # (N-1) drops out = a^T CXY b / sqrt( a^T CXX a b^T CYY b ) where CXY, CXX, CYY are the covariance matrices: assuming X, Y are both centered, we have CXX = X^T X/ (N-1) CYY = Y^T Y/ (N-1) CXY = X^T Y/ (N-1) . Idea: Reduce to Euclidean denominators through 'sphering'/'pre-whitening' of the X and Y blocks. 
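     Numeric preview of what sphering/pre-whitening accomplishes, before the algebra continues
     below (a sketch; it assumes the 'mktg' data from earlier and uses chol() for one valid root RX):
       Xblk <- scale(as.matrix(mktg[,4:8]), scale=F)    # centered X block
       CXX  <- crossprod(Xblk) / (nrow(Xblk)-1)
       RX   <- chol(CXX)                                # upper triangular with RX^T RX = CXX
       Xpw  <- Xblk %*% solve(RX)                       # pre-whitened/sphered block
       round(crossprod(Xpw) / (nrow(Xpw)-1), 10)        # = identity: uncorrelated, unit variances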
==> Find roots RX and RY of CXX and CYY. Could be RX = CXX^{1/2}, RY = CYY^{1/2}, but any RX^T RX = CXX, RY^T RY = CYY will do. Can be obtained, e.g., with a Cholesky decomp in R: chol() CXX = RX^T RX CYY = RY^T RY aa = RX a # Coordinate/basis change bb = RY b # Same . Why the terms 'sphering' and 'pre-whitening'? Xpw := X RX^{-1} satisfies (assuming centered X): cov(Xpw) = Xpw^T Xpw / (N-1) = RX^{-T} X^T X RX^{-1} / (N-1) = RX^{-T} CXX RX^{-1} = I (pxp) ==> Xpw has columns that are uncorrelated and have equal variance (=1). ~ 'sphered' or 'spherically symmetric' covariance matrix ~ 'white noise' if Gaussian (data version, though) Dito for Ypw := Y RY^{-1}: cov(Ypw) = I (qxq) . Continue with CCA problem: cor(X a, Y b) = a^T CXY b / sqrt(a^T CXX a * b^T CYY b) = aa^T RX^{-T} CXY RY^{-1} bb / sqrt(aa^T aa bb^T b) = aa^T (RX^{-T} (X^T Y) RY^{-1}) bb / (|aa| |bb|) = max/min/stationary wrt aa and bb ==> Perform an SVD of RX^{-T} CXY RY^{-1}, then back-transform: a <- RX^{-1} aa; b <- RY^{-1} bb . The pretty form of the CCA problem: RX = CXX^{-1/2}, RY = CYY^{-1/2}, both symmetric. Hence: --------------------------------------------- | | | CCA = SVD of CXX^{-1/2} CXY CYY^{-1/2} | | | | up to a coordinate transformation | | | --------------------------------------------- Keep in mind the purpose of the front and back factors: 'sphering', 'pre-whitening' . Another way to think about sphering/pre-whitening: + Orthogonalize the columns of X and Y (separately) with modified Gram-Schmidt, + but use covariance as the inner product. and variance as squared length/norm. . Spatial/geometric interpretation: + Form the column spaces V.X = cspan(Xpw) = cspan(X) V.Y = cspan(Ypw) = cspan(Y) + Find x = Xpw aa in V.X and y = Ypw bb in V.Y that maximize cor(x,y) = cor(x, y) = cov(x, y) / (sd(x) sd(y)) = x^T y / (|x| |y|) (factor sqrt(N-1) cancels) ==> Maximize the cosine between x and y. ==> Minimize the angle between x and y. ------------------------------------------ | | | correlation = special case of cosine | | | ------------------------------------------ ==> Hierarchy of stationary cosines / stationary angles between the two subspaces. General fact: The stationary or principal angles between two subspaces are their orthogonal invariants. That is: Any two pairs of subspaces with the same principal angles can be orthogonally rotated into each other. HW: Show in the hierarchy of CC coeffs that xcoef[,1] _|_ ycoef[,2] in sphered coordinates or in the sense of covariance. . Another way to look at the CCA problem: + Preparation: The Nxp SVD as an (N+p)x(N+p) eigenproblem | I X | |u| |u| | X^T I | |v| = e |v| These eigenequations are the same as: u + X v = e u X^T u + v = e v or: X v = (e-1) u X^T u = (e-1) v i.e., the SVD problem with singular values d = e-1. Consequence: The Rayleigh quotient u^2 + v^2 + 2 u^T X v --------------------- is stationary u^2 + v^2 iff the bi-Rayleigh quotient (u^T X v)^2 ----------- is stationary. |u|^2 |v|^2 The stationary values of the latter are the stationary values of the former - 1. [Cute problem: Show u^2 = v^2 for the eigenvectors (u^T,v^T)^T of the eigenproblem.] + Applied to the sphered CCA problem: aa^T CXX^{-1/2} CXY CYY^{-1/2} bb --------------------------------- is stationary |aa| |bb| iff aa^2 + bb^2 + 2 aa^T CXX^{-1/2} CXY CYY^{-1/2} bb ------------------------------------------------- is stationary. 
aa^2 + bb^2 + Meaning of this Rayleigh quotient: do back translation a = CXX^{-1/2} aa b = CYY^{-1/2} bb aa = CXX^{1/2} a bb = CYY^{1/2} b aa^2 = a CXX a = V[X a] bb^2 = b CYY b = V[Y b] aa^T CXX^{-1/2} CXY CYY^{-1/2} bb = a^T CXY b = V[X a, Y b] Rayleigh quotient: V[X a] + V[Y b] + 2 V[X a, Y b] V[X a + Y b] ------------------------------- = --------------- V[X a] + V[Y b] V[X a] + V[Y b] Remember the connection to the canonical correlations: canonical correlations = Rayleigh stationary values - 1 . Similarity between regression and CCA: + What happens if you throw highly correlated variables into the X block? (Similar for the Y block.) ... + What is the problem called? ... + What can you do about it? ... subset selection in both blocks; shrink them; block-internal dim reduction + What kind of inference would you like to have for the coefficients? ... t-based inference + What would you miss if you performed inference like in regression? ... + How would you perform such inference anyway? ... - Software: The following is inefficient but pretty code. It uses the symmetric roots CXX^{1/2} and CXX^{-1/2}. cca <- function(X, Y, CV=FALSE, center=TRUE, scale=TRUE) { # CV=TRUE: want canonical variates X <- as.matrix(X) Y <- as.matrix(Y) N <- nrow(X); if(nrow(Y) != N) { cat("X and Y row numbers not the same...\n"); return() } Xs <- scale(X, center=center, scale=scale) Ys <- scale(Y, center=center, scale=scale) CXX <- t(Xs) %*% Xs / (N-1) # pedestrian cov() or cor() CYY <- t(Ys) %*% Ys / (N-1) CXY <- t(Xs) %*% Ys / (N-1) CXX.eig <- eigen(CXX); CXX.eig$values <- pmax(0,CXX.eig$values) CYY.eig <- eigen(CYY); CYY.eig$values <- pmax(0,CYY.eig$values) RX <- CXX.eig$vec %*% diag(CXX.eig$val^{ 1/2}) %*% t(CXX.eig$vec) RX.inv <- CXX.eig$vec %*% diag(CXX.eig$val^{-1/2}) %*% t(CXX.eig$vec) RY <- CYY.eig$vec %*% diag(CYY.eig$val^{ 1/2}) %*% t(CYY.eig$vec) RY.inv <- CYY.eig$vec %*% diag(CYY.eig$val^{-1/2}) %*% t(CYY.eig$vec) CXY.pw <- RX.inv %*% CXY %*% RY.inv CXY.pw.svd <- svd(CXY.pw) CCA <- list(cor = CXY.pw.svd$d, xcoef = RX.inv %*% CXY.pw.svd$u, ycoef = RY.inv %*% CXY.pw.svd$v ) rownames(CCA$xcoef) <- colnames(X) rownames(CCA$ycoef) <- colnames(Y) colnames(CCA$xcoef) <- paste("CV", 1:ncol(CCA$xcoef), sep="") colnames(CCA$ycoef) <- paste("CV", 1:ncol(CCA$ycoef), sep="") names(CCA$cor) <- paste("CV", 1:length(CCA$cor), sep="") if(CV) { CCA$xCV <- Xs %*% CCA$xcoef CCA$yCV <- Ys %*% CCA$ycoef CCA$xscaled <- Xs CCA$yscaled <- Ys } CCA } # Example application: X <- mktg[,4:8] Y <- mktg[,1:3] XY.cca <- cca(X,Y) XY.cca1 <- cancor(scale(X), scale(Y)) XY.cca; XY.cca1 # cancor() has strange scaling of coeffs: round(crossprod(scale(X) %*% XY.cca1$xcoef), 10) # ==> CVs have unit Euclidean length, not unit variance... # Our CVs are sphered, in particular they have unit variance: XY.cca <- cca(X,Y,CV=T) round(cov(XY.cca$xCV), 10) round(cov(XY.cca$yCV), 10) - Plots: . Canonical correlation profile: plot(XY.cca$cor, ylim=c(0,1.2*max(XY.cca$cor)), pch=16, cex=1.5, xaxt="n", xlab="CV Order", ylab="Can Corr") axis(side=1, at=seq(along=XY.cca$cor), lab=names(XY.cca$cor)) lines(XY.cca$cor, lwd=2) abline(h=0, col="blue") . 
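     One more numeric check before the coefficient plots (a sketch; it reuses XY.cca from the
     example above, which was computed with CV=T): matched left/right CVs reproduce the canonical
     correlations, while unmatched pairs are uncorrelated.
       round(diag(cor(XY.cca$xCV, XY.cca$yCV)) - XY.cca$cor, 10)   # matched pairs: the can. corrs
       round(cor(XY.cca$xCV, XY.cca$yCV), 3)                       # off-diagonal entries ~ 0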
CC coefficient plots: x-coeffs only; should be packaged in a function pq <- length(XY.cca$cor) clrs <- sample(colors()[25:60], size=pq) # Random colors plot(c(0,nrow(XY.cca$xcoef)+0.5), c(-1,1)*max(abs(XY.cca$xcoef)), xaxt="n", xlab="X-Variables", ylab="CCA X-Coeffs", type="n") abline(h=0, col="gray") for(i in 1:pq) lines(1:nrow(XY.cca$xcoef), XY.cca$xcoef[,i], lwd=2, col=clrs[i]) for(i in 1:pq) points(1:nrow(XY.cca$xcoef), XY.cca$xcoef[,i], pch=16, col=clrs[i]) text(rep(1,pq), XY.cca$xcoef[1,], names(XY.cca$cor), adj=1.2) axis(side=1, at=1:nrow(XY.cca$xcoef), lab=rownames(XY.cca$xcoef)) . CVs: In which frames do you see the canonical correlations? This is a considerable trick question!!! windows(); lims <- c(-4,4) pairs.plus(cbind(XY.cca$xCV, XY.cca$yCV), cex=.5, xlim=lims, ylim=lims) Remind yourself of the CCA structure: round(cor(cbind(XY.cca$xCV, XY.cca$yCV)), 3) Go back to the pairwise plots and wonder why some of them show zero correlations! Here is version with some jitter added: pairs.plus(cbind(XY.cca$xCV, XY.cca$yCV)+runif(len(c(XY.cca$xCV, XY.cca$yCV)),-.2,+.2), pch=".", cex=1.5, xlim=lims, ylim=lims) Do you see the zero correlations better now? - Permutation inference for canonical correlations: . Plot the profile, which has only length 3 in the example: N <- nrow(mktg) sel <- sample(N, size=400) # sel <- T # select all X <- mktg[sel,4:8] Y <- mktg[sel,1:3] XY.cca <- cca(X,Y,CV=T) pq <- length(XY.cca$cor) plot(XY.cca$cor, pch=16, cex=2, type="b", ylim=c(0,1.2*max(XY.cca$cor)), xlim=c(0,pq+1), ylab="Canonical Correlations", xaxt="n", xlab="Order of Canonical Correlations") axis(side=1, at=1:pq, lab=paste("CC",1:pq,sep="")) abline(h=0, col="gray") . Permutation inference: H0 = ... Spaghetti decoration: add to the above for(i in 1:100) lines(cca(X,Y[sample(nrow(Y)),])$cor, col="gray") lines(XY.cca$cor, lwd=2); points(XY.cca$cor, pch=16, cex=2) Vary 'size=...' above: 25, 100, 400, 1600 . Questions: + Do we have a budget constraint on the canonical correlations? ... + Is it possible that all can.corrs. = 1? What would it mean? ... + Implications for testing the canonical correlations with an independence assumption? ... - Bootstrap inference for canonical correlations: . BS simulation: Nboot <- 1000 XY.cca.bs <- replicate(Nboot, { sel <- sample(nrow(X), replace=T) cca(X[sel,], Y[sel,])$cor }) dim(XY.cca.bs) XY.cca.bs.q.lo.hi <- apply(XY.cca.bs, 1, quantile, prob=c(.025,.975)) XY.cca <- cca(X,Y) # observed . Graphical presentation: add bias/skewness-correcting CIs to the above plot lines(c(rbind(1:pq,1:pq,NA)), c(rbind(pmax(2*XY.cca$cor - XY.cca.bs.q.lo.hi[1,], 0), pmax(2*XY.cca$cor - XY.cca.bs.q.lo.hi[2,], 0), # Why pmax(...,0)? NA)), lwd=2, col="red") . Same for coefficients: XY.cca.xcoef.bs <- replicate(Nboot, { sel <- sample(nrow(X), replace=T) cca(X[sel,], Y[sel,])$xcoef }) dim(XY.cca.xcoef.bs) XY.cca.xcoef.bs.q.lo.hi <- apply(XY.cca.xcoef.bs, c(1,2), quantile, prob=c(.025,.975)) dim(XY.cca.xcoef.bs.q.lo.hi) XY.cca <- cca(X,Y) # observed # Print for each CV the lower bound, the estimate, and the upper bound: for(j in 1:ncol(XY.cca$xcoef)) { cat("\nCV",j,":\n") print(cbind(Lower = 2*XY.cca$xcoef[,j] - XY.cca.xcoef.bs.q.lo.hi[2,,j], Coeff = XY.cca$xcoef[,j], Upper = 2*XY.cca$xcoef[,j] - XY.cca.xcoef.bs.q.lo.hi[1,,j] )) } ================================================================ * GENERALIZED CANONICAL ANALYSIS -- GCA: - Consider a three-block situation . Data version: three matrices X (Nxp), Y (Nxq), Z (Nxr) Assume all columns centered. Lin combs: X a, Y b, Z c . 
   Population version: Xvec (px1), Yvec (qx1), Zvec (rx1)
     Assume all random variables centered.
     Lin combs: a^T Xvec, b^T Yvec, c^T Zvec
   . Proceed with data version in what follows.
   . Goal: Find linear combinations of the three blocks that somehow
     maximally correlate with each other...
     Q: What could that mean ?????
 - Block-internal correlation structure:
   . Should be irrelevant, as in CCA ==> sphere the blocks.
   . Sphering/pre-whitening:
       Xpw = X CXX^{-1/2}   ==>  Xpw^T Xpw / (N-1) = I
       Ypw = Y CYY^{-1/2}   ==>  Ypw^T Ypw / (N-1) = I
       Zpw = Z CZZ^{-1/2}   ==>  Zpw^T Zpw / (N-1) = I
       aa = CXX^{1/2} a     ==>  Xpw aa = X a
       bb = CYY^{1/2} b     ==>  Ypw bb = Y b
       cc = CZZ^{1/2} c     ==>  Zpw cc = Z c
   . If |aa|^2 = 1, then V[Xpw aa] = 1;  same for |bb|^2 = 1 and |cc|^2 = 1.
 - Approach 1:
   . Criterion for 3-block GCA: Oh, well, how about this?
       crit(a,b,c) := cor(Xa,Yb) + cor(Xa,Zc) + cor(Yb,Zc)
   . Pre-whitened:
       cor(Xa,Yb) = cor(Xpw aa, Ypw bb) = aa^T Xpw^T Ypw bb / (|aa||bb|)
       cor(Xa,Zc) = cor(Xpw aa, Zpw cc) = aa^T Xpw^T Zpw cc / (|aa||cc|)
       cor(Yb,Zc) = cor(Ypw bb, Zpw cc) = bb^T Ypw^T Zpw cc / (|bb||cc|)
   . Stationary solutions wrt aa, bb, cc:   'grad_aa' = gradient wrt aa
       grad_aa [ cor(Xpw aa, Ypw bb) + cor(Xpw aa, Zpw cc) + cor(Ypw bb, Zpw cc) ]
       = grad_aa [ cor(Xpw aa, Ypw bb) + cor(Xpw aa, Zpw cc) ]
       = grad_aa [ aa^T Xpw^T Ypw bb / (|aa||bb|) + aa^T Xpw^T Zpw cc / (|aa||cc|) ]
       = grad_aa [ (aa/|aa|)^T Xpw^T ( Ypw bb/|bb| + Zpw cc/|cc| ) ]
       = grad_aa [ (aa/|aa|)^T v ]     where v = all to the right on the previous line
       = ( |aa| v - (aa^T v) aa/|aa| ) / |aa|^2
       = 0   iff   v = (aa^T v)/|aa|^2 aa   where v = Xpw^T ( Ypw bb/|bb| + Zpw cc/|cc| )
     ==> Xpw^T ( Ypw bb/|bb| + Zpw cc/|cc| ) = const(aa) * aa
         Ypw^T ( Xpw aa/|aa| + Zpw cc/|cc| ) = const(bb) * bb
         Zpw^T ( Xpw aa/|aa| + Ypw bb/|bb| ) = const(cc) * cc
     ==> Not a set of eigen equations, whichever way you look at it,
         though this might be an interesting problem, too.
         What makes this NOT an eigenproblem?
         + The independent normalizations aa/|aa|, bb/|bb|, cc/|cc| in each block.
         + The different 'eigenvalues' const(aa), const(bb), const(cc)
           in each block of equations.
 - Approach 2:
   . Learn from Approach 1 and modify the stationary equations
     so they become eigenequations.
       Xpw^T ( Ypw bb + Zpw cc ) = alpha * aa
       Ypw^T ( Xpw aa + Zpw cc ) = alpha * bb
       Zpw^T ( Xpw aa + Ypw bb ) = alpha * cc
   . Recall:  Xpw^T Ypw /(N-1) = CXX^{-1/2} CXY CYY^{-1/2} =: CXY.pw
     Similar for the other two cross-covariances.
   . So the above system is the eigen equation for this matrix:
       |   0      CXY.pw  CXZ.pw |
       | CYX.pw     0     CYZ.pw |
       | CZX.pw  CZY.pw     0    |
   . We may add the identity and obtain the same eigenvectors,
     with eigenvalues shifted by +1:
       |   I      CXY.pw  CXZ.pw |
       | CYX.pw     I     CYZ.pw |
       | CZX.pw  CZY.pw     I    |
     These identity matrices can be interpreted as CXX.pw, CYY.pw, CZZ.pw:
       | CXX.pw  CXY.pw  CXZ.pw |
       | CYX.pw  CYY.pw  CYZ.pw |
       | CZX.pw  CZY.pw  CZZ.pw |
   . The eigen equations:
       | CXX.pw  CXY.pw  CXZ.pw | |aa|          |aa|
       | CYX.pw  CYY.pw  CYZ.pw | |bb|  = beta  |bb|
       | CZX.pw  CZY.pw  CZZ.pw | |cc|          |cc|
Associated Rayleigh quotient: V[ X.pw aa + Y.pw bb + Z.pw cc ] ------------------------------------------ V[ X.pw aa ] + V[ Y.pw bb ] + V[ Z.pw cc ] - Larger conclusion: --------------------------------------------------------------- | | | A NULL COMPARISON PRINCIPLE OF MULTIVARIATE ANALYSIS: | | | | Classical multivariate analysis is a variance comparison, | | one variance being the of observed linear combinations, | | the other being the variance calculated under an assumption | | of absent correlation. | | | --------------------------------------------------------------- . Special cases: Null/denominator variances for PCA, CCA and GCA o PCA: Var.0(X a) = sum aj^2 Var(Xj) o CCA: Var.0(X a + Y b) = Var(X a) + Var(Y b) o GCA: Var.0(X a + Y b + Z c) = Var(X a) + Var(Y b) + Var(Z c) . Correlation- vs covariance-based PCA in the null comparison framework: Var.0(X a) = sum aj^2 Var(Xj) results in correlation-based PCA. Covariance-based PCA can be covered by adding to the null assumption Var(Xj) = same (j=1...p) The null variance becomes: Var.0(X a) = sum aj^2 * ave(Var(Xj)) I.e., Var.0(X a) = sum aj^2, the Euclidean norm, as ave(Var(Xj)) is just a known constant. . Observations for GCA: + Unlike the two-block CCA problem, the three denominator variances will not generally be equal at stationarity. Example: If the Z block is uncorrelated from the X and Y blocks, then stationary solutions involving X and Y will set cc=0. + The three CCA problems of one block vs the union of the two others, in general do not produce the three-block GCA problem: CCA of X vs (Y,Z) CCA of Y vs (X,Z) CCA of Z vs (X,Y) GCA of X vs Y vs Z These are in general 4 separate problems. Which one is chosen depends on subject matter. + Lesson from Approach 1 and Approach 2: Do not obsess over one formulation of a problem. If a problem formulation is unwieldy, try another one that might be more manageable and whose solutions provide relevant answers. - Code for GCA: gca <- function(..., center=T, scale=T, GCV=FALSE) { # GCV=TRUE: want GCV variates blocks <- list(...) if(length(blocks)==1 & is.list(blocks[[1]])) blocks <- blocks[[1]] if(length(blocks)<2) { cat("Fewer than 2 blocks provided.\n"); return() } for(i in 1:length(blocks)) blocks[[i]] <- as.matrix(blocks[[i]]) N <- unique(sapply(blocks, nrow)) if(length(N)!=1) { cat("Row numbers of blocks not the same.\n"); return() } blocks.s <- lapply(blocks, scale, center=center, scale=scale) roots <- lapply(blocks.s, function(X) { CXX <- crossprod(X)/(nrow(X)-1); CXX.eig <- eigen(CXX) RX <- CXX.eig$vec %*% diag(CXX.eig$val^{ 1/2}) %*% t(CXX.eig$vec) RX.inv <- CXX.eig$vec %*% diag(CXX.eig$val^{-1/2}) %*% t(CXX.eig$vec) list(raw=RX, inv=RX.inv, eig=CXX.eig) } ) blocks.pw <- lapply(1:length(blocks), function(j) blocks.s[[j]] %*% roots[[j]]$inv ) # all.pw.cbind <- matrix(unlist(blocks.pw), nrow=N) # Better ways to cbind from a list??? # YES! 
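    # (Aside added here: do.call(f, lst) calls f with the elements of the list lst as its
    #  arguments, i.e., do.call(cbind, list(A,B,C)) is cbind(A,B,C) -- handy when the number
    #  of blocks is not fixed in advance.)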
Thanks to Josh: 'do.call()' all.pw.cbind <- do.call(cbind, blocks.pw) block.indeces <- rep(1:length(blocks.pw), sapply(blocks.pw, ncol)) COV <- cov(all.pw.cbind) GCA <- eigen(COV) names(GCA$values) <- paste("CV", 1:length(GCA$val), sep="") colnames(GCA$vectors) <- paste("CV", 1:ncol(GCA$vectors), sep="") for(j in unique(block.indeces)) GCA$vectors[block.indeces==j,] <- roots[[j]]$inv %*% GCA$vectors[block.indeces==j,] rownames(GCA$vectors) <- paste("(",block.indeces,") ",unlist(sapply(blocks,colnames)),sep="") colnames(GCA$vectors) <- paste("CV", 1:ncol(GCA$vectors), sep="") if(GCV) { GCA$CV <- GCA$vectors } GCA } # Example application: mktg.gca <- gca(mktg[,1:3], mktg[,4:6], mktg[,7:8]) mktg.gca # Yes, you can interpret the coefficients as 'strengths' # because by default 'gca()' standardizes all columns. - Correspondence between CCA and GCA: Two-block GCA should agree with CCA. XY.cca <- cca(mktg[,1:3], mktg[,4:8]) XY.gca <- gca(mktg[,1:3], mktg[,4:8]) XY.cca; XY.gca ==> The canonical correlation values are the first min(p,q) GCA eigenvalues minus 1. The GCA eigenvalues below +1 are 1-canonical correlations in reverse order. The GCA eigenvalues between min(p,q) from both ends are exactly +1. ==> GCA gives the same information as CCA. We can extract the first min(p,q) GCVs and toss the rest. Compare the coefficients: windows(w=2.7, h=8) par(mfrow=c(3,1), mgp=c(1.5,.5,0), mar=c(3,3,1,1)) for(j in 1:3) plot.plus(cbind(c(XY.cca$xcoef[,j]), c(XY.gca$vectors[1:3,j]))) for(j in 1:3) plot.plus(cbind(c(XY.cca$ycoef[,j]), c(XY.gca$vectors[4:8,j]))) ==> Up to scaling the coefficients are the same. ================================================================ * FISHER''S OPTIMAL SCORING: CCA APPLIED TO CATEGORICAL DATA - Consider two categorical variables such as mktg[,c("edu","seg")] Well, "edu" is ordinal, but still. - Q: How do we measure association between two categorical variables? (They can be nominal or ordinal.) A: ... Standard answer: the chi-squared statistic for independence Chi-squared recipe: . tabulate . calculate expected counts under independence . sum up over all cells: (obs.count - exp.count)^2/exp.count . compare with a chi-squared distribution with ... degrees of freedom . Stine-Foster transform chi-squared to Cramer''s V for normalization ... - Alternative: correlation . Naive: if the levels are ordinal, score them with integers 1,2,...; use the integer scores as if the variables were quantitative. . Idea: Assign scores to the categories, then treat the variable as quantitative. . Next idea: Optimize the scores of both variables wrt correlation. ==> Fisher''s optimal scoring! - Execution of the idea of Fisher''s optimal scoring: . Introduce two blocks of dummy variables associated with the two categorical variables: X <- cbind(....) Y <- cbind(....) . If the scores for the categories are collected in vectors 'a' and 'b', then the scored variables are: X a and Y b . Maximize 'a' and 'b' wrt to correlation: cor(X%*%a, Y%*%b) - Example: Dummify (spelling?) a categorical variable -- R coding make.dummies <- function(categ) { # Thanks, Josh! categ.f <- factor(categ) # Make sure the categorial variable is a factor. X <- do.call(cbind, lapply(levels(categ.f), function(lev) as.numeric(categ.f==lev))) colnames(X) <- levels(categ.f) # cbind() does not see the column labels rownames(X) <- names(categ) # Carry over the names if any. X } data.frame(mktg[1:10,"edu"], make.dummies(mktg[1:10,"edu"])) # Seems to work... - Example: Try CCA on the dummy blocks of two categorical variables. 
There shall be issues! x <- mktg[,"edu"] y <- mktg[,"age"] X <- make.dummies(x) Y <- make.dummies(y) cca(X, Y) # Where could this have gone wrong? # ... # Remember the issue with a complete set of dummies? # ... # How does it affect CCA? # ... # What is the standard solution? # ... XY.cca <- cca(X[,-1], Y[,-1]) XY.cca XY.cca <- cca(X[,-2], Y[,-2]) XY.cca # New issue: We are now missing the score # for the left-out dummy. What is it? # (Recall how the CVs are computed!) # ... # ==> This is an asymmetric way of scoring, # making the left-out category the base. - Example, same data but another approach: # Don't center, only scale the dummies! # (Check the code for cca() earlier; it accommodates # scale(..., center=T/F, scale=T/F).) XY.cca <- cca(X, Y, center=F, scale=F, CV=T) XY.cca[1:3] # Ignore the variates; corrs and coeffs only ## $cor ## CV1 CV2 CV3 CV4 CV5 ## 1.00000000 0.21718116 0.08657873 0.06404319 0.02222852 ## $xcoef ## CV1 CV2 CV3 CV4 CV5 ## 0 -0.9997404 -6.270201355 1.74918526 7.0569367 -9.42699080 ## 1 -0.9997404 -5.861549967 1.22444403 5.0793966 -4.41281153 ## 2 -0.9997404 -2.994472073 -1.97064994 0.8012265 3.60216868 ## 3 -0.9997404 -0.511181415 1.11051688 -0.9943952 0.01677355 ## 4 -0.9997404 0.310283343 0.13067035 0.4708330 0.26974546 ## 5 -0.9997404 1.155826412 0.07483341 1.0816856 0.12792218 ## 6 -0.9997404 -0.002502345 -1.64999925 -0.8206683 -1.00345941 ## $ycoef ## CV1 CV2 CV3 CV4 CV5 ## 0 -0.9997404 1.24516310 -0.2394911 2.3577747 1.5899922 ## 1 -0.9997404 0.86168024 1.7881059 -0.4111201 -0.4244355 ## 2 -0.9997404 0.74751366 -1.2375788 -0.1148203 -1.2301481 ## 3 -0.9997404 -0.03049887 -0.5300292 -1.4477710 1.4623189 ## 4 -0.9997404 -1.36176958 0.1514422 0.4519488 -0.2095481 # Tell the story of the second CCA!!! Age vs Edu # What do you expect the first CV to be? range(XY.cca.$xCV[,1]) range(XY.cca$yCV[,1]) # Explain: ... # Hence where do you expect to be the first non-trivial CV? plot.plus(cbind(-XY.cca$xCV[,2], -XY.cca$yCV[,2]), jitter=.5, xlab="edu", ylab="age", col=c("red","blue","green","brown")[mktg[,"seg"]]) abline(lm(XY.cca$yCV[,2] ~ XY.cca$xCV[,2]), col="gray", lwd=2) - Comparison of removing a dummy and not centering: . Is there a difference in scores for the categories as found in cca()$xcoef and cca()ycoef? ... cca(X[,-1], Y[,-1]) cca(X, Y, center=F) Yes, there is because + when removing a dummy, the base score is forced to zero; + when not centering, the CVs must have zero cross-moment with (1,1,1,...) because this is the first uncentered CV. . Is there a difference in CVs as found by cca()$xCV and cca()$yCV? XY.cca.0 <- cca(X[,-1],Y[,-1], CV=T) XY.cca.2 <- cca(X,Y,center=F, CV=T) pairs.plus(cbind(XY.cca.0$xCV, XY.cca.2$xCV)) Apparently not! Have a better look: windows(w=3, h=9) par(mfrow=c(3,1), mgp=c(1.5,.5,0), mar=c(3,3,1,1)) for(j in 1:3) { plot.plus(cbind(XY.cca.0$xCV[,j], XY.cca.2$xCV[,j+1]), jitter=0) abline(a=0, b=1); abline(a=0, b=-1) } . Q: How is it possible that the CVs are the same when the base category was scored with zero? That''s a tough one! ... A: When we drop one dummy, CCA centers the dummies!!! It isn''t the left-out dummy that has a zero coefficient; rather, it is its centered version!!! Centering of the included dummies moves the score of the base category away from zero. - Summary: . Fisher''s optimal scoring describes the association between two categorical variables in terms of correlation beween optimally scored categories in both variables. . 
The complete description of the association is given by + the hierarchy of scores and + the hierarchy of optimal correlations, of which there is more than one when both variables have more than one category. - Q: How would you describe the strength of the association? Is the optimal correlation the only way? XY.cca.2$cor What about the sum of stationary correlations squared? sum(XY.cca.2$cor[-1]^2) Show that this equals the squared Frobenius norm | CXX^{-1/2} CXY CYY^{-1/2} |^2 In some sense this is "total linear association" between the two blocks. - Matt''s observation: X^T Y can be interpreted as ... ================================================================ * MORE BACKGROUND ON CCA - Centering versus appending intercept columns: . In order to write the CCA problem with lin alg, we said we would center all the columns: X -> X.c, Y -> Y.c The CCA problem then becomes (X.c a)^T (Y.c b) cor(X a, Y b) = ----------------- = max/stationary_{a,b} |X.c a| * |Y.c b| . Alternatively, we can augment X and Y both with an intercept column: X -> X.i = (1,X), Y -> Y.i = (1,Y) The new CCA problem is now in terms of uncentered cross-moments: (X.i a)^T (Y.i b) cross(X.i a, Y.i b) = ----------------- = max/stationary_{a,b} |X.i a| * |Y.i b| This problem is equivalent to the correlation/centering-based problem because it generates a trivial solution 1=1 with perfect cross()=1, but all subsequent solutions will be orthogonal to 1, hence centered. . In the future we will therefore write X and Y but mean either a centered version or an intercept-augmented version. . Other variants of the idea include categorical variables with a complete block of dummies that are collinear with the intercept vector. . Q: Remind yourself, what is the notion of orthogonality in CCA? A: Orthogonal means the variates are uncorrelated or have zero cross-moment. The metric is |X a|, not |a|, and |Y b|, not |b|. Correspondingly orthogonality means (X a)^T (Y b) = 0, NOT a^T b = 0. - Equivalence of LS and max-corr for regression: Consider a regression situation, modeling y ~ X a. Consider three criteria: RSS(a) = |y - X a|^2 (X centered or with intercept) y^T X a cross(a) = --------- (Interpretation?) |y| |X a| |y - X a|^2 sph(a) = ----------- (Interpretation?) |X a|^2 Fact 0: argmin cross(a) is determined only up to a factor > 0. Fact 1: argmin cross(a) ~ argmin RSS(a) up to a factor > 0. Proof: Stationarity of cross(a) yields ... X^T y - ... X^T X a = 0 where ... are scalars. It follows a ~ (X^T X)^{-1} X^T y up to a factor.. Fact 2: argmin sph(a) ~ argmin RSS(a) up to a factor > 0. Proof: Stationarity of sph(a) yields (-2 X^T y + 2 X^T X a) ... - ... (2 X^T X a) = 0 where again ... are scalars. It follows a ~ (X^T X)^{-1} X^T y up to a factor.. - Extension of the equivalence to CCA: . The CCA problem is b^T Y^T X a cross(a) = ------------ |Y b| |X a| . Compare with the asymmetric LS criterion: |Y b - X a|^2 LS(a,b) = ------------- |Y b|^2 . Note: LS(b) = min_a LS(a,b) determines b only up to a factor. . Stationarities: d/da cross = 0: a ~ (X^T X)^{-1} X^T Y b d/db cross = 0: b ~ (Y^T Y)^{-1} Y^T X a d/da LS = 0: a = (X^T X)^{-1} X^T Y b d/db LS = 0: b ~ (Y^T Y)^{-1} Y^T X a ==> Directionally the same solutions. . Corollary: CCA can be solved by iterative regressions. . Minimize explicitly min_a LS(a,b): .................... min_a |Yb - Xa|^2/|Yb|^2 = |Yb - H_X Yb|^2 / |Yb|^2 = |(I-H_X)Yb|^2 / |Yb|^2 Partial wrt b: ~ Y^T (I-H_X) Y b * ... - ... * Y^T Y b = 0 ==> Y^T (I-H_X) Y b ~ Y^T Y b ==> (Y^T Y)^{-1} Y^T (I-H_X) Y b = ... 
b (eigen equation for b) ==> Y (Y^T Y)^{-1} Y^T (I-H_X) Y b = ... Yb (eigen equation for b) ==> H_y (I-H_X) Y b = ... Yb (eigen equation for b) ==> Yb - H_Y H_X Yb = ... Yb ==> H_Y H_X Yb = ... Yb ==> Yb is eigenvectors of H_Y H_X Do they exist??? Surprise: H_Y H_X: cspan(Y) -> cspan(Y) is symmetric and >=0 ! Proof: ... ================================================================ * FISHER''S LINEAR DISCRIMINANT ANALYSIS: - Data situation: . Grouped data, start with TWO groups (for now). . Formalize the data as coming from two populations. - Question 1: If we could observe only a single feature X to tell the populations/groups apart, what characteristics would we like it to have? Assume you know the class-conditional densities p(x|1) and p(x|0). ... - Question 2: If we were only told the two group means mu1 and mu2 and the two variances sig1^2 and sig2^2 of the quantitative feature X, what characteristics would we like them to have? ... - Question 3: Given p variables (collected in Xvec) characterizing the members of the two groups, if you could not observe Xvec but had the choice to make up an arbitrary univariate function f(Xvec) for telling the groups apart, what would f(Xvec) be? Again, assume you knew p(xvec|1) and p(xvec|0). ... - Question 4: Given p features/attributes/variables Xvec on the members of the two groups, if we only knew E(Xvec|0)=muvec0 and E(Xvec|1)=muvec1 and V(Xvec|0)=Sig0 and E(Xvec|1)=Sig1, and if you were given the choice to make up a linear (!) function of Xvec to tell the groups apart, what would that function beta^T Xvec be? ... - If you answered Q4 in Fisher''s way, you''d have a generalized Rayleigh quotient to maximize. If you don''t have it in Rayleigh form yet, get it there. It should be a ratio of two quadratic forms. ... [Fisher called the numerator "between-groups variance" and the denominator "within-groups variance".] a^T (mu1 - mu0)(mu1 - mu0)^T a ------------------------------ a^T (Sig1 + Sig0) a - Execute the proper sphering to reduce the generalized eigenproblem to a simple eigenproblem. ... a = (Sig1 + Sig0)^{-1/2} at at = (Sig1 + Sig0)^{+1/2} a at^T (Sig1 + Sig0)^{-1/2} (mu1 - mu0)(mu1 - mu0)^T (Sig1 + Sig0)^{-1/2} at -------------------------------------------------------------------------- at^T at - What rank is the matrix in the Rayleigh numerator? ... - Use the rank info to find the explicit solution to the eigenproblem, then backtransform. ... 1 eigenvector at ~ (Sig1 + Sig0)^{-1/2} (mu1 - mu0) backtransformed a ~ (Sig1 + Sig0)^{-1} (mu1 - mu0) - Deviating from Fisher, conventional linear discriminant analysis or "LDA" assumes the covariance matrices Sig0 and Sig1 are the same: Sig0 = Sig1 =: Sig Execute the simplification that results for the best linear function of the variables to tell the groups apart: ... a ~ Sig^{-1} (mu1 - mu0) Comment: As illuminating the 2-class case is, we don''t see the connection with CCA yet. It is better to deal with the multi-class case. ---------------- - Multiclass Discriminant Analysis (linear or quadratic): . Program: Make idealized model calculations for the general multi-class classification problem. . Model: For covariates x (a random vector, xvec earlier) and classes j=1,...J, we assume we know the class-conditional distribution of x: p(x|j) For each class we are also know a prior p(j) Think of it as the marginal distribution of the classes (nothing Bayesian here). ==> Statistical decision problem with a discrete parameter 'j'. . 
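   Quick aside before developing the Bayes rule below -- a numeric check of the two-class answer
   a ~ Sig^{-1} (mu1 - mu0) against MASS::lda(), using the first two 'mktg' segments
   (a sketch; it assumes 'mktg' with the segment variable in column 9, as used earlier):
     grp2 <- sort(unique(mktg[,9]))[1:2]                    # first two segments
     sel2 <- mktg[,9] %in% grp2
     X2 <- as.matrix(mktg[sel2, 1:8]);  g2 <- mktg[sel2, 9]
     mu.diff  <- colMeans(X2[g2==grp2[2],]) - colMeans(X2[g2==grp2[1],])
     n1 <- sum(g2==grp2[1]);  n2 <- sum(g2==grp2[2])
     Sig.pool <- ((n1-1)*cov(X2[g2==grp2[1],]) + (n2-1)*cov(X2[g2==grp2[2],])) / (n1+n2-2)
     a.fisher <- solve(Sig.pool, mu.diff)                   # Sig^{-1} (mu1 - mu0)
     a.lda    <- lda(X2, grouping=factor(g2))$scaling[,1]   # lda() from MASS
     round(a.fisher / a.lda, 4)    # constant entries (up to rounding) ==> same direction up to scale/sign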
   The Bayes procedure is to pick the j that has the highest posterior probability given x:
     p(j|x) = p(x|j)*p(j) / sum_{k=1...J} p(x|k)*p(k)     (Bayes theorem)
     jhat(x) <- argmax_j p(j|x)
              = argmax_j p(x|j)*p(j)
              = argmin_j [ -log( p(x|j) ) - log(p(j)) ]
   . Task: Assume the covariates to be multivariate normal.
       p(x|j) = N(mu.j, Sig.j)
     Initially allow different Sig.j in each group.
     Later specialize to a common Sig = Sig.j.
     Calculate jhat(x)!
     ...   (Too tedious for print.)
       jhat(x) = argmin_j [ (x-mu.j)^T Sig.j^{-1} (x-mu.j) + log det Sig.j - 2 log p(j) ]
     (this is -2 times the log-posterior, up to a constant that does not depend on j)
     ==> 'Nearest class center classifier' where 'nearest' is in the sense of
         Mahalanobis distance for each class, with penalties for
         o large Sig.j:        + log det Sig.j
         o small class prior:  - 2 log p(j)
   . Assume now identical Sig.j =: Sig:
       jhat(x) = argmin_j [ -2 mu.j^T Sig^{-1} x + mu.j^T Sig^{-1} mu.j - 2 log p(j) ]
     ==> Still a 'Nearest class center classifier', penalized for small class size,
         but it turns out to be based on linear functions,
         hence creates linear separation boundaries.
   . Q: What are the classification boundaries for general Sig.j?
        What are the classification boundaries for shared Sig?
        ...
     A: Quadratic decision boundaries for general Sig.j,
        Linear decision boundaries for shared Sig.
     ==> Quadratic Discriminant Analysis or QDA
         vs Linear Discriminant Analysis or LDA
   . Q: Think through the decision boundaries in LDA for 2 classes!
     ...
   . Q: Think through the decision boundaries in LDA for 2, 3 and 4 classes!
     ...
   . Estimation:
     o The above optimal procedures assume that mu.j, Sig.j or Sig are known.
     o In practice they need to be estimated.
       ==> Require a "large N & small p" regime.
           ("Large p" requires additional assumptions and regularizations.)
   . Q: What are the trade-offs between QDA and LDA?
     ...
   . Even for LDA there might be too many parameters to estimate.
     Generate some ideas to reduce their number!
     + Estimate mu.j^T Sig^{-1}, not Sig^{-1}
     + Naive Bayes: assume Sig = diagonal
     + Ridge: regularize Sig^{-1} by replacing it with (Sig + lambda*Omega)^{-1}
       (Omega fixed, e.g. diagonal > 0)
   . Q: What is the posterior probability p(j|x) in LDA under normal assumptions?
     A:
                     exp( mu.j^T Sig^{-1} x - mu.j^T Sig^{-1} mu.j / 2 ) * p(j)
       p(j|x) = -----------------------------------------------------------------
                 sum_k exp( mu.k^T Sig^{-1} x - mu.k^T Sig^{-1} mu.k / 2 ) * p(k)
       (the quadratic term x^T Sig^{-1} x cancels between numerator and denominator)
       It is this function that estimates the x-conditional class probabilities,
       not the linear function mu.j^T Sig^{-1} x.
       The formula for p(j|x) provides the link function that maps the
       linear function mu.j^T Sig^{-1} x to actual probabilities.
 - Covariances:
   . Define three covariance matrices:
     + Total (marginal):     Sig.T := C[x]               (x = predictor random vector)
     + Within (conditional): Sig.W := sum_j C[x|j]*p(j)  (= Sig in LDA)
     + Between:              Sig.B := C[mu.j]            (j is random)
     (See below for calculations that show what these are.)
   . Comments:
     + Sig.W can have two interpretations:
       o Not assuming shared Sig.j, it is the average Sig.j.
       o Assuming shared Sig.j as in LDA, it is the conditional C[x|j].
       Idea: Following Fisher, you can avoid the shared Sig assumption and work with
             an average Sig instead and get (suboptimal) linear boundaries and
             decision rules, thereby avoiding the optimal but more parameter-rich QDA.
     + mu.j is a random vector with distribution p(j),
       which is why defining Sig.B as C[mu.j] is meaningful.
     + Identities: No normal assumptions, just definitions and 1st & 2nd order calculations.
o mu.j = E[ x | j ] o mu = E[mu.j] = sum_j mu.j*p(j) = E[E[x|j]] = E[x] Wlog, we can assume the data 'x' to be centered, mu=0, if this is convenient. o Sig.B = E[ (mu.j - mu) (mu.j - mu)^T ] # j is random. = sum_j (mu.j - mu) (mu.j - mu)^T p(j) o Sig.W = E[ E[ (x - mu.j) (x - mu.j)^T | j ] ] = sum_j C[x|j] * p(j) LDA: same for all j Sig.W = E[ (x - mu.j) (x - mu.j)^T | j ] o Sig.T = E[ (x-mu)(x-mu)^T ] = E[ ((x-mu.j)+(mu.j-mu)) ((x-mu.j)+(mu.j-mu))^T ] # telescoping = E[ E[ ((x-mu.j)+(mu.j-mu)) ((x-mu.j)+(mu.j-mu))^T | j ] ] = E[ E[ (x-mu.j)(x-mu.j)^T + (mu.j-mu)(x-mu.j)^T + (x-mu.j)(mu.j-mu)^T # both zero! + (mu.j-mu)(mu.j-mu)^T | j ] ] = E[ E[ (x-mu.j)(x-mu.j)^T ] | j ] + E[ (mu.j-mu)(mu.j-mu)^T ] = E[ Sig.j ] + Sig.B = Sig.W + Sig.B ------------------------------ | | | Sig.T = Sig.W + Sig.B | | | ------------------------------ No assumption of identical C[x|j]! Sig.W is just the average C[x|j]. + Q: What rank is Sig.B? A: ... Sig.B is a weighted sum of rank one matrices (mu.j-mu)(mu.j-mu)^T. There are J vectors (mu.j-mu), but they span only a (J-1)-dim space due to sum_j (mu.j-mu) = 0. Therefore the rank of Sig.B is no more than (J-1). (It can be even less if the vectors (mu.j-mu) are in some other way collinear, e.g., if they form a perfect chain on a line. This is unlikely in practice. Your typical estimate of Sig.B is of rank (J-1).) - LDA Dimension Reduction -- Take 1: . Goal: dimension reduction! . Generalizing the thought experiment from last class, if you could only use K < J-1 dimensions (= linear combinations of x) for classification, what would you choose? . Fisher: Maximize the "between-centers" variation while keeping the "Within-class" variation small. a^T Sig.B a R1(a) = ----------- = max_a a^T Sig.W a . Execute using appropriate sphering. ... . Use of the eigenvectors from R1(a): + Sphere the data 'x' and centers: xt = Sig.W^{-1/2} x mut.j = Sig.W^{-1/2} mu.j + Obtain the eigenvectors 'at.k' of Sig.W^{-1/2} Sig.B Sig.W^{-1/2} + Project the sphered data 'xt' and centers 'mut.j' onto the 'at.k'. + Plot the projected centers: at.2^T mut.j vs at.1^T mut.j if a K=2 dim reduction is chosen. + Assign the projected covariate vectors xt to the nearest projected center, where 'nearest' is in Euclidean distance due to the use of sphered coords. + This projection plot will give you a sense of how reasonable the linear boundaries are and how spherically distributed the classes are. . Caution: There is no correspondence between the order of eigenvectors and classes! . An equivalent Rayleigh quotient: !!!!!!!!!!!!!! a^T Sig.B a R2(a) = ----------- = max_a a^T Sig.T a R2(a) and R1(a) are equivalent in the sense that they generate eigenproblems with the same eigen vectors. Proof: Stationarity conditions R1(a): 2* Sig.B a * ... - ... 2* Sig.W a = 0 R2(a): 2* Sig.B a * ... - ... 2* Sig.T a = 0 = 2* Sig.B a * ... - ... 2* (Sig.W+Sig.B) a - Estimation: . Data: xij ~ p(x|j) iid, j=1...J, i=1...N.j . Assumption: E[N.j/N] = p(j) !!!! May not be true either. . Estimates: Replace "E[]" and "E[|j]" with empirical means. ==> Put hats on mu''s and Sig''s. . Write down the estimation version of the max-posterior rule. ... . Is this rule finite sample optimal? ... - Connection with CCA: LDA Dimension reduction -- Take 2 . Setup: Dummify the groups: Y-block Covariates: X-block Y X 1 0 0 x111 x112 x113 ... 1 0 0 x211 x212 x213 ... 0 1 0 x121 x122 x123 ... 0 1 0 x221 x222 x223 ... 0 1 0 x321 x322 x323 ... 0 0 1 x131 x132 x133 ... . 
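    The mktg example below builds this Y-block with a helper make.dummies(), whose code is not
    shown in this part of the notes.  Here is a minimal sketch of what such a dummifier could
    look like (an assumed stand-in, not the original function):
      make.dummies.sketch <- function(f) {
        f <- as.factor(f)
        d <- sapply(levels(f), function(lev) as.numeric(f == lev))  # one 0/1 indicator column per level
        colnames(d) <- levels(f)
        d
      }
      make.dummies.sketch(c("a","b","b","c"))   # small check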
Recall an asymmetric CCA formulation: LS(a,b) = | Yb - Xa |^2 / | Xa |^2 = min_{a,b} Minimize wrt b: Yb = H_Y Xa = argmin_Yb LS(a,b) Numerator term with minimizing Yb: Xa - Yb = (I - H_Y) Xa The LS(a,b) problem becomes: min_b (min_a LS(a,b)) = a^T X^T (I - H_Y) X a / a^T X^T X a + Q: What does (I - H_Y) do? A: ... ==> (I - H_Y) X = groupwise centered X matrix ==> [(I - H_Y) X]^T [(I - H_Y) X] / N = Sig.W (estimated) + Q: What is X^T X? A: Assuming the columns are centered we have X^T X / N = Sig.T As a consequence the CCA problem | Yb - Xa |^2 / | Xa |^2 = min/stat_{a,b} is equivalent to the LDA problem a^T Sig.B a / a^T Sig.T a = max/stat_a - Final Q: Why/when would you even want to use fewer dimensions for classification? A: It is a form of regularization; it reduces parameters estimated. It may improve out-of-sample classification performance (to be found only with CV, not in-sample performance). - Example: Mktg data -- take the segments as the 4 classes X <- scale(mktg[,-9]) cl <- mktg[,9] Y <- make.dummies(cl) seg.cca <- cca(X, Y, CV=T, center=F, scale=F) str(seg.cca) plot.plus(seg.cca$xCV[,c(1,2)], col=c("red","blue","green","black")[cl]) seg.means <- t(sapply(su(cl), function(j) apply(seg.cca$xCV[cl==j,c(1,2)], 2, mean))) points(seg.means, col=c("red","blue","green","black"), pch=16, cex=4) Why are the segment separations so strikingly clear? ---------------------------------------------------------------- * MULTIPLE CORRESPONDENCE ANALYSIS: GCA APPLIED TO CATEGORICAL DATA - Plain correspondence analysis is really Fisher''s optimal scoring with some bells and whistles. It was a cult in France centered on Benzecri. - Multiple correspondence analysis gives one possible solution to the reasonable problem of how to describe the association between multiple categorical variables. - Try: mktg.mca <- gca(make.dummies(mktg[,'age']), make.dummies(mktg[,'edu']), make.dummies(mktg[,'seg'])) round(mktg.mca$val, 3) round(mktg.mca$vec, 3) # Check the zero eigenvectors: something went wrong # ==> The way the gca calculates matrix roots for prewhitening (using eigen decomps) # has enough rounding error to create numerically non-singular matrices. # So it doesn't bomb, instead gives a largely correct solution except for the zero-evals. # The following is correct: mktg.mca <- gca(make.dummies(mktg[,'age']), make.dummies(mktg[,'edu']), make.dummies(mktg[,'seg']), center=F, scale=F) round(mktg.mca$val, 3) round(mktg.mca$vec, 3) ==> Gives reasonably monotone scores to 'age' and 'edu', and potentially reasonable scores to the nominal 'seg'. ---------------------------------------------------------------- * AN ALTERNATIVE TO MULTIPLE CORRESPONDENCE ANALYSIS: 'PRINCALS' OR PCA WITH ALTERNATING LS SCORING - Recount: . Problem: We have some categorical variables and we wish to perform PCA. . Naive approach: Use ordinal variables as quantitative, using sores 0, 1, 2... . Criticism of the naive approach: o It assumes the scores 0, 1, 2, ... make the ordinal variables quantitative. o For truly nominal non-binary data there is no remedy. - Idea of PRINCALS: Generate one fixed set of scores for K (< fhat, R^N --> R^N is linear, and hence a hat matrix exists. If the hat matrix is a projection, we denote it by H: fhat <- H %*% y If the hat matrix is not a projection, we denote it by S for 'smoother': fhat <- S %*% y + In real applications the hat matrix is not needed, but for understanding a smoothing method, it is useful. Q: How can we obtain a hat matrix if the software only provides fits? 
A: Apply the smoothing algorithm to all canonical basis vectors as y! S[,j] <- smoothing.algorithm(x, y=j_th_basis_vector) # Pseudo-R-code + All smoothers have a complexity or bandwidth parameter. They are linear smoothers only for a fixed choice of this parameter. Data-driven choice such as with cross-validation makes them non-linear. + Some R infrastructure: # Toy Data: The X and Y variables: x1 <- scale(mktg[,'in.']); hist(x, col="gray") y1 <- scale(mktg[,'rev']); hist(x, col="gray") plot(x1, y1, pch=16) # ==> Ties in x! It is important for smoothers to be able to handle ties. # Parallel sort according to x: ord <- order(x1) x <- x1[ord] y <- y1[ord] N <- length(x) rm(x1,y1) # Generate the j'th canonical basis vector: canon.basis.vec <- function(j,N) as.numeric(seq(N)==j) # Generate smoother matrix: make.S <- function(x, smoother, ...) { N <- length(x) S <- matrix(NA, nrow=N, ncol=N) for(j in seq(N)) S[,j] <- smoother(x, y=canon.basis.vec(j,N), ...) S } # Plot a basis discretized to the observed x values:: plot.cols <- function(x, mat, type="l", col="blue", lwd=2, pch=16, cex=.5) { ord <- order(x) par(mfcol=c(ceiling(ncol(mat)/2),2), mar=c(2,2,1,1), mgp=c(1.5,.5,0)) for(j in seq(ncol(mat))) { plot(x, mat[,j], type="n", ylab="", xlab="", ylim=range(c(0,mat[,j]))) abline(h=0, col="gray") if(grepl("[bl]",type)) lines(x[ord], mat[ord,j], col=col, lwd=lwd) if(grepl("[bp]",type)) points(x, mat[,j], col=col, pch=pch, cex=cex) } } # Plot selected rows of a hat/smoother matrix plot.rows <- function(sel, x, mat, type="l", col="red", lwd=2, pch=16, cex=.5) { ord <- order(x) x.rows <- mat[sel,]; x.loc <- x[sel] par(mfcol=c(ceiling(length(sel)/2),2), mar=c(3,3,1,1), mgp=c(1.5,.5,0)) for(j in seq(along=sel)) { plot(x, x.rows[j,], type="n", ylab="", xlab="", ylim=range(c(0,x.rows[j,]))) abline(h=0, col="gray"); abline(v=x.loc[j], col="gray") if(grepl("[bl]",type)) lines(x[ord], x.rows[j,ord], col=col, lwd=lwd) if(grepl("[bp]",type)) points(x, x.rows[j,], col=col, pch=pch, cex=cex) } } . PROJECTION SMOOTHERS: Expand X in a basis to form multiple predictors, then use multiple regression. Examples: + POLYNOMIALS: o Nice because they provide a hierarchy of complexity (the degree) but they have unfortunate nonlocal behaviors. o If you ever use polynomials, do not use monomial bases. They may be numerically unstable. Instead use orthogonal polynomial bases such as Hermite polynomials. They have easy recurrence relationships. You can later orthogonalize them wrt to the discretization of the observed x values. o R - Monomials, Hermite polynomials, and empirically orthogonalized polynomials: # Monomial basis, just for curiosity: Not recommended but works for low orders x.mon <- sapply(0:5, function(j) x^j) plot.cols(x, x.mon); rug(jitter(x,5)) # Orthogonalize empirically with a Q-R decomposition: x.mon.orth <- qr.Q(qr(x.mon)) round(crossprod(x.mon.orth), 14) # Check orthogonality, same as t(...)%*%(...) plot.cols(x, x.mon.orth) # Recursion for Hermite polynomials: x.herm <- matrix(NA, ncol=6, nrow=length(x)) x.herm[,1] <- 1 x.herm[,2] <- x for(j in 3:ncol(x.herm)) x.herm[,j] <- x*x.herm[,j-1] - (j-1)*x.herm[,j-2] plot.cols(x, x.herm) # Orthogonolization wrt to sampled points: use a Q-R decomposition x.herm.orth <- qr.Q(qr(x.herm)) max(abs(x.herm.orth - x.mon.orth)) # The same; why? 
plot.cols(x, x.herm.orth) # Hat matrix H.poly <- tcrossprod(x.herm.orth) # same as: x.herm.orth %*% t(x.herm.orth) sel <- seq(1,1926,by=275) # Select columns in steps of 275 plot.rows(sel, x, H.poly) # Plot weights at 8 locations (8 rows) rug(x) # Add a 'rug' to show where the x values fall. # Note the vertical lines -- they indicate the location for which this row provides fhat(x). # Discuss these traces! # Would it be justified to call polynomial regression a 'local smoother'? # ... fhat.poly <- H.poly %*% y windows() plot(x, y, pch=16, cex=.5); lines(x, fhat.poly, lwd=2, col="green") # Looks nice in spite of the unreasonable looking row weights. + B-SPLINES REGRESSION ('B'=basis): o Preferred over polynomials because they are more local. o Idea: Place K (<= knot.j) 1st degree B-splines: locally linear, continuous basis functions: 1, x, (x-knot.k)*(x >= knot.j) 2nd degree B-splines: quadratic linear, 1x differentiable basis functions: 1, x, x^2, (x-knot.k)^2*(x >= knot.j) 3rd degree B-splines: 'cubic splines', most popular, 2x differentiable basis functions: 1, x, x^2, x^3, (x-knot.k)^3*(x >= knot.j) o R - Generating discretized spline bases of any order and any set of knots: library(splines) nknots <- 9 knots <- quantile(x, prob=(1:nknots)/(nknots+1)); deg <- 3 # Ex. 2 x.bsp <- bs(x, knots=knots, Boundary.knots=range(x), degree=deg, intercept=T) plot.cols(x, x.bsp) # Orthogonalize: look like wavelets x.bsp.orth <- qr.Q(qr(x.bsp)); round(crossprod(x.bsp.orth), 14) plot.cols(x, x.bsp.orth); rug(x) # Hat matrix: H.bsp <- tcrossprod(x.bsp.orth) sel <- seq(1,1926,by=275) # Plot 8 rows = weights at 8 locations plot.rows(sel, x, H.bsp) rug(jitter(x,5)) # Would you call B-spline smoothers more or less local than polynomial regression? fhat.bsp <- H.bsp %*% y windows() plot(x, y, pch=16, cex=.5); lines(x, fhat.bsp, lwd=2, col="blue") rug(jitter(x,5)) + Fourier/trig bases: for stationary data or data with a major period ('wrap-around') + Wavelet bases: see Tony Kai for details . SMOOTHING SPLINES: NOT the same as B-splines! + Idea: Place knots at all observed values of X (assuming no ties). ==> Saturated basis. When fitting to a response Y, penalize with some generalized Ridge penalty. Actually, the following criterion defines cubic smoothing splines: sum_i (yi - f(xi))^2 + lambda * Integral f''(x)^2 dx (over a range) While the RSS only speaks to the values of f() at observed xi, the penalty integral speaks to the values of f() at all values x in the range. ==> There exists one function that minimizes the criterion, and it happens to be a cubic spline with knots at all observed xi. ==> Study Grace Wahba''s papers on "Reproducing Kernel Hilbert Space Theory" (RKHS theory). Interesting functional analysis (a field of math, inf-dim lin alg w topology) + Linear algebra version of smoothing splines: at observed xi only y = (y1,...,yN)^T, f = (f1,...,fN)^T |y - f|^2 + lambda * f^T Omega f = min_f where Omega is a non-negative definite generalized Ridge penalty. Omega is a function of x=(x1,...,xN)^T. Look up how to construct it from x. It looks like a weighted second difference penalty but sphered! + Exercise: For intuition, write down an Omega such that f^T Omega f = | D2 f |^2 where D2 is the 2nd difference operator: D2 f = (f1+f3-2*f2, f2+f4-2*f3, ...) in R^{N-2} assuming that the xi are sorted: x1 < x2 < ... < xN REMINDER: This is NOT a smoothing spline! It does NOT originate from the integral penalty! For one thing, it does not account for unequispaced x! 
In R: The following is a straight illustration of linear algebra. This is not how you implement production code. o Assume x is sorted, or better check. (We ignore that there are ties...) N <- length(x); all(x[-N] <= x[-1]) # Check that x is sorted. o Form a 2nd difference matrix function diff.1st.mat <- function(N) { diag(N)[-N,] - diag(N)[-1,] } diff.1st.mat(6) # Toy example diff.2nd.mat <- function(N) { diff.1st.mat(N)[-(N-1),] - diff.1st.mat(N)[-1,] } diff.2nd.mat(6) # Toy example # Alternative: diff.2nd.mat <- function(N) { D2 <- matrix(0, nrow=N-2, ncol=N) D2[col(D2)==row(D2)] <- 1 D2[col(D2)==row(D2)+2] <- 1 D2[col(D2)==row(D2)+1] <- -2 D2 } o Note: For non-equispaced x one should really use differences (y{i} - y{i-1})/(x{i} - x{i-1}) (i=2,...,N), hence row entries should be not +-1 but +-1/(x{i}-x{i-1}) [Now you should realize the difficulty of programming splines: In the presence of ties or near-ties, the x values need to be bundled first before stable divisions by x-differences can be made.] o Form the associated penalty matrix Omega, representing f^T Omega f = |D2 f|^2 = f^T D2^T D2 f N <- length(x) Omega <- crossprod(diff.2nd.mat(N)) # (takes a while) dim(Omega) # quite big, of course Omega.eig <- eigen(Omega) # (takes a while) plot(Omega.eig$val, type="l") # Large values means function strongly penalized range(Omega.eig$val) plot.cols(x, Omega.eig$vec[,1:10]) # Strongly penalized plot.cols(x, Omega.eig$vec[,N-(0:9)], type="p") # Weakly penalized # (Steps are artifacts of improper treatment of ties: they should be bundled.) + 'Hat' matrix for smoothing splines: solve the generalized Ridge problem Stationarity wrt f: -2*y + 2*f + 2*Omega f = 0 ==> f = (I+Omega)^{-1} y ==> S = (I+Omega)^{-1} is the smoother or hat matrix. R: S.D2 <- solve(diag(N) + Omega) # Takes a minute! fhat.D2 <- S.D2 %*% y # This is quick fhat.D2 <- solve(diag(N) + Omega, y) # Obviates the need to form S.D2. plot(x, y, pch=16); lines(x, fhat.D2, lwd=2, col="red") # Ooops, we undersmoothed! How do we control the smoothness? # We seem to need a lot more of the penalty. + Analysis of S: o S is symmetric like Omega. o Omega and S have the same eigenvectors. o evals(S) = 1/(1+evals(Omega)) o evals(Omega) >= 0 ==> evals(S) in [0,1] o As Omega penalizes more, S shrinks more. o Zero penalty in direction u ==> Zero shrinkage in direction u that is, pure projection onto u. o Write the eigendecomp of S: S = U Lambda U^T = sum_k lambda.k u.k u.k^T o For cubic splines: lambda.1 = lambda.2 = 1 corresponding to u.1=1 and u.2=x. Why? Because both f(x)=1 and f(x)=x have vanishing properly x-weighted second differences. For unweighted 2nd differences the effective x is the order/rank of x. + For projections we have a notion of degrees of freedom: rank(H) Q: What would be a good notion of dfs for a smoothing spline smoother S? ... A: The trace of S. (There are alternative definitions, though.) ==> S = (I + lambda*Omega)^{-1} tr(S) = sum_i eval.i(S) = sum_i 1/(1+lambda*eval.i(Omega)) R: # Play with lambda till the trace of S is near the desired df=6. lambda <- 1; sum(1/(1+lambda*Omega.eig$val)) # Ooops, we used almost 750 dfs! # Search (preferably with bisection) till you, say, dfs~6: lambda <- 344020000; sum(1/(1+lambda*Omega.eig$val)) S.D2.df6 <- solve(diag(N) + lambda*Omega) # (takes a while) tr(S.D2.df6) # Ok, approximately 6, as targeted. fhat.D2.df6 <- S.D2.df6 %*% y fhat.D2.df6 <- solve(diag(N)+lambda*Omega, y) # Alternative w/o S windows() plot(x, y, pch=16); lines(x, fhat.D2.df6, lwd=4, col="red") # Not bad!!! 
(Except for improper handling of ties.) + R: # Don't calculate smoother matrices for smoothing splines! # Use functions that compute fits directly. apropos('spline') help('smooth.spline') par(mfcol=c(1,1)); plot(mktg[,c('in.','rev')], pch=16) lines(smooth.spline(x=mktg[,'in.'], y=mktg[,'rev'], df= 5), col="green", lwd=3) lines(smooth.spline(x=mktg[,'in.'], y=mktg[,'rev'], df= 10), col="blue", lwd=3) lines(smooth.spline(x=mktg[,'in.'], y=mktg[,'rev'], df= 20), col="red", lwd=3) + Hat/smoother matrix: S.smsp <- make.S(x, function(x,y) fitted(smooth.spline(x=x, y=y, df=10))) # (takes a while) max(abs(S.smsp - t(S.smsp))) # Symmetry check sel <- seq(1,1926,by=275) # Select rows in steps of 275 plot.rows(sel, x, S.smsp) # Plot row weights + Eigen analysis of the smoother: S.smsp.eig <- eigen(S.smsp) # (takes a while) # Look at 30 largest e'vals: cbind(S.smsp.eig$val[1:30]) # Eigen values: surprises? # Plot eigenvalue profile up to order 30: par(mfcol=c(1,1)) plot(S.smsp.eig$val[1:30], type="b", pch=16, ylab="E'vals"); abline(h=0) # Plot first 10 eigen vectors: interpret them as 'eigen functions' of the smoother plot.cols(x, S.smsp.eig$vec[,1:10]) # Looks like a system of orthogonal polynomials rug(jitter(x,5)) . LOCALLY KERNEL WEIGHTED POLYNOMIAL SMOOTHING: most often 'polynomial = line' + Idea: To obtain fhat(x0), fit a degree-p polynomial with kernel-weighted LS, where the a kernel is centered at x0 of interest. + For fixed degree p and fixed kernel width, the smoother is linear. However, the usual implementations have robustness or cross-validation features that make them nonlinear in y. + R function: loess(formula, data=...) [newer] or lowess(x, y) [older] A local polynomial smoother with robustness features (Bill Cleveland) help(lowess) # Plot data: par(mfcol=c(1,1)); plot(mktg[,c('in.','rev')], pch=16) lines(lowess(x=mktg[,'in.'], y=mktg[,'rev'], f=.5), col="green", lwd=3) lines(lowess(x=mktg[,'in.'], y=mktg[,'rev'], f=.2), col="blue", lwd=3) lines(lowess(x=mktg[,'in.'], y=mktg[,'rev'], f=.1), col="red", lwd=3) + Hat/smoother matrix: S.lowess <- make.S(x, function(x,y) lowess(x, y, f=.5)$y ) max(abs(S.lowess - t(S.lowess))) # Nearly symmetric but not quite plot.rows(sel, x, S.lowess) # Plot weights at a few locations + Eigen analysis of the smoother: S.lowess.eig <- eigen(S.lowess) # (Takes very long; works even though S is not symmetric.) cbind(round(S.lowess.eig$val[1:30],4)) # Eigen values: surprises? # Plot eigenvalue profile up to order 30: par(mfcol=c(2,1)) plot(Re(S.lowess.eig$val[1:30]), type="b", pch=16, ylab="E'vals"); abline(h=0) plot(Im(S.lowess.eig$val[1:30]), type="b", pch=16, ylab="E'vals"); abline(h=0) # Plot first 10 eigen vectors: interpret them as 'eigen functions' of the smoother plot.cols(x, Re(S.lowess.eig$vec[,1:10])) # Looks quite strange plot.cols(x, Im(S.lowess.eig$vec[,1:10])) # Looks quite strange rug(jitter(x,5)) + SVD analysis of the smoother: more appropriate S.lowess.svd <- svd(S.lowess) # Plot singular value profile up to order 30: plot(Re(S.lowess.svd$d[1:30]), type="b", pch=16, ylab="Singular valuess"); abline(h=0) # Right singular vectors: windows(); plot.cols(x, S.lowess.svd$u[,1:10]) # Left singular vectors: windows(); plot.cols(x, S.lowess.svd$v[,1:10]) # Plot left and right singular vectors against each other: windows(h=10, w=4); par(mfcol=c(5,2), mgp=c(1.5,.5,0), mar=c(1,1,1,1)); for(j in 1:10) plot(S.lowess.svd$u[,j], S.lowess.svd$v[,j], pch=16, cex=1, xlab="", ylab="") # Didn't know what to expect... + R: function supsmu(x, y) # J.H. 
Friedman & W. Stuetzle A local linear smoother with local cross-validation (J.H.Friedman and W.Stuetzle) ==> Fast and flexible, but may look raggedy. help(supsmu) par(mfcol=c(1,1)); plot(mktg[,c('in.','rev')], pch=16) lines(supsmu(x=mktg[,'in.'], y=mktg[,'rev'], span=.5), col="green", lwd=3) lines(supsmu(x=mktg[,'in.'], y=mktg[,'rev'], span=.2), col="blue", lwd=3) lines(supsmu(x=mktg[,'in.'], y=mktg[,'rev'], span=.1), col="red", lwd=3) Cumbersome to compute a smoother matrix because supsum() returns fewer values when there are ties. ---------------- * TRANSFORMATIONAL MA 0: NONLINEAR OPTIMAL CORRELATION AS A CCA PROBLEM - This is the analog of Fisher''s optimal scoring for two quantitative variables: max_{f,g} cor(f(X),g(Y)) We know this is really a CCA problem. How does this value compare with cor(X,Y)? See [Buja, AOS 18(3), 1990, pp. 1032-1069] for some population calculations. - Estimation of optimal correlation: . Data: Boston Housing site <- "http://www-stat.wharton.upenn.edu/~buja/STAT-541/" site <- "../A-STAT-541/Data/" boston <- read.table(paste(site,"boston.dat",sep=""), header=T) rownames(boston) <- paste(1:nrow(boston), scan(paste(site,"boston.geography",sep=""), w="") ) boston[1:5,] x.LSTAT <- boston[,"LSTAT"] y.MEDV <- boston[,"MEDV"] # Association: plot(x.LSTAT, y.MEDV, pch=16) cor(x.LSTAT, y.MEDV) . Polynomial estimation: # Monomial basis (with apologies): deg <- 5 x.LSTAT.mon <- scale(qr.Q(qr(sapply(1:deg, function(j) scale(x.LSTAT)^j)))) y.MEDV.mon <- scale(qr.Q(qr(sapply(1:deg, function(j) scale(y.MEDV)^j)))) dim(x.LSTAT.mon); dim(y.MEDV.mon) cca.MEDV.LSTAT.mon <- cca(x.LSTAT.mon, y.MEDV.mon) # Stationary correlations: round(cca.MEDV.LSTAT.mon$cor, 3) # Transformations = linear combinations: again, only the first is meaninful plot.cols(y.MEDV, y.MEDV.mon %*% cca.MEDV.LSTAT.mon$ycoef) plot.cols(x.LSTAT, x.LSTAT.mon %*% cca.MEDV.LSTAT.mon$xcoef) # Plot transformed against each other: windows() j <- 1 xx <- x.LSTAT.mon %*% cca.MEDV.LSTAT.mon$xcoef[,j] yy <- y.MEDV.mon %*% cca.MEDV.LSTAT.mon$ycoef[,j] plot(xx, yy, pch=16, cex=1) cor(xx, yy) cor(x.LSTAT, y.MEDV) . B-spline estimation: # B-spline bases: nknots <- 3 library(splines) # for bs() x.LSTAT.bsp <- scale(bs(x.LSTAT, knots=quantile(x.LSTAT, prob=(1:nknots)/((nknots+1))), Boundary.knots=range(x.LSTAT), degree=3, intercept=F ) ) # No intercept due to scaling y.MEDV.bsp <- scale(bs(y.MEDV, knots=quantile(y.MEDV, prob=(1:nknots)/((nknots+1))), Boundary.knots=range(y.MEDV), degree=3, intercept=F ) ) # No intercept due to scaling dim(x.LSTAT.bsp); dim(y.MEDV.bsp) cca.MEDV.LSTAT.bsp <- cca(x.LSTAT.bsp, y.MEDV.bsp) # Stationary correlations: round(cca.MEDV.LSTAT.bsp$cor, 3) # Transformations = linear combinations: only the first is meaningful plot.cols(y.MEDV, y.MEDV.bsp %*% cca.MEDV.LSTAT.bsp$ycoef) plot.cols(x.LSTAT, x.LSTAT.bsp %*% cca.MEDV.LSTAT.bsp$xcoef) # Transforms for orders 2 and higher complete an orthogonal system. . Compare with Breiman-Friedman''s ACE algorithm: install.packages("acepack", repos=repos) library(acepack) # Install first if you don't have it. 
# ACE uses a version of the nonlinear 'subsmu()' smoother (does local CV): ace.MEDV.LSTAT <- ace(x=x.LSTAT, y=y.MEDV) # Plot and compare the transformations: par(mfcol=c(2,3), mar=c(3,3,1,1), mgp=c(1.5,.5,0)) # ACE transforms: plot(ace.MEDV.LSTAT$x, ace.MEDV.LSTAT$tx, pch=16, cex=.5) plot(ace.MEDV.LSTAT$y, ace.MEDV.LSTAT$ty, pch=16, cex=.5) # Our polynomial transforms: plot(x.LSTAT, x.LSTAT.mon %*% cca.MEDV.LSTAT.mon$xcoef[,1], pch=16, cex=.5) plot(y.MEDV, y.MEDV.mon %*% cca.MEDV.LSTAT.mon$ycoef[,1], pch=16, cex=.5) # Our B-spline transforms: plot(x.LSTAT, - x.LSTAT.bsp %*% cca.MEDV.LSTAT.bsp$xcoef[,1], pch=16, cex=.5) plot(y.MEDV, - y.MEDV.bsp %*% cca.MEDV.LSTAT.bsp$ycoef[,1], pch=16, cex=.5) Questions: + Are these transformations trying tell us something? + Which of the three approaches do we trust most? ---------------- * TRANSFORMATIONAL MA 1: ACE AS A CCA PROBLEM - The ACE problem is also a maximal correlation problem, but the x-block will be permitted to consist of bases of several predictor variables. - R: Code for bases-based smoothing inside ACE; then run CCA We need to write general code that implements the multi-bases x-block. ace.basis <- function(x, y, basis.generator, o=1) { # o = desired orders of eigen transforms y.basis <- basis.generator(y) d <- ncol(y.basis) colnames(y.basis) <- paste("y.",seq(ncol(y.basis)),sep="") x.bases <- lapply(colnames(x), function(j) basis.generator(x[,j])) for(j in seq(x.bases)) { colnames(x.bases[[j]]) <- paste(colnames(x)[j],seq(ncol(x.bases[[j]])),sep=".") } x.bases <- do.call(cbind, x.bases) cc <- cca(x.bases, y.basis) # This is the actual computation; all else is (un)packaging. ty <- y.basis %*% cc$ycoef[,o] tx <- sapply(unique(sub("[.].+","",colnames(x.bases))), function(nam) { nams <- grep(nam,colnames(x.bases)); print(nams); print(nam) x.bases[,nams,drop=F] %*% cc$xcoef[nams,o,drop=F] } ) if(cor(y, ty) < 0) { ty <- -ty; tx <- -tx } tmp <- list(x=x, y=y, tx=tx, ty=ty, rsq=cc$cor[o]^2) class(tmp) <- "ace" tmp } # Polynomial basis: basis.generator.poly <- function(x, nn=3) { x.qr <- qr(sapply(1:nn, function(j) x^j)) scale(qr.Q(x.qr)[,seq(x.qr$rank),drop=F]) } # B-spline basis: basis.generator.bsp <- function(x, nn=2) { library(splines) x.qr <- qr(bs(x, knots=setdiff(unique(quantile(x, prob=(1:nn)/((nn+1)))), range(x)), Boundary.knots=range(x), degree=3, intercept=F)) scale(qr.Q(x.qr)[,seq(x.qr$rank),drop=F]) } # Original Breiman-Friedman ACE algorithm packaged: install.packages("acepack", repos=repos) # Install first if you don't have it. ace.orig <- function(...) { library(acepack) tmp <- ace(...) 
tmp$x <- t(tmp$x) # screwy detail in ace() class(tmp) <- "ace" tmp } # Method for generic function 'plot' to dispatch on class 'ace': plot.ace <- function(ace.str, mfrow=c(3,5)) { par(mfrow=mfrow, mgp=c(1.5,.5,0), mar=c(3,2,.5,.5)) rg <- range(c(ace.str$tx, ace.str$ty)) for(j in 1:ncol(ace.str$x)) { plot(ace.str$x[,j], ace.str$tx[,j], xlab=colnames(ace.str$x)[j], ylab="", pch=16, ylim=rg) text(x=par("usr")[1], y=par("usr")[4], adj=c(-.1,1), lab=paste("sd =",round(sd(ace.str$tx[,j]),3)) ) } plot(ace.str$y, ace.str$ty, xlab="y", ylab="", pch=16, ylim=rg) text(x=par("usr")[1], y=par("usr")[4], adj=c(-.1,1), lab=paste("sd =",round(sd(ace.str$ty),3)) ) plot(apply(ace.str$tx,1,sum), ace.str$ty, xlab="tx", ylab="ty", pch=16, xlim=rg, ylim=rg) text(x=par("usr")[1], y=par("usr")[4], adj=c(-.1,1), lab=paste("rsq =",round(ace.str$rsq,3)) ) } # Original and our version of ACE: boston.ace.orig <- ace.orig( boston[,-14], boston[,14]) # original ACE boston.ace.poly <- ace.basis(boston[,-14], boston[,14], basis.generator.poly) # polynomial boston.ace.bsp <- ace.basis(boston[,-14], boston[,14], basis.generator.bsp) # B-spline c(R2=boston.ace.orig$rsq, R=sqrt(boston.ace.orig$rsq)) c(R2=boston.ace.poly$rsq, R=sqrt(boston.ace.poly$rsq)) c(R2=boston.ace.bsp$rsq, R=sqrt(boston.ace.bsp$rsq)) windows(h=9,w=15) plot(boston.ace.orig) plot(boston.ace.poly) plot(boston.ace.bsp) - Null behaviors: boston.null <- boston boston.null[,"MEDV"] <- sample(boston.null[,"MEDV"]) # Scramble the response boston.ace.null.orig <- ace.orig( boston.null[,-14], boston.null[,14]) # original ACE boston.ace.null.poly <- ace.basis(boston.null[,-14], boston.null[,14], basis.generator.poly) # polynomial boston.ace.null.bsp <- ace.basis(boston.null[,-14], boston.null[,14], basis.generator.bsp) # B-spline boston.ace.null.orig$rsq; sqrt(boston.ace.null.orig$rsq) boston.ace.null.poly$rsq; sqrt(boston.ace.null.poly$rsq) boston.ace.null.bsp$rsq; sqrt(boston.ace.null.bsp$rsq) windows(); plot(boston.ace.null.orig) windows(); plot(boston.ace.null.poly) windows(); plot(boston.ace.null.bsp) Comparison: . g(Y): + Original ACE manages to create a roughly linear g(Y), + polynomial and B-Splinen ACE don''t. . Optimal correlations: + ACE overfits the least: max corr ~ .24) + Poly and B-Splines overfit: max corr ~ 0.38 and 0.42, resp. . 
Reasons for overfitting: Too many dfs in the additive model + dfs(poly) = 37 (= 13*3 - 2) + dfs(B-splines) = 59 (= 13*5 - 2 - 4) - A universal alternating algorithm for ACE: ace.alt <- function(x, y, smoother, niter=30) { x <- scale(x); y <- scale(y) # Initialize with linear model through origin: xy.lm <- lm(y ~ .-1, data=as.data.frame(x)) tx <- t(t(x)*coef(xy.lm)) # s(xj) = bj * xj ty <- y # initialize with identity transform res <- resid(xy.lm) # initialize ty-residuals for(iter in seq(niter)) { # tx updates (in random order, instructor's arbitrary decision): for(j in sample(seq(ncol(x)))) { res <- res + tx[,j] tx[,j] <- smoother(x=x[,j], y=res) res <- res - tx[,j] } # ty updates: res <- ty - res ty <- scale(smoother(x=y, y=res)) res <- ty - res } tmp <- list(x=x, y=y, tx=tx, ty=ty, rsq=cor(apply(tx, 1, sum), y)^2 ) class(tmp) <- "ace" tmp } # Wrapper for polynomial regression: smoother.poly <- function(x, y) { fitted(lm(y ~ x + I(x^2) + I(x^3))) } # Wrapper for smoothing spline: smoother.smspl <- function(x, y) { xx <- round(x,5) # bundle values if(length(unique(xx)) <=10) { ty <- fitted(lm(y ~ factor(xx))) } else { sm <- smooth.spline(x=xx, y=y, df= 5) ty <- approx(sm, xout=xx)$y } ty } # Wrapper for lowess(): it orders according to x smoother.low <- function(x, y) { xx <- x+runif(length(x),-sd(x)/100,+sd(x)/100) ord <- order(xx) ty <- lowess(xx[ord], y[ord])$y ty[ord] <- ty # Undo the ordering according to x ty } # Wrapper for supsmu: uniques and sorts the x values ==> jitter them randomly and unsort smoother.sup <- function(x, y) { xx <- x+runif(length(x),-sd(x)/100,+sd(x)/100) ord <- order(xx) ty <- supsmu(xx[ord], y[ord])$y ty[ord] <- ty # Undo the ordering according to x ty } # Run the next two lines with any of the above three smoothers: boston.ace.poly <- ace.alt(boston[,-14], boston[,14], smoother.poly) boston.ace.smspl <- ace.alt(boston[,-14], boston[,14], smoother.smspl) boston.ace.low <- ace.alt(boston[,-14], boston[,14], smoother.low) boston.ace.sup <- ace.alt(boston[,-14], boston[,14], smoother.sup) boston.ace.poly$rsq boston.ace.smspl$rsq boston.ace.low$rsq boston.ace.sup$rsq plot(boston.ace.poly) # RM and end dips not believable plot(boston.ace.smspl) # Too much believable plot(boston.ace.low) # Nice! plot(boston.ace.sup) # NOX not quite believable # There may be slight differences between runs due to random order of updates # and random jitter for supsmu(). - Question: Under what conditions does the above alternating algorithm converge? . This question will have a satisfying answer only for the linear versions of some of the smoother classes. Nonlinearity due to cross-validated bandwidth choice makes the convergence question most likely impenetrable. . BASIS EXPANSION METHODS: easy... The regularization is just subspace restriction. + Let g, f1, ..., fp be N vectors of transforms g(Y), g1(X1), ..., fp(Xp) evaluated at the observed values of these variables. + These vectors will be members of subspaces V.y, V.x1, ..., V.xp (subsets of R^N). + Because the sum f1(X1)+...+fp(Xp) represents an additive model, the subspace restriction for f1+...+fp is V.x = V.x1+...+V.xp (= span of V.x1, ..., V.xp). + Therefore, the alternating algorithm is just LS projection back and forth between V.y and V.x. + Let H.x and H.y be the orthogonal projectors onto V.y and V.x, resp. + Consider convergence of g: o Initialize: g.0 o Iterate: g.n = H.y H.x g.{n-1} and normalize g.n = g.n/|g.n| + Does this algorithm converge? 
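      Before the symmetry argument that follows, here is a quick numerical sanity check
      (a sketch with made-up random subspaces, not part of the original development):
      iterate g <- H.y H.x g and compare the limit with the top eigenvector of
      H.y H.x restricted to V.y.
        set.seed(1)
        N <- 200
        U.y <- qr.Q(qr(matrix(rnorm(N*3), ncol=3)))     # o.n. basis of a random V.y (dim 3)
        U.x <- qr.Q(qr(matrix(rnorm(N*5), ncol=5)))     # o.n. basis of a random V.x (dim 5)
        H.y <- tcrossprod(U.y)                          # projection onto V.y
        H.x <- tcrossprod(U.x)                          # projection onto V.x
        g <- H.y %*% rnorm(N);  g <- g/sqrt(sum(g^2))   # random start in V.y
        for(iter in 1:200) { g <- H.y %*% (H.x %*% g);  g <- g/sqrt(sum(g^2)) }
        M <- crossprod(U.y, H.x %*% U.y)                # H.y H.x on V.y, in the coordinates of U.y
        g.top <- U.y %*% eigen(M)$vectors[,1]           # its top eigenvector, mapped back to R^N
        abs(sum(g * g.top))                             # ~ 1 (generically): the iteration found it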
Wishful thinking: If only H.y H.x were symmetric and non-negative definite ...
    + Actually, it is, as a map V.y --> V.y, but NOT as a map R^N --> R^N !!
      Proof: Let g1 and g2 be elements in V.y.
        <H.y H.x g1, g2> = <H.x g1, H.y g2>     because H.y is symmetric on R^N
                         = <H.x g1, g2>         because g2 is in V.y
                         = <g1, H.x g2>         because H.x is symmetric on R^N
                         = <H.y g1, H.x g2>     because g1 is in V.y
                         = <g1, H.y H.x g2>     because H.y is symmetric on R^N
      QED: H.y H.x is symmetric on the subspace V.y.
      Let g be an element of V.y:
        <H.y H.x g, g> = <H.x g, H.y g> = <H.x g, g> = |H.x g|^2  >=  0
      QED: H.y H.x is non-negative definite.
    + Thus the alternating projections amount to a power algorithm,
      which we know to converge to the largest eigenvector.
      (Remind yourself of the proof!)
  . QUADRATIC PENALTY METHODS: needs a little more machinery, but works, too...
    + First: The same argument as for projections does not work
      if S.x and S.y are of the smoothing spline/generalized Ridge type.
    + Preliminaries for additive models with quadratic penalties:
        PLS = |y - (f1+...+fp)|^2 + f1^T Omega1 f1 + ... + fp^T Omegap fp
      Rewrite this with
      o a 'design matrix'  X = (I_N,...,I_N)  of size  N x (Np)
      o additive summands stacked:  f = (f1^T,...,fp^T)^T  of size  (Np) x 1
      o so that  f1+...+fp = X f  of size  N x 1
      o and a super penalty matrix  Omega = diag(Omega1,...,Omegap)  of size  (Np) x (Np)
      o so that  f1^T Omega1 f1 + ... + fp^T Omegap fp = f^T Omega f
      o Finally:  PLS = |y - X f|^2 + f^T Omega f
      o Normal equations:  - X^T y + X^T X f + Omega f = 0
      o Hat/additive fitting matrix:  f = (X^T X + Omega)^{-1} X^T y = H.x y
        [Informal HW: Write out what X^T X + Omega really is ...]
        [This is not how you would actually compute an additive fit with smoothing splines,
         but it gives us a way to think about it.  In particular we see that
         y |-> f = c(f1,...,fp) is just another linear operation, assuming fixed penalties.]
    + Back to the alternating algorithm:
      o We can consider a general setup where both blocks are additive models:
          Criterion.obs = |Y g - X f|^2 + g^T Omega.y g + f^T Omega.x f
      o What is the natural constraint on (f,g)?
        Expand the quadratic Criterion and set the cross-term between f and g to zero:
          Criterion.null = |Y g|^2 + |X f|^2 + g^T Omega.y g + f^T Omega.x f
        Crossterm: -2 g^T Y^T X f;  the penalties contribute nothing
      o Rayleigh quotient:
               |Y g - X f|^2 + g^T Omega.y g + f^T Omega.x f
        (*)    ------------------------------------------------- = min/max/stat
               |Y g|^2 + |X f|^2 + g^T Omega.y g + f^T Omega.x f
      o Stationary (eigen) equations:
          (- X^T Y g + X^T X f + Omega.x f)*... - ...*(X^T X f + Omega.x f) = 0
          ==> f ~ (X^T X + Omega.x)^{-1} X^T Y g = S.x Y g
          ==> g ~ (Y^T Y + Omega.y)^{-1} Y^T X f = S.y X f   (for symmetry)
        This has indeed the form of the alternating algorithm.
        Because the algorithm is the power algorithm of an eigen problem, it converges.
    + Equivalent criteria:
      o Asymmetric constraint for CCA to masquerade as regression:
               |Y g - X f|^2 + g^T Omega.y g + f^T Omega.x f
        (**)   --------------------------------------------- = min/stat
               |Y g|^2 + g^T Omega.y g
      o Penalized correlation criterion:
                                 (g^T Y^T X f)^2
        (***)  ----------------------------------------------------- = max/stat
               (|Y g|^2 + g^T Omega.y g) * (|X f|^2 + f^T Omega.x f)
      o The stationary solutions (f,g) of criteria (*), (**) and (***) are the same
        up to positive scalars.
    + Alternating algorithm:
      o Power algorithm applied to S.x S.y and S.y S.x:
          f --> S.x S.y f
          g --> S.y S.x g
      o Is S.x S.y (say) symmetric ('self-adjoint') in some sense?
        ==> We need an inner product!
        Q: What is the natural inner product on (f,g)-space?
        A: |||(f,g)|||^2 := Rayleigh denominator
                          = |Y g|^2 + |X f|^2 + g^T Omega.y g + f^T Omega.x f
      o Associated with a pos.-def. quadratic form is an inner product:
          <<(f1,g1), (f2,g2)>> = g1^T Y^T Y g2 + f1^T X^T X f2 + g1^T Omega.y g2 + f1^T Omega.x f2
      o Restricted to f-space this is:
          <<f1, f2>> = f1^T X^T X f2 + f1^T Omega.x f2
      o Is the mapping f --> S.x S.y f symmetric in this inner product?
        ...
  . LOCALLY KERNEL WEIGHTED POLYNOMIAL SMOOTHING: nothing is known here because this type of
    smoothing does not solve an optimization problem.
    The non-symmetric nature of the smoothing matrix suggests that occasional slight
    non-convergence due to 'Jordan cycling' might occur.
    This, however, has never been a practical problem.
    Even the use of locally cross-validating supsmu seems to result in convergence in practice.
    Asymptotically for N-->Inf, there is of course convergence because all properly bandwidthed
    smoothers converge to conditional expectation operators, which are orthogonal projections
    in Hilbert space, hence the arguments from 'BASIS EXPANSION METHODS' apply, modulo
    infinite-dimensional regularity conditions (E[] is best assumed to be a compact operator).
----------------------------------------------------------------
* TRANSFORMATIONAL MA 2: ADDITIVE IMPLICIT EQUATIONS FROM SMALLEST ADDITIVE PRINCIPAL COMPONENTS (APC)
- The problem:
  . Look for additive implicit equations of the form
      f1(X1) + ... + fp(Xp) ~ 0
  . Can be used as a 'concurvity' diagnostic for additive regression:
      Y ~ h1(X1) + ... + hp(Xp)
      0 ~ f1(X1) + ... + fp(Xp)   creates identifiability problems for the regression.
  . Has independent value as a symmetric analysis of variables.
- Criterion:
    V[ f1(X1) + ... + fp(Xp) ] = min_{f1,...,fp}   subject to ... ?
  Recall the "null principle" for deriving MA constraints:
    --------------------------------------------
    |   Null assumption of uncorrelated blocks  |
    --------------------------------------------
  This results in the following constraint:
    V[f1(X1)] + ... + V[fp(Xp)] = 1
  Rayleigh:
     V[ f1(X1) + ... + fp(Xp) ]
     ---------------------------  =  min_{f1,...,fp}
     V[f1(X1)] + ... + V[fp(Xp)]
  If the minimum is near zero, we will have found an approximate implicit equation for the data.
- We know this must result in an eigen problem:
  . Subspace projection smoothers: Expand all variables in bases, so  fj(Xj) = Xbas.j aj
    ==> GCA problem
          | Xbas.1 a1 + ... + Xbas.p ap |^2
        ---------------------------------------  =  min_{a1,...,ap}
        | Xbas.1 a1 |^2 + ... + | Xbas.p ap |^2
  .
Application: + Use the basis generators from earlier: basis.generator.poly <- function(x, nn=3) { # x.qr <- qr(sapply(1:nn, function(j) x^j)) # scale(qr.Q(x.qr)[,seq(x.qr$rank),drop=F]) } scale(poly(x, min(nn,length(unique(x))-1))) } basis.generator.bsp <- function(x, nn=2) { library(splines) x.qr <- qr(bs(x, knots=setdiff(unique(quantile(x, prob=(1:nn)/((nn+1)))), range(x)), Boundary.knots=range(x), degree=3, intercept=F)) scale(qr.Q(x.qr)[,seq(x.qr$rank),drop=F]) } + Apply gca: apc.bases <- function(X, basis.generator, ord=1, minimize=T) { Xs <- scale(X) X.bases <- lapply(seq(ncol(X)), function(j) basis.generator(Xs[,j])) names(X.bases) <- colnames(X) X.gca <- gca(X.bases) # The actual computation if(minimize) ord <- ncol(X.gca$vectors)+1-ord coefs <- tapply(X.gca$vectors[,ord], names(X.gca$vectors[,ord]), function(x) x) # print(coefs) Xt <- sapply(seq(ncol(X)), function(j) { sel <- grep(paste("[(]",j,"[)]",sep=""), names(X.gca$vectors[,ord])) X.bases[[j]] %*% X.gca$vectors[sel,ord] } ) colnames(Xt) <- colnames(X) tmp <- list(X=X, Xt=Xt, eval=X.gca$values[ord], all.evals=X.gca$values, ord=ord) class(tmp) <- "apc" tmp } plot.apc <- function(apc, jitter=F, fac=1) { par(mfrow=c(3,5), mgp=c(1.5,.5,0), mar=c(3,3,1,1)) ylim <- range(apc$Xt) for(j in seq(ncol(apc$X))) { if(jitter) { x <- jitter(apc$X[,j], fac); y <- jitter(apc$Xt[,j], fac) } else { x <- apc$X[,j]; y <- apc$Xt[,j] } plot(x, y, type="n", ylim=ylim, xlab=colnames(apc$X)[j], ylab="") abline(h=0, col="gray") points(x, y, pch=16, cex=.8) text(x=par("usr")[1], y=par("usr")[4], adj=c(-.1,1), lab=paste("sd =",round(sd(apc$Xt[,j]),3)) ) } } + Polynomial bases example: boston.apc.poly <- apc.bases(boston, basis.generator.poly, ord=1) boston.apc.poly <- apc.bases(boston, basis.generator.poly, ord=2) plot(boston.apc.poly); round(boston.apc.poly$all.evals,3); round(boston.apc.poly$eval,3) + B-spline bases example: boston.apc.bsp <- apc.bases(boston, basis.generator.bsp, ord=1) boston.apc.bsp <- apc.bases(boston, basis.generator.bsp, ord=2) boston.apc.bsp <- apc.bases(boston, basis.generator.bsp, ord=3) plot(boston.apc.bsp); boston.apc.bsp$all.evals; boston.apc.bsp$eval; + Comments: o B-splines (ord=1) generate a meaningless solution (constants) with zero eigenvalue o Polynomials (ord=1) and B-splines (ord=2) generate a solution involving INDUS, RAD, TAX, like in STAT 541 with linear smallest PCA o Polynomials (ord=2) and B-splines (ord=3) generate an interesting solution involving many variables, including CRIME, NOX and MEDV - APC with quadratic penalties: . Rayleigh quotient: |f1 + ... + fp|^2 + f1^T Omega1 f1 + ... + fp^T Omegap fp ----------------------------------------------------------------- = min_{f1,...,fp} |f1|^2 + ... + |fp|^2 + f1^T Omega1 f1 + ... + fp^T Omegap fp . Stationary/normal equations: gradient wrt fj [(f1 + ... + fp) + Omegaj fj] * ... - ... * [fj + Omegaj fj] = 0 [(f1 + ... + fp) - fj] ~ (I + Omegaj) fj . Suggested iterative algorithm: fj <- (I+Omegaj)^{-1} [(f1 + ... + fp) - fj] (for all j=1...,p) = Sj [(f1 + ... + fp) - fj] ==> Power algorithm with a big matrix (Np)x(pN) matrix S.super. Problem for you: Write down S.Super . Unfortunately these iterations converge to the max of the Rayleigh quotient, not the minimum. Q: How can we find the minimum eval? 
A: Flip the eigenvalue profile (spectrum) If alpha >= max evals of S.super, then the smallest eigenvalue of S.super corresponds to the largest eigenvalue of alpha*I - S.super ==> Power algorithm applied to alpha*I - S.super [It helps to have a good bound on max(evals(S.super)))] - APC iterations: We generalize the suggested iterations to work with arbitrary smoothers. apc.iter <- function(X, smoother, niter=50, minimize=T) { Xs <- scale(X) Xt <- Xs p <- ncol(X) N <- nrow(X) if(minimize) { alpha <- (p+1)/2 } else { alpha <- 0 } for(iter in 1:niter) { Xsum <- rowSums(Xt) Xt <- sapply(1:p, function(j) alpha*Xt[,j] - smoother(Xs[,j], Xsum-Xt[,j]) ) # s <- sd(c(Xt)); if(s<1E-16) cat(" sd=0\n") Xt <- Xt / (s*sqrt(N)) num.na <- sum(is.na(Xt)); num.inf <- sum(abs(Xt)==Inf) if(num.na>0 | num.inf>0) cat("#NA =",num.na," #Inf =",num.inf,"\n") } Xt <- scale(Xt, center=T, scale=F) colnames(Xt) <- colnames(X) tmp <- list(X=X, Xt=Xt, eval=sum(rowSums(Xt)^2)/sum(Xt^2)) class(tmp) <- "apc" tmp } # Wrapper for polynomial regression: smoother.poly <- function(x, y) { fitted(lm(y ~ x + I(x^2) + I(x^3))) } # Wrapper for smoothing spline: smoother.smspl <- function(x, y) { x <- x+runif(length(x),-sd(x)/100,+sd(x)/100) ord <- order(x) ty <- smooth.spline(x=x[ord], y=y[ord], df=4, tol=diff(range(x))*1E-04)$y ty[ord] <- ty ty } # Wrapper for lowess(): it orders according to x smoother.low <- function(x, y) { x <- x+runif(length(x),-sd(x)/100,+sd(x)/100) ord <- order(x) ty <- lowess(x[ord], y[ord])$y ty[ord] <- ty # Undo the ordering according to x ty } # Wrapper for supsmu: uniques and sorts the x values ==> jitter them randomly and unsort smoother.sup <- function(x, y) { x <- x+runif(length(x),-sd(x)/100,+sd(x)/100) ord <- order(x) ty <- supsmu(x[ord], y[ord])$y ty[ord] <- ty # Undo the ordering accordingn to x ty } - Examples: boston.apc.iter.poly <- apc.iter(boston, smoother.poly) windows(w=8, h=6); plot(boston.apc.iter.poly); boston.apc.iter.poly$eval plot(0:1,0:1, type="n", xaxt="n", yaxt="n", xlab="", ylab="", main="Poly") boston.apc.iter.smspl <- apc.iter(boston, smoother.smspl) windows(w=8, h=6); plot(boston.apc.iter.smspl); boston.apc.iter.smspl$eval plot(0:1,0:1, type="n", xaxt="n", yaxt="n", xlab="", ylab="", main="SmSpline") boston.apc.iter.low <- apc.iter(boston, smoother.low) windows(w=8, h=6); plot(boston.apc.iter.low); boston.apc.iter.low$eval plot(0:1,0:1, type="n", xaxt="n", yaxt="n", xlab="", ylab="", main="Lowess") boston.apc.iter.sup <- apc.iter(boston, smoother.sup) windows(w=8, h=6); plot(boston.apc.iter.sup); boston.apc.iter.sup$eval plot(0:1,0:1, type="n", xaxt="n", yaxt="n", xlab="", ylab="", main="SupSmu") We see here what corresponds to ord=2 and ord=3 for poly and B-spline bases solutions. ---------------------------------------------------------------- * TRANSFORMATIONAL MA 3: INDEPENDENT COMPONENT ANALYSIS (ICA) AND SMALLEST ADDITIVE PRINCIPAL COMPONENTS (APC) - Externalities: . Bach and Jordan (2002) proposed 'kernel ICA' based on APCs. - What is ICA? . It is the search for 'INDEPENDENT linear sources'. . 'Linear source' = linear combs. in multivariate language = latent variable linearly related to observables (but then estimated as a lin.comb. ...) . 'Independent' = stochasticaly independent, not just uncorrelated . The "model" is in random vector language: Xvec = A Yvec where Yvec[1],...,Yvec[p] are unknown sources that are INDEPENDENTLY but generally NOT IDENTICALLY distributed. The fixed parameter matrix A (pxp) must be non-singular. . 
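  A tiny generative illustration of this model (a sketch; the two sources and the mixing
  matrix A below are made up):
    set.seed(1)
    N <- 1000
    Y.src <- cbind(runif(N), rexp(N))         # two independent, non-identically distributed sources
    A <- matrix(c(1, .5, .3, 1), 2, 2)        # a non-singular mixing matrix (made up)
    X.obs <- Y.src %*% t(A)                   # observed rows: Xvec = A Yvec, case by case
    pairs(cbind(X.obs, Y.src), labels=c("X1","X2","Y1","Y2"), pch=16, cex=.3, gap=0)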
One should assume ICAs do not generally exist, but still one can try to find out how close to independent one can find p linear combinations to be. - Simplifying pre-processing: sphering !!! . Wlog we can assume that the components of Yvec are standardized: V[Yvec] = I_p The components of Yvec must be uncorrelated: Independent ==> uncorrelated. . Wlog we can also assume the p-variate random vector is sphered: V[Xvec] = I_p (Well, empirical sphering is really only an estimate of population sphering, but that''s ok.) . Q: If Xvec and Yvec are sphered, then this implies what for the columns of A? A: ... Reason: I = V[Xvec] = V[A Yvec] = A V[Yvec] A^T = A A^T ------------------- ==> | A is orthogonal | ------------------- ==> ICA searches for an o.n. basis or rotation U of R^p such that the transformed Yvec = Xvec U has as independent columns as possible. - Bach&Jordan''s proposal: . Use this fact: ----------------------------------------------- | Yvec has pairwise independent components | | <==> | | Cor[(f1(Yvec.1),...,fp(Yvec.p))] = I_p | | FOR ALL (nonlinear) fj. | ----------------------------------------------- . Note: Pairwise independence is not the same as full independence!!! But: Full independence ==> Pairwise independence Hence: B&J are only looking for a necessary but not sufficient condition for ICA . Connection with population APCs: Cor[(f1(Yvec.1),...,fp(Yvec.p))] = I_p for all fj(Yvec.j) <==> min_{f1,...,fp} min.eval[Cor[(f1(Yvec.1),...,fp(Yvec.p))]] = 1 <==> max_{f1,...,fp} max.eval[Cor[(f1(Yvec.1),...,fp(Yvec.p))]] = 1 <==> max_{f1,...,fp} det[Cor[(f1(Yvec.1),...,fp(Yvec.p))]] = 1 The first two criteria lend themselves for APCs. Bach&Jordan use the third criterion. . Proposal for ICA: solve one of the following problems (1) max_{U in SO(p)} min_{f1,...,fp} min.eval[Cor[(f1((U Xvec).1),...,fp(U Xvec).p)]] (2) min_{U in SO(p)} max_{f1,...,fp} max.eval[Cor[(f1((U Xvec).1),...,fp(U Xvec).p)]] (3) max_{U in SO(p)} min_{f1,...,fp} det[Cor[(f1((U Xvec).1),...,fp(U Xvec).p)]] Bach&Jordan: They minimize -log[det[...]], hence maximize det[...] wrt U. Hence they choose proposal (3). . [Why does (3) work? Reason: det = product of evals, but sum(evals) = p For any p numbers a1,...,ap>0 we have: max_{a1+...+ap=p, aj>=0} a1*...*ap == 1 <==> aj=1 for all j The product function is concave and permutation symmetric, hence its max is attained at the center of the symplex {a1+...+ap=p, aj>=0}. ] . For all proposals the closeness of the actual criterion value to +1 at the optimum indicates how well we have found Independent Components. - A crude R implementation: . Make up some artificial data: N <- 2000 Y <- scale(cbind("Norm"=rnorm(N), "t5"=rt(N,df=5), "Chi7"=rchisq(N,df=7), "Exp"=rexp(N), "Unif"=runif(N))) p <- ncol(Y) rg <- range(c(Y)) pairs(Y, pch=16, cex=.5, xlim=rg, ylim=rg, gap=0) . Set the true A=I, so X=Y, but start with a random rotation A, so the algorithm should find back to A=I. X <- Y . Use (1) or (2) above as the criterion function: ica.crit <- function(A) { # Orthonormalize the argument: A.o.n. <- qr.Q(qr(matrix(A, nrow=sqrt(length(A))))) X <- Y %*% A.o.n. 
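         # nlm() below passes 'A' as a plain numeric vector, hence the reshape to a square
         # matrix and the re-orthonormalization with qr.Q(): the search is effectively over rotations.
         # The value returned next is the largest nonlinear APC/GCA eigenvalue of the rotated data
         # (assuming gca() returns eigenvalues in decreasing order), so minimizing it with nlm()
         # implements proposal (2) above.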
# Some alternatives as APC building blocks: uncomment one at a time apc.bases(X, basis.generator.poly, minimize=F)$eval # apc.bases(X, basis.generator.bsp, minimize=F)$eval } # Initalize: ica.init <- qr.Q(qr(matrix(rnorm(p^2), ncol=p))) # Test the function: ica.crit(ica.init) # This function 'thinks' a long time: ica.nlm <- nlm(ica.crit, ica.init) # Local minimum value: c(init=ica.crit(ica.init), converged=ica.nlm$minimum) # Local minimum solution: round(matrix(ica.nlm$estimate, ncol=p), 2) # raw round(qr.Q(qr(matrix(ica.nlm$estimate, ncol=p))), 2) # o.n. # Some directions are found, but others not. - Summary: Not sure that ICA is a viable technique. Strongly depends on tail behavior of distributions. ==> Expect extreme non-robustness. File it away as an interesting concept. ---------------------------------------------------------------- * FUNCTIONAL MULTIVARIATE ANALYSIS (just briefly) - What makes a data analysis situation 'functional'? . We''re estimating discretized functions/signals/processes on a domain. . Typical domains: time, space, frequency . We only consider the case where all cases are sampled at the same locations in the domain. Examples: daily stock returns (case=company) mid-day temperatures at 200 locations across the US - What makes data two-way functional? A: In the data table both rows and columns have underlying domains. Ex.: Daily mid-day temperatures at 200 locations for 20 years rows: locations, cols: days - Multiple time series example: montly average temperatures in Philly 1950-199 Organized as multiple time series: 50 12-month series, or 12 50-yr series #X <- as.matrix(read.table("c:/D/DATA/TemperaturesAndPrecipitations/Philly-temp.txt")[,4:15]) X <- as.matrix(read.table("http://stat.wharton.upenn.edu/~buja/STAT-926/Philly-temp.txt")[,4:15]) str(X) X.svd <- svd(X - mean(X)) par(mfrow=c(2,4), mgp=c(1.5,.5,0), mar=c(3,3,1,1)) for(j in 1:4) plot(-X.svd$v[,j], pch=16, type="b") for(j in 1:4) plot(-X.svd$u[,j], pch=16, type="b") X.svd$d (Idiosyncratic remarks about this dataset: In light of the - One would want the yearly u-vectors to be smoother. Start with the svd as a low-rank approximation problem: | X - d u v^T |^2 (Frobenius matrix norm) - Aproach 1: Subspace restriction on u . Algebra: u = B a, where B is an o.n. basis: B^T B = I_d, hence |B a|^2 = |a|^2 hence |a|^2 = 1 due to |u|^2 = 1 Let H = B B^T be the hat matrix for the projection onto span(B). Then: | X - d u v^T |^2 = | X - d (Ba) v^T |^2 = | X - d B a v^T |^2 = | (I-H)X + HX - d B a v^T |^2 = | (I-H)X |^2 + | HX - d B a v^T |^2 ~ const + | B B^T X - d B a v^T |^2 ~ const + | B^T X - d a v^T |^2 ==> Projection of the X columns onto B, run an svd on the projection, reconstitute the original. . R code: # Orthogonal polynomials: B <- cbind(1,poly(1:nrow(X), degree=5)) # B-spline alternative: library(splines) B <- qr.Q(qr(bs(1:nrow(X), df=5))) # Common code: X.B <- crossprod(B, X-mean(X)) X.B.svd <- svd(X.B) X.B.svd.u <- B %*% X.B.svd$u par(mfrow=c(2,4), mgp=c(1.5,.5,0), mar=c(3,3,1,1)) for(j in 1:4) plot(-X.B.svd$v[,j], pch=16, type="b") for(j in 1:4) plot(-X.B.svd.u[,j], pch=16, type="b") X.B.svd$d Unsatisfactory, both with polynomials and with B-splines. - Approach 2: Quadratic penalty approach -- one way . Algebra: | X - d u v^T | + |D u|^2 | X - d u v^T | + u^T Omega u where Omega = D^T D ==> Does not work, or is conceptually odd. We are penalizing the smoothness of unit vector... Proposal: Absorb d in u and/or v and look for a penalty that is invariant under rescaling u -> cu, v -> v/c. 
| X - u v^T | + u^T Omega u * |v|^2 = | X |^2 - 2 tr[ X^T u v^T ] + |u|^2 |v|^2 + u^T Omega u |v|^2 = const - 2 tr[ X^T u v^T ] + [ u^T (I + Omega) u ] |v|^2 ~ - 2 tr[ X^T u v^T ] + [ u^T (I + Omega) u ] |v|^2 Define and substitute: S = (I + Omega)^{-1} = smoother matrix S.1/2 = (I + Omega)^{-1/2} = half-smoother matrix uu = (I + Omega)^{1/2} u u = S.1/2 uu = - 2 tr[ X^T S.1/2 uu v^T ] + |uu|^2 |v|^2 ~ | S.1/2 X |^2 - 2 tr[ X^T S.1/2 uu v^T ] + |uu|^2 |v|^2 = | S.1/2 X - uu v^T |^2 Recipe: Half-smooth the columns of X. Run an svd on the result, obtain uu and v. Half-smooth uu: u = S.1/2 uu . R code: diff.2nd.mat <- function(N) { diff.1st.mat(N)[-(N-1),] - diff.1st.mat(N)[-1,] } diff.2nd.mat(6) # Toy example N <- nrow(X) D <- diff.2nd.mat(N) Omega <- crossprod(D) Omega.eig <- eigen(Omega) lambda <- 10 # then try 1, then 10, then 100, then 1000... S <- Omega.eig$vec %*% diag(1/(1 + lambda*Omega.eig$val)) %*% t(Omega.eig$vec) S.half <- Omega.eig$vec %*% diag(1/sqrt(1 + lambda*Omega.eig$val)) %*% t(Omega.eig$vec) X.half <- S.half %*% (X-mean(X)) X.half.svd <- svd(X.half) X.half.svd.u <- S.half %*% X.half.svd$u par(mfrow=c(2,4), mgp=c(1.5,.5,0), mar=c(3,3,1,1)) for(j in 1:4) plot(-X.half.svd$v[,j], pch=16, type="b") for(j in 1:4) plot(-X.half.svd.u[,j], pch=16, type="b") X.half.svd$d - Approach 2: Quadratic penalty approach -- two way What if we also want to smoothe the second domain (the other dimension)? What two-way penalty would you propose? | X - d u v^T | + u^T Omega.u u + v^T Omega.v v ==> Same violation of scale invariance Next try: (*) | X - u v^T | + u^T Omega.u u * |v|^2 + |u|^2 * v^T Omega.v v This doesn''t lead to the obvious procedure either: We expect that the proper penalty results in two-way half-smoothing. Criterion (*) doesn''t. Here is what we need: (**)| X - u v^T | + u^T Omega.u u * |v|^2 + |u|^2 * v^T Omega.v v + u^T Omega.u u * v^T Omega.v v If you do the math, you will find that the two-way substitution with half-smoothing results in the solution of this criterion: S.u = (I + Omega.u)^{-1} = smoother matrix for u S.v = (I + Omega.v)^{-1} = smoother matrix for v S.u.1/2 = (I + Omega.u)^{-1/2} = half-smoother matrix for u S.v.1/2 = (I + Omega.v)^{-1/2} = half-smoother matrix for v uu = (I + Omega.u)^{1/2} u u = S.u.1/2 uu vv = (I + Omega.v)^{1/2} v v = S.v.1/2 v (**) = - 2 tr[ X^T S.1/2 uu v^T ] + |uu|^2 |v|^2 Recipe: Half-smooth the columns of X. Half-smooth the rows of X. Run an svd on the result, obtain uu and vv. Half-smooth uu and half-smooth vv. Done R code: diff.2nd.mat <- function(N) { diff.1st.mat(N)[-(N-1),] - diff.1st.mat(N)[-1,] } diff.2nd.mat(6) # Toy example N <- nrow(X) p <- ncol(X) D.u <- diff.2nd.mat(N) D.v <- diff.2nd.mat(p) Omega.u <- crossprod(D.u) Omega.v <- crossprod(D.v) Omega.u.eig <- eigen(Omega.u) Omega.v.eig <- eigen(Omega.v) lambda.u <- 10 # then try 1, then 10, then 100, then 1000... lambda.v <- 1000000 # then try 1, then 10, then 100, then 1000... 
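# (An alternative to trial and error for lambda.u and lambda.v: choose each lambda so that the
#  effective df, sum(1/(1 + lambda.u*Omega.u.eig$val)) resp. sum(1/(1 + lambda.v*Omega.v.eig$val)),
#  hits a target value, as was done for the one-way smoothing spline earlier.)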
S.u <- Omega.u.eig$vec %*% diag(1/(1 + lambda.u*Omega.u.eig$val)) %*% t(Omega.u.eig$vec) S.v <- Omega.v.eig$vec %*% diag(1/(1 + lambda.v*Omega.v.eig$val)) %*% t(Omega.v.eig$vec) S.u.half <- Omega.u.eig$vec %*% diag(1/sqrt(1 + lambda.u*Omega.u.eig$val)) %*% t(Omega.u.eig$vec) S.v.half <- Omega.v.eig$vec %*% diag(1/sqrt(1 + lambda.v*Omega.v.eig$val)) %*% t(Omega.v.eig$vec) X.half <- S.u.half %*% (X-mean(X)) %*% S.v.half X.half.svd <- svd(X.half) X.half.svd.u <- S.u.half %*% X.half.svd$u X.half.svd.v <- S.v.half %*% X.half.svd$v par(mfrow=c(2,4), mgp=c(1.5,.5,0), mar=c(3,3,1,1)) for(j in 1:4) plot(-X.half.svd.v[,j], pch=16, type="b") for(j in 1:4) plot(-X.half.svd.u[,j], pch=16, type="b") X.half.svd$d - Cyclic penalty for months: D.v <- diff.2nd.mat(p) rot <- function(x,n) c(x[-seq(n)],x[seq(n)]) D.v <- rbind(rot(D.v[1,],2), rot(D.v[1,],1), D.v) D.v # Looks right Now re-run the above code except the definition of D.v. - Note on the SVD approach: This is a mean model for the table X !!! Neither of the two 'dimensions' (months, years) is randomly sampled. So this breaks out of the mold of MVA where variance is decomposed. The SVD approach above is much more a form of latent mean model. One form of theory assumes X = U.r D.r V.r^T + Z E[X] = U.r D.r V.r^T where Z is a table of same size as X filled with iid N(0,sigma^2) and r is the 'true rank' of the table of expectated values. - Some obvious EDA we haven''t performed so far: . Plot row and column means of the data. plot(apply(X, 2, mean), type="b", pch=16, lwd=2) plot(apply(X, 1, mean), type="b", pch=16, lwd=2) Oh boy, this doesn''t quite look like global cooling at all... So where does the u-pattern come from in the SVD???? . Note that SVDs essentially estimate product interactions. That is, an SVD mean model decomposes the data table into a hierarchy of product interactions: X[i,j] ~ sum u[i]*v[j]. ==> Maybe it makes more sense to apply an SVD to the residuals from an additive fit, i.e., after removing row- and col-means. X.mean <- mean(X) X.row <- apply(X-X.mean,1,mean) X.col <- apply(X-X.mean,2,mean) X.res <- sweep(sweep(X, 1, X.row), 2, X.col) - X.mean X.res.svd <- svd(X.res) round(X.res.svd$d,5) # Sums of squares accounted for by the SVD product terms and the row/col effects: round(sum(X.col^2)*50,5) # SS due to Months round(sum(X.row^2)*12,5) # SS due to Years round(X.res.svd$d^2,5) # SS due to product interactions # ==> # Look at them: par(mfrow=c(2,4), mgp=c(1.5,0.5,0), mar=c(2.5,2.5,.5,.5)) plot(X.col, type="b", pch=16, xlab="Months", ylab="Temp") for(j in 1:3) plot(X.res.svd$v[,j], type="b", pch=16, xlab="Months", ylab="Temp") plot(X.row, type="b", pch=16, xlab="Years", ylab="Temp") for(j in 1:3) plot(X.res.svd$u[,j], type="b", pch=16, xlab="Years", ylab="Temp") These pictures are in sore need of smoothing, especially for the years. - Meta-observations: . In the two-way regularization problem we 'knew' what the 'obvious' algorithm was: Half-smooth both rows and columns of X, perform an SVD on the result, then half-smooth both the u- and v-vectors. . The problem was what criterion this algorithm minimizes. We managed to reverse-engineer the criterion from the algorithm. It seemed to make sense. . The minimization criterion can then be used for generalizations. Example: Propose one-way and two-way lasso-penalized SVDs! ... (squared lasso) ... ---------------------------------------------------------------- - FUNCTIONAL PCA: . So when do we have a functional PCA situation? 
    o The temperature data considered so far are two-way functional,
      and there is no randomly sampled dimension to provide variance.
    o For multivariate analysis we need one index to represent random samples.
      Examples:
      + Body temperature time series on randomly sampled individuals
      + Stock return series on randomly sampled small businesses
      + Energy consumption time series of randomly sampled residences
      + DNA sequences on randomly sampled patients

  . Subspace fPCA:

        V[ Xvec^T a ]
        -------------     max/min/stationary     subject to:  a = B aa
           |a|^2

  . Penalized fPCA:
    Once again you can use the null comparison principle.
    What is the numerator criterion under the assumption of absent correlation?

        V[ Xvec^T a ] + a^T Omega a
        ---------------------------     max/min/stationary             (*)
           |a|^2 + a^T Omega a

    assuming the variables are standardized.  Else:

        V[ Xvec^T a ] + a^T Omega a
        ----------------------------    max/min/stationary
        sum aj^2*V[Xj] + a^T Omega a

  . If the null assumption includes equal variances,
    then (*) is the criterion without standardization.

- Functional CCA: subspace restricted or penalized

  . Subspace fCCA:

        V[ Xvec^T a + Yvec^T b ]
        -------------------------    max/min/stationary    subject to:  a = B.a aa,  b = B.b bb
        V[Xvec^T a] + V[Yvec^T b]

  . Penalized fCCA:

        V[ Xvec^T a + Yvec^T b ] + a^T Omega.X a + b^T Omega.Y b
        -------------------------------------------------------------    max/min/stationary
        V[ Xvec^T a ] + V[ Yvec^T b ] + a^T Omega.X a + b^T Omega.Y b

- Functional GCA: subspace restricted or penalized ... you get it...

- Q: Which is more in need of regularization, PCA or CCA?
  A: ...

- Curiosity: B. Silverman co-authored two approaches to fPCA,
  neither of which agrees with the above.  Here they are:

        V[ Xvec^T a ] - a^T Omega a
        ---------------------------    max/stationary    (Rice & Silverman 1991)
                 |a|^2

          V[ Xvec^T a ]
        -------------------    max/stationary    (Silverman 1994)
        |a|^2 + a^T Omega a

  Both approaches work only for the upper end of the PCA spectrum.

----------------------------------------------------------------

* Self-serving literature:
  - Buja, Hastie, Tibshirani (1989, AS, w discussion)
    "Linear Smoothers and Additive Models"
  - Buja (1990, AS)
    "Remarks on Functional Canonical Variates, Alternating Least Squares Methods and ACE"
  - Hastie, Tibshirani, Buja (1994, JASA)
    "Flexible Discriminant Analysis by Optimal Scoring"
  - Hastie, Buja, Tibshirani (1995, AS)
    "Penalized Discriminant Analysis"
  - Donnell, Buja, Stuetzle (1994, AS, with discussion)
    "Analysis of Additive Dependencies and Concurvities Using Smallest Additive Principal Components"
  - Tan, Buja, Ma (2014)
    "Regularized Additive Principal Components"
  - Huang, Shen, Buja (2009, JASA)
    "The Analysis of Two-Way Functional Data Using Two-Way Regularized Singular Value Decompositions"

* Real literature:
  - Leurgans and Moyeed on penalized CCA:
    fully agrees with our penalized CCA above
  - Silverman (and Rice) on regularized PCA:
    interesting: neither gets the penalizations 'right'
  - books by Ramsay and Silverman (different approach)
  and much more ...

================================================================
#
# K-MEANS CLUSTER ANALYSIS
#
# The K-means clustering algorithm tries to divide the data
# into a pre-specified number of groups (K) in such a way
# that the groups are somehow homogeneous and separated
# from each other if possible.
#
# The algorithm works by placing K tentative cluster centers
# into data space.  The algorithm then moves the centers
# around till a certain criterion is minimized.
# The criterion is the sum of squared distances to the
# nearest cluster center:
#
#   sum_{i=1..N} min_{j=1..K} || x_i - m_j ||^2   -->   min over m_1,...,m_K
#
# where m_1,..., m_K are the cluster centers in p-space
# and x_1,..., x_N are the N data vectors/rows/cases/units/records.
# Obviously, the k-means clustering algorithm assumes quantitative
# variables.
# About the algorithm: How are the centers m_j moved around?
# Here are some heuristics: Each data row x_i "belongs" to one of the m_j,
# namely, the one that is nearest to x_i.
# Now turn things around: m_j has a collection of x_i's that "belong" to it
# because m_j is the nearest center.
# Now hold the collection of these x_i's fixed and ask: where should m_j
# be located to minimize the criterion?  Answer: m_j should be the
# mean of "its" x_i's.  That's because we are using a sum-of-squares criterion!
# So let's move m_j to the mean of "its" x_i's, and do it for all centers m_j.
# Having done this, we should wonder whether the assignments of x_i's to
# nearest m_j's might not have changed.  In fact, it will have changed,
# so we should do a re-assignment of the x_i's.
# Now you see how the game repeats:
#   Given centers m_j, assign the x_i's to nearest m_j.
#   Replace the m_j's with the means of "their" x_i's.
#   Having new centers m_j, re-assign the x_i's to nearest m_j.
#   ... till convergence.
# Is k-means a "clustering" algorithm?  Maybe it is better to think
# of it as a "partitioning" algorithm.  It partitions the data into
# as many groups as you tell it to.  Let's not ask the question
# of the "correct number of clusters" yet.  In fact, k-means is
# often applied when there are no clusters at all.  In marketing,
# for example, clustering really means "segmentation", that is,
# carving up into homogeneous segments, not natural clusters.
# There may exist better methods for finding natural clusters
# than k-means.
# Example in R: function 'kmeans()', data: telecom marketing
# We use a random selection of K=4 rows as initialization of
# the cluster centers.
dat <- scale(mktg[,-9])
N <- nrow(dat)
p <- ncol(dat)
K <- 4   # decide on 4 clusters/segments
mktg.km <- kmeans(dat, iter.max=1000, centers = dat[sample(N, size=K),] )
# Note that the number of clusters is implicit in the dimensions of 'centers';
# no need for a special argument for K.
# Examine the returned data structure:
class(mktg.km)
names(mktg.km)
# The following lists the breakdown into cluster sizes:
mktg.km$size
# The final centers are the following:
mktg.km$centers
# The assignment of the rows of 'mktg' to the clusters is in:
mktg.km$cluster
# But this grouping variable should be redundant with the size component:
table(mktg.km$cluster);  mktg.km$size   # Indeed.
# In what follows we will use a repackaged function that
# picks K random rows for initialization.  It is loaded with
source("http://www-stat.wharton.upenn.edu/~buja/STAT-541/segs.R")
# source("functions/segs.R")   # Instructor's local source
# and its call is 'k.means(x, K=4)'.  (Look at the content.)
# It has functions also to align multiple clusterings,
# choosing labels as best matched as possible to each other.
# (A stand-alone sketch of this alignment idea follows below.)
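# The label alignment just mentioned is essentially a small matching problem:
# relabel a clustering so that it agrees as much as possible with a reference
# clustering.  Here is a minimal stand-alone sketch of the idea (NOT the code
# in 'segs.R'; the function name 'align.labels' is made up here):
# for small K we can simply search all K! relabelings.
align.labels <- function(clust, ref, K=max(ref)) {
  perms <- function(v) {                   # all permutations of a short vector
    if(length(v) <= 1) return(list(v))
    out <- list()
    for(i in seq_along(v)) for(p in perms(v[-i])) out <- c(out, list(c(v[i], p)))
    out
  }
  best <- clust;  best.err <- sum(clust != ref)
  for(p in perms(1:K)) {                   # p[j] = new label for old label j
    cand <- p[clust]
    err <- sum(cand != ref)
    if(err < best.err) { best <- cand;  best.err <- err }
  }
  best
}
# Quick check on made-up labels: a random relabeling should be undone exactly.
# cl <- sample(1:4, 100, replace=TRUE);  ref <- sample(1:4)[cl]
# table(align.labels(cl, ref), ref)       # all mass on the diagonal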
# # We'll look at the clusters in the first four principal component: mktg.pca <- pca(mktg[,-9]) # So here are the first 4 PCs colored according to windows(10,10); par(mgp=c(1.8,.5,0), mar=c(3,3,2,1)) xlim <- ylim <- range(mktg.pca$scores) pairs(mktg.pca$scores[,1:4], pch=16, cex=.2, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[mktg.km$cluster]) # our clustering:^^^^^^^^^^^^^^^ # Now this looks different from the original clustering/segmentation: windows(10,10); par(mgp=c(1.8,.5,0), mar=c(3,3,2,1)) pairs(mktg.pca$scores[,1:4], pch=16, cex=.2, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[mktg[,9]]) # the original clustering:^^^^^^^^ # Now let us compare a sequence of clusterings (we now omit the axes): par(mfrow=c(4,4), mar=rep(.5,4)) for(i in 1:16) { clust <- k.means(scale(mktg[,-9]), K=4)$cluster # using random restarts plot(mktg.pca$scores[,1:2], pch=16, cex=.2, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n", xlab="", ylab="", col=c("red","green","blue","brown")[clust]) } # There is a distraction in that the clusters are not matched in colors. # This can be helped: our 'k.means()' function can be given a clustering # vector, and the newly created solution will be immediately aligned # with the given clustering. This is done by giving 'k.means()' the # argument 'K=clustering.to.be.aligned.with'. In fact, the full name # of this argument is 'K.or.align'. If this argument is given, then # K is inferred from it. # # Here are the original clustering with 15 randomly initialized clusterings # aligned with the original clustering: plot(mktg.pca$scores[,1:2], pch=16, cex=.2, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n", xlab="", ylab="", col=c("red","green","blue","brown")[mktg[,9]]) # the initial segmentation for(i in 1:15) { # alignment below... clust <- k.means(dat, K.or.align=mktg[,9])$cluster # ^^^^^^^^ align colors with the original segs plot(mktg.pca$scores[,1:2], pch=16, cex=.3, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n", xlab="", ylab="", col=c("red","green","blue","brown")[clust]) } # So there seems to exist a degree of dependence on the initialization! # Comparing the 16 plots, however, it appears that there are several # clusterings that look identical. So maybe there are only a few discrete # local solutions for the kmeans criterion? # # If the k-means algorithm finds different solutions, how do they # perform in terms of the k-means criterion? Let's collect a # list of 1000 clusterings and analyze the criterion values: mktg.km.lst <- list() # takes a moment: <2sec/100 kmeans for(i in (length(mktg.km.lst)+1):1000) { if(i%%100==0) cat(i,"") mktg.km.lst[[i]] <- k.means(dat, K=mktg[,9]) } # (Among many calls to kmeans() it can happen that one of them crashes; # rerun, and it will continue from where it crashed. This is fail-safe code.) # # Peel off the criterion values: mktg.km.crit <- sapply(mktg.km.lst, function(x) sum(x$withinss)) # Look at them: windows(10,10) hist(mktg.km.crit, breaks=10000, col="gray", xlim=c(9100,9700)) # Oh, there are discrete sets of values that appear many times over!! 
# In fact, there are only few different values of the criterion: length(unique(mktg.km.crit)) ## [1] 29 # Let's collect them, sorted from smallest (best) to largest (worst): mktg.km.crit.uniq <- sort(unique(mktg.km.crit)) # and tabulate them: table(mktg.km.crit) 9240.92733734854 9313.91364263915 9313.91740353386 9313.97250010717 277 114 34 80 9314.0971039678 9319.76071481234 9319.8238068561 9319.91076889382 116 172 31 1 9336.89070097278 9336.90739225535 9336.9346563692 9370.29454375176 48 15 25 6 9370.4148403249 9377.17397780075 9377.31085261402 9449.83056805507 5 17 1 11 9450.00997410455 9450.41265146235 9487.3014103005 9487.32581863884 8 11 8 6 9488.15924588206 9491.312016306 9530.79434513218 9530.79668858917 1 2 5 4 9569.88679967423 9577.3159323069 1 1 # Turns out some of the values are nearly identical, such as the # 2nd, 3rd and 4th. # Could the values correspond to different geometric patterns of the # cluster centers? How could we confirm this hunch? Let's plot # in a 4x4 matrix layout again, but for one particular value # of the criterion at a time. Execute the following repeatedly, # but pick different values from the vector 'mktg.km.crit.uniq': icrit <- 1 # try for 1, 2, 3, ...; the values are sorted, best first! crit <- mktg.km.crit.uniq[icrit] windows() par(mfrow=c(4,4), mar=rep(.5,4)) xlim <- ylim <- range(mktg.pca$scores[,1:2]) for(i in 1:16) { clust <- mktg.km.lst[mktg.km.crit==crit][[i]]$cluster plot(mktg.pca$scores[,1:2], pch=16, cex=.2, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[clust], xaxt="n", yaxt="n", xlab="", ylab="") } text(x=par()$usr[2], y=par()$usr[3], lab=paste(icrit,":",crit), adj=c(1.1,-.2) ) # Interestingly, icrit=1, with crit = 9240.927, produces something # different from the segmentation that comes with the data in column 9. # A pattern like column 9 is found for icrit=2 with crit = 9313.914, # which is only second best! How come the analysts who produced # the segmentation of column 9 did not hit on the "best" clustering # in terms of the criterion? They probably did but preferred the # "second best" pattern anyway, most likely due to interpretability. # (In a moment we will see that the column 9 pattern is actually # worse: it belongs to the 5th largest criterion value...) # Next question: are the cluster assignments identical for all clusterings # with the same criterion value? So far the plots sure look like it. # In the following code we look at each criterion value at a time, # and for each value we check whether the cluster memberships are # identical. Realize that this exercise makes sense only because # we aligned the clusterings. Without alignment, identical clusterings # would differ by a random permutation of the cluster numberings. for(icrit in 1:length(mktg.km.crit.uniq)) { crit <- mktg.km.crit.uniq[icrit] # the unique sorted criterion values sel <- mktg.km.crit==crit # these are the solutions with criterion==crit if(sum(sel)>1) { # nothing to compare if only one clustering for this crit value clust1 <- mktg.km.lst[sel][[1]]$cluster # pick the first as reference for(i in 2:sum(sel)) # compare the others with the reference if(any(clust1 != mktg.km.lst[sel][[i]]$cluster)) # report only if different cat("differences found for",icrit,":", sum(clust1 != mktg.km.lst[sel][[i]]$cluster),"\n") }} # No complaint, all are identical clusterings in terms of case assignments! 
# Hence we can limit ourselves to one clustering per criterion value:
mktg.km.lst.red <- list()
for(icrit in 1:length(mktg.km.crit.uniq)) {
  crit <- mktg.km.crit.uniq[icrit]   # the unique sorted criterion values
  sel <- mktg.km.crit==crit          # these are the solutions with criterion==crit
  mktg.km.lst.red[[icrit]] <- mktg.km.lst[sel][[1]]   # pick the first; all others are equal
}
# check: length 29
length(mktg.km.lst.red)
# We lost the information about the number of times each clustering appeared in
# the 1000 random restarts, but this is not essential for what follows.
# From now on we're working with this reduced set that has redundant
# clusterings removed.
# Here is a plot of the first 16 of the 29 clusterings in the first 2 PCs:
par(mfrow=c(4,4), mar=rep(.5,4))
xlim <- ylim <- range(mktg.pca$scores[,1:2])
for(i in 1:16) {
  crit <- mktg.km.crit.uniq[i]
  clust <- mktg.km.lst.red[[i]]$cluster
  plot(mktg.pca$scores[,1:2], pch=16, cex=.2, xlim=xlim, ylim=ylim,
       xlab="", ylab="", xaxt="n", yaxt="n",
       col=c("red","green","blue","brown")[clust])
  text(xlim[2], ylim[2], lab=paste("crit:",round(crit,4)), adj=c(1,1))
  text(xlim[2], ylim[1], lab=paste("freq:", sum(mktg.km.crit==crit)), adj=c(1,0))
}           # freq of this crit value in multiple restarts:^^^^^^^^^^^^^^^^^^
# Well, the best solution with criterion ~ 9241 is numerically a league better
# than the others, but the next 7 patterns look very similar to each other.
# Let us now check which of the 29 clusterings has the same
# partitions as the column 9 segmentation by counting the
# mismatches in the cluster numberings.  Recall that this works
# because mktg[,9] is the reference for alignment of the numberings:
mktg.km.mis.9 <- sapply(mktg.km.lst.red, function(x) sum(x$cluster != mktg[,9]) )
names(mktg.km.mis.9) <- round(mktg.km.crit.uniq,4)   # use the criterion values as names
mktg.km.mis.9
# So, it is only the 5th that is identical to the column 9 clustering!
# Clusterings 2, 3, 4 are close, though, especially in view of N=1926.
# Let us get a more complete picture by forming a table of mismatch counts
# of all pairs of clusterings:
mktg.km.mis <- matrix(0, length(mktg.km.crit.uniq), length(mktg.km.crit.uniq))
for(i in seq(mktg.km.crit.uniq))
  for(j in seq(mktg.km.crit.uniq))
    mktg.km.mis[i,j] <- sum(mktg.km.lst.red[[i]]$cluster != mktg.km.lst.red[[j]]$cluster)
options(width=120)
mktg.km.mis
# We can make out patterns by going down the diagonal where we notice
# square blocks of small numbers that indicate groups of clusterings
# that are very close to each other.  Here is the 13x13 square from the top left:
mktg.km.mis[1:13,1:13]
#----------------
      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
 [1,]    0  810  809  806  814  791  794  792  823   821   821   836   835
 [2,]  810    0    3    4   10  104  105  101  492   490   494   604   615
 [3,]  809    3    0    7    7  101  102   98  491   489   493   603   614
 [4,]  806    4    7    0   14  108  109  105  491   489   493   607   618
 [5,]  814   10    7   14    0   94   95   91  490   488   492   600   611
 [6,]  791  104  101  108   94    0    6   10  429   433   437   546   557
 [7,]  794  105  102  109   95    6    0    6  424   428   432   541   552
 [8,]  792  101   98  105   91   10    6    0  426   430   434   543   554
 [9,]  823  492  491  491  490  429  424  426    0     6    10   207   218
[10,]  821  490  489  489  488  433  428  430    6     0     4   205   216
[11,]  821  494  493  493  492  437  432  434   10     4     0   202   213
[12,]  836  604  603  607  600  546  541  543  207   205   202     0    11
[13,]  835  615  614  618  611  557  552  554  218   216   213    11     0
#----------------
# We see that the first and best pattern is nothing like anything else.
# The next 4 patterns form a block and are all close to each other.
# Then comes a block of 3 patterns that are very close, # then another block of 3 patterns that are very close. # Did you notice what we did? We clustered the clustering patterns... # In light of this table it shouldn't be a surprise that clusterings # 2-5 looked about the same. Actually, even clusterings 6-9 have # small mismatch numbers with clusterings 2-5 percentagewise, # and even they look all alike. The next group of clusterings, 9-11, # look alike but different from everything else. # Summary points so far: # 1) K-means analysis has an issue (not a problem) of local minima, # and one needs methodology to get an overview of them: # collecting and analyzing many multiple restarts. # The analysis we used consisted of # a) listing and sorting the unique values of the criterion # b) checking that all clusterings for a given criterion value # are identical # c) plotting the clusterings of each criterion value as # colors in the first two principal components of the data # d) computing a mismatch matrix for all pairs of clusterings # e) comparing the mismatch matrix with the similarities among # the colored PC projection plots # 2) It seems the "best" clustering is not always the most interpretable. # At least that's what is suggested by the fact that the column 9 # segmentation that comes with the data is not numerically optimal. # Yet, the marketers who decided in its favor most likely had # interpretability in mind in their "suboptimal choice". # # 3) For comparing clusterings, one needs to align the numberings # of clusters across clusterings. But for alignment, one needs a # reference clustering. For the mktg data we used the column 9 # clustering as reference, but in other situations one may first # have to play with clusterings before one picks a reference for # aligning the other clusterings. #---------------- # # We continue with a systematic simulation study of # the geometry of kmeans clusterings and their (in)stabilities # Illustration: # Consider normal 2-D data with pre-defined correlations. # Plot the kmeans solutions for K=4 for varying correlations. N <- 20000 x1 <- rnorm(N); xa <- rnorm(N) # start with uncorrelated variables par(mfrow=c(6,6), mar=rep(.5,4))# for(co in seq(.00, .99, length=36)) { # loop over correlations ## for(co in seq(.6, .7, length=36)) { # zoom in on switch of pattern xx <- scale(cbind(x1,co*x1+sqrt(1-co^2)*xa)) xlim <- ylim <- 3.5*(c(-1,1)) clust <- kmeans(xx, xx[1:4,])$cluster plot(xx, xlim=xlim, ylim=ylim, pch=".", cex=0.5, col=c("red","green","blue","brown")[clust], xlab="", ylab="", xaxt="n", yaxt="n") text(x=-3.4, y=3.4, lab=round(co,8), adj=0, cex=.8) } # Something happens between correlation 0.6 and 0.7: # an abrupt switch from one geometric pattern to another. # One could think of the pattern for lower correlations # as slicing a pie, for higher correlations as slicing # a cigar. # # If in a practical situation the data are correlated # in a way that is conducive to multiple near-optimal patterns, # they can be found with multiple restarts, as we did with # the marketing data. 
This shows that these patterns are # indeed local minima of the kmeans criterion: co <- 0.648 # (choose near the switch of patterns for current x1 and xa) xx <- scale(cbind(x1,co*x1+sqrt(1-co^2)*xa)) for(irestart in 1:36) { clust <- k.means(xx, K=4)$cluster # our function plot(xx, xlim=xlim, ylim=ylim, pch=".", cex=0.5, col=c("red","green","blue","brown")[clust], xlab="", ylab="", xaxt="n", yaxt="n") } # # Q: What are the implications for statistical inference # for kmeans clustering? # Does bootstrap, for example, makes sense near correlation 0.6? # What is bootstrap going to tell you? # Let's try: pick a correlation near the switch point, and bootstrap. for(iboot in 1:36) { xb <- scale(xx[sample(N, replace=T),]) clust <- kmeans(xb, xb[1:4,])$cluster plot(xb, xlim=xlim, ylim=ylim, pch=".", cex=0.5, col=c("red","green","blue","brown")[clust], xlab="", ylab="", xaxt="n", yaxt="n") } # Indeed, bootstrap keeps flip-flopping between patterns, # with a strange attempt at a transition pattern at times. # Possible lesson: Yes, do bootstrap, but beware that the # multiplicity of solutions might be of a geometric nature. # # Come to think of it, we found multiple patterns in the # telecom marketing data also, just with multiple restarts! # The pattern had the colors blue and brown overlapping, # indicating that it needed a third dimension to accommodate # its four cluster centers: xlim <- ylim <- range(mktg.pca$scores[,1:2]) icrit <- 1 clust <- mktg.km.lst.red[[icrit]]$cluster pairs.plus(mktg.pca$scores[,1:4], pch=16, cex=.2, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[clust]) # It appears that blue and brown are best separated in the positive # diagonal direction of PCs 3 and 4, a fact that is somewhat obscured # by the green and red clusters that are smudged over everything. # #---------------- # # Summary points: # # 1) Patterns in k-means clusterings have to do with the # geometry of the point scatter. If the point scatter is just a # multivariate normal blob, the geometry is fully described by the # covariance structure. One should therefore not be surprised if # there is a connection between PCA and k-means partitions. # Both respond to the geometry of the best fitting ellipsoid # round the data. # # 2) For normal (Gaussian) point scatters it is quite plausible # that the k-means solutions are fully determined by the # covariance structure. A little more surprising is that # this seems to hold for such instances as the marketing # data as well, where most of the variables are discrete. # Who would have expected a pie-slicing pattern in the # first two PCs very similar to the bivariate normal simulation # with correlations below 0.6. # 3) Note that we said "covariance structure". In fact, # k-means is a metric method in that it relies on # Euclidean distances. Depending on how the variables # are scaled, one can get very different k-means results. # The issue should become clear by imagining data with two # variables whose standard deviations are 1 and 100, respectively. # K-means will consider this as a cigar and slice it on the # second variable, never mind the correlation. If one wants the # correlation structure to drive the clustering, one should # scale all variables to unit variance, so covariance and # correlation are identical. In general, one should make # sure the variables are on comparable scales, and the # conventional 'equalization' of scales is by standardization # of the variables (z-scores: subtract from each variable its mean, # divide it by its stdev). 
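# A tiny illustration of the scaling point in 3) above, on made-up data
# (not one of the course datasets): two independent normal variables with
# standard deviations 1 and 100.
set.seed(1)
toy <- cbind(x1=rnorm(2000, sd=1), x2=rnorm(2000, sd=100))
toy.km.raw <- kmeans(toy,        centers=4, nstart=20)   # unscaled
toy.km.std <- kmeans(scale(toy), centers=4, nstart=20)   # standardized
par(mfrow=c(1,2), mgp=c(1.8,.5,0), mar=c(3,3,2,1))
plot(toy, pch=".", col=c("red","green","blue","brown")[toy.km.raw$cluster],
     main="raw scales: slices essentially along x2")
plot(scale(toy), pch=".", col=c("red","green","blue","brown")[toy.km.std$cluster],
     main="standardized: symmetric 'pie' slicing")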
#---------------- # # Interpretation of cluster centers: # # Like regression coefficients and PCA loadings, # the kmeans centers are a major source for interpretation. # In what follows we plot cluster centers as profiles in # what is known as a "parallel coordinate plot": windows(wid=16, hei=12) par(mgp=c(1.8,.5,0), mar=c(2,2,1,1), mfcol=c(5,2)) for(icrit in 1:10) { # number of pattern; recall: icrit=5 ~ mktg[,9] crit <- mktg.km.crit.uniq[icrit] mktg.ctrs <- mktg.km.lst.red[[icrit]]$centers round(mktg.ctrs, 2) plot(x=c(.5,8.5), y=range(mktg.ctrs), type="n", xaxt="n", xlab=NA, ylab=NA) axis(side=1, at=1:ncol(mktg.ctrs), lab=colnames(mktg.ctrs)) lines(x=c(rbind(1:ncol(mktg.ctrs),1:ncol(mktg.ctrs),NA)), y=rep(c(par()$usr[3:4],NA),ncol(mktg.ctrs)), col="gray95") lines(par()$usr[1:2], c(0,0), col="gray95") for(i in 1:nrow(mktg.ctrs)) lines(mktg.ctrs[i,], col=c("red","green","blue","brown")[i]) text(x=par()$usr[2], y=par()$usr[3], lab=icrit, adj=c(1.5,-0.4)) text(x=par()$usr[2], y=par()$usr[4], lab=round(crit,3), adj=c(1.05,1.2)) } # The center coordinates indicate 'where' the cluster sits. # Judge their meaning in terms of: # - the first three variables: call volume # - the next three variables: dummies for needs # - age # - education # Which of these sets of centers is most interpretable? # Recall that we have natural groups of clusterings: # 1 by itself, 2-5 pretty much like mktg[,9], 6-8 also similar, # 9-10 different from all else. # (clustering 1 is interesting, and new to us: # red group by itself makes high volums, all else low vol; # blue high on business; green old age with low vol and no features) #---------------- # # Summary points about K-means clustering: # # - K-means is more often than not used to segment # data that do not fall into natural groups. # Segmenting of ungrouped data can be meaningful. # # - K-means arranges cluster centers in geometric patterns; # these patterns are local minima of the K-means criterion. # # - Which pattern solution one prefers depends not only on # the K-means criterion value but also on interpretability. # # - Flip-flopping between patterns can be an impediment # to statistical inference: one could conceive of # confidence regions for the cluster centers given # a fixed pattern, but the presence of multiple patterns # sabotages this idea. # # - PCA and K-means are naturally connected. In fact, there # exists theory according to which the K-means centers of # multivariate normally distributed data fall in PC # subspaces of large eigenvalues (Tarpey & Flury). # Therefore, plotting the first few PC projections with # color coded clusters may reveal the K-means patterns. # #---------------- # # Open questions: # # We have not addressed the choice of 'K', the number of # clusters. There exists a literature on this topic, # repleat with quasi-F-tests and the like. # I find these formal methods mostly irrelevant or even # misleading. If there are natural groups, you should # try to see them in a dimension reduction, such as a # scatterplot matrix of largest few principal component # projections. If this doesn't work try a non-linear # dimension reduction method such as multidimensional # scaling (MDS). [Ask the instructor about MDS; he loves # to talk about it.] # In soft sciences such as marketing and # social sciences in general, there are rarely natural # clusters present, hence clustering is really segmentation, # and the question of 'correct number of clusters' does not # exist. 
If anything, one should wonder whether the chosen # number of clusters is compatible with the geometry of the # data cloud in p-space. For example, I find K=5 generally # not congenial for situations where the first two PCs # cover a majority of variance. K-means then carves an # ellipsoid in 2-D into 5 slices, which just doesn't seem # right because it violates the symmetry of an ellipsoid. # If the data projected onto the first two PCs looks not # ellipsoidal but more like a polygon with odd number of # corners, then slicing into 3 or 5 might be sensible. # # #---------------------------------------------------------------- # # * ANOTHER TYPE OF CLUSTERING: HIERARCHICAL METHODS # # A very different type of cluster analysis is based on the idea # of hierarchical trees. Their purpose is to find natural groups, # unlike K-means, which is more often used for segmentation, that is, # carving up data that don't have natural groups. # Hierarchical methods work in a bottom up fashion: # they recursively form partitions of the data by # joining groups that are near each other in some sense. # They are also called 'agglomerative' because they # recursively form unions of pairs of groups. # Here are the steps: # - The input is a distance or dissimilarity matrix of size NxN. # Each entry is a proximity measure for a pair of points. # - One initializes the algorithm with the finest possible # partition in which every data point is its own group. # - One then joins the nearest two points to form the first # non-trivial group. # - Next one computes a distance measure of this two-point group # from all other one-point groups. This way one obtains an # (N-1)x(N-1) distance matrix. # - Now repeat: join the two nearest elements, which may be joining # another two points into a two-point group, or the existing # two-point group with another point into a three-point group. # - Which ever way, one again computes a distance of the newly # merged group from all others, resulting in an (N-2)x(N-2) # distance matrix. # - Recurse... # The output of this algorithms is represented as a hierarchical # tree which depicts all mergers of groups. # # Here is an example: the (famous) Swiss Fertility data # str(swiss) pairs.plus(swiss) rownames(swiss) # (Background: Cases = districts of French speaking Switzerland # in the late 1800s, the days of industrialization. # Social scientists, demographers and historians are # interested in the dependence of fertility on # socio-economics in that period. plot(Education ~ Catholic, data=swiss, pch=16, xlim=c(-20,120), ylim=c(-10,65)) with(swiss, identify(Catholic, Education, rownames(swiss))) # Again, one should scale the data first, then... 
swiss.dist <- dist(scale(swiss)) # see: help(dist) swiss.hclust <- hclust(swiss.dist) # see: help(hclust) plot(swiss.hclust) # Choose arguments for a less cluttered views: plot(swiss.hclust, hang=-1, main="", sub="", ylab="", xlab="", cex=1) # # The function 'hclust()' offers seven methods that # differ in how they form the reduced distance matrix after # each merger step: hclust.methods <- c("ward", "single", "complete", "average", "mcquitty", "median", "centroid") swiss.hclust <- hclust(swiss.dist, method="single") # see: help(hclust) # We'll try them out before we give more explanations: windows(12,8) par(mfrow=c(3,3), mgp=c(1.8,.5,0), mar=c(2,3,2,1)) for(meth in hclust.methods) plot(hclust(swiss.dist, method=meth), hang=-1, xlab="", main=meth, cex=.45) # # Here is the explanation from 'help(hclust)': # - WARD''S MINIMUM VARIANCE method aims at finding compact, spherical # clusters. # - The COMPLETE LINKAGE method finds similar clusters. # - The SINGLE LINKAGE method (which is closely related to the minimal # spanning tree) adopts a 'friends of friends' clustering strategy. # - The other methods can be regarded as aiming for clusters with # characteristics somewhere between the single and complete link # methods. # Here are details for some of the methods: # for two existing clusters A and B, the methods compute the # distance d(A,B) as follows. # - SINGLE LINKAGE: the minimum distance between points in A and B # - COMPLETE LINKAGE: the maximum distance between points in A and B # - AVERAGE LINKAGE: the mean of distances between points in A and B # - WARD'S METHODS: the mean of the squared distances between all # points in the union of A and B # - MEDIAN LINKAGE: the median of distances between points in A and B # These methods, except for the last, have simple updating rules # that facilitate the recursion: # - SINGLE LINKAGE: d(AuB,C) = min(d(A,C),d(B,C)) # - COMPLETE LINKAGE: d(AuB,C) = max(d(A,C),d(B,C)) # - AVERAGE LINKAGE: d(AuB,C) = (N_A*d(A,C)+N_B*d(B,C))/(N_A+N_B) # - WARD'S METHODS: SS(A,B) := sum{i in A, j in B} dij^2 # S(A) := SS(A,A) # V(A) := SS(A)/N_A/(N_A-1)/2 # d(A,B) := V(AuB) # Note: # SS(AuB) = SS(A)+SS(B)+SS(A,B) # hence one keeps track of all SS(A)'s and # all SS(A,B)'s and collects them in a diagonal # Which method gives the most useful results depends on the data # and on the purpose. # # As mentioned, a major purpose of these methods is to # find natural groups in multivariate data. Whether the # 'clusters' are natural or not should be checked graphically. # To this end, one needs a method for cutting a clustering tree # at a given level. The function 'cutree()' serves for this purpose: # it allows us to cut at a given 'height' of the tree # (on the vertical axis of the tree plot), # or for a given number of clusters. # We cut the 'swiss' fertility data into 6 clusters and # check what the various methods produce. 
# First we plot the tree once more: imeth <- 2 meth <- c("ward", "single", "complete", "average", "mcquitty", "median", "centroid")[imeth] # play with method swiss.hclust <- hclust(swiss.dist, method=meth) windows(8,8) plot(swiss.hclust, hang=-1, main=meth, xlab="", ylab="", sub="", cex=.75) # Cut the tree so we obtain six groups: swiss.grps <- cutree(swiss.hclust, k=6) # Tabulate so we know the sizes of the groups: table(swiss.grps) # Matrix-plot the data with color-coding of the groups: swiss.grps <- order(order(table(swiss.grps)))[swiss.grps] windows(10,10) clrs <- c("black","brown","green","blue","red","orange","magenta") pairs.plus(swiss, col=clrs[swiss.grps], main=meth, cex=.8) # (The first line is arcane: it orders the groups so their # number id is matched to colors: small groups in dark colors, # large ones in bright colors:) table(swiss.grps) # # Some comments on our experiences with these methods on this data: # - Ward's method does not work; there are no spherical clusters. # - 'Complete linkage' does a reasonable job: it isolates Geneva # and splits both catholics and protestants into two groups. # - 'Single linkage' does an extremely meaningful job: it # separates catholics and protestants in two groups and # isolates 3 singletons and one pair. # Single linkage is known to 'suffer' from 'chaining', that is, # clusters it creates can consist of long strands of points. # Maybe this isn't a real disadvantage. # Remaining mystery: what makes La Vallee so special?... ggobi #---------------- end of class # # Some artificial data with natural clusters, so we can find # out whether either K-means or hierarchical clustering finds them. # The following construction puts standard normal clusters at # 4 times the unit vectors (the vertices of the natural simplex): K <- 6 # number of natural clusters N <- K*100 # number of points per cluster = 100 p <- 6 # number of variables s <- 4 # cluster separation parameter: # center dists = s*sqrt(2)=5.66 clusdat <- matrix(rnorm(N*p), ncol=p) clusdat <- clusdat + s * ( ((row(clusdat)-1)%%K) == (col(clusdat)-1) ) pairs.plus(clusdat, cex=.3) # Make it clear that in each frame we are supposed to see two detached # clusters (top and right), and the four others plotted on top of each # other in the bottom left. # # - Does K-means find the clusters? clusdat.km <- k.means(clusdat, K=6) pairs.plus(clusdat, col=clrs[clusdat.km$cluster], cex=.3) # Great! K-means finds them, if we give it the proper cluster number # What if we give it the wrong number? clusdat.km <- k.means(clusdat, K=2) pairs.plus(clusdat, col=clrs[3+clusdat.km$cluster], cex=.3) # K=2: Lumps clusters together. # What about too many clusters? clusdat.km <- k.means(clusdat, K=7) pairs.plus(clusdat, col=clrs[clusdat.km$cluster], cex=.3) # some clusters are merged, others split... # K-means seems to be sensitive to choice of K. # # - Do hierarchical methods find the clusters? imeth <- 4 meth <- c("ward", "single", "complete", "average", "mcquitty", "median", "centroid")[imeth] # play with method clusdat.hc <- hclust(dist(clusdat), method=meth) #windows(8,8) plot(clusdat.hc, hang=-1, main=meth, xlab="", ylab="", sub="") # Indeed, they all do. Seems the problem was too easy. # We could reduce the separation of the centers, # which was large: 4*sqrt(2)=5.66 # At least there is comfort in the fact that clustering methods # do find the obvious natural clusters. 
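# A quick numerical check of cluster recovery, continuing with 'clusdat' and
# 'clusdat.hc' from above: cross-tabulate the tree cut at k=6 against the
# true memberships implied by the construction (row i belongs to cluster
# ((i-1) %% K) + 1).
clusdat.true <- ((seq_len(nrow(clusdat)) - 1) %% K) + 1
clusdat.cut  <- cutree(clusdat.hc, k=6)
table(true=clusdat.true, found=clusdat.cut)
# Up to a relabeling of the 'found' clusters, each row of this table should
# concentrate in a single column if the method recovered the clusters.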
#
#
#================================================================

KERNELIZING INTRO

* Purpose: Background for
  - smoothing splines and their generalizations
  - generalized quadratic penalty methods
  Source: Notes by Xin Lu

* We assume that you know some fundamentals of Hilbert spaces:
  They are inner product spaces whose topology
  (1) is generated by the associated norm and
  (2) is complete, i.e., all Cauchy sequences have limits.

* Fundamental notion: Reproducing Kernel Hilbert Space (RKHS)

  - Definition: A 'RKHS' is a Hilbert space of functions on a domain X such that
    function evaluations are bounded (continuous) linear functionals, i.e.:
        |f(x)| <= C(x)*||f||
    [Pedantically: Define a linear functional L(f) := f(x) for a fixed but
     arbitrary location x in the domain X.  The linear functional is bounded
     (hence norm-continuous) iff there exists a constant C such that
     |L(f)| <= C*||f|| for all f in the Hilbert space of functions.]

  - Fact: L2 spaces cannot be RKHSs if there exist non-empty sets of measure zero.
    Why?  ...  Function evaluation is not even well-defined.
    (L2 spaces can be RKHSs, e.g., if the domain is discrete and
     all atomic events have non-zero probability.)

  - Fact: RKHS norm-convergence implies pointwise convergence:
        ||fn - f|| --> 0   ==>   fn(x) --> f(x) for all x
    Why?  Because |fn(x) - f(x)| <= C(x) * ||fn - f||.

  - Definition: Let H be a Hilbert space of functions on a domain X.
    A 'reproducing kernel of H' is a function k(x,y) on X x X that satisfies
      (1) k(x,.) is in H for all x in X
      (2) <f, k(x,.)> = f(x)
    . Property (2) is called the 'reproducing' property.
    . k(x,y) is necessarily symmetric because <.,.> is symmetric:
        k(y,x) = <k(y,.), k(x,.)> = <k(x,.), k(y,.)> = k(x,y)

  - A Hilbert space of functions H is a RKHS iff it has a reproducing kernel.
    '=>': If H is a RKHS, then by the Riesz Representation Theorem there exists
          for every x a function k(x,.) in H that 'represents' the linear
          functional L(f) = f(x):  L(f) = <f, k(x,.)> = f(x).
          So k(x,y) satisfies (1) and (2).
    '<=': If H is a Hilbert space of functions and there exists a reproducing
          kernel k(x,y) in it, then evaluation functionals are bounded:
          |f(x)| = |<f, k(x,.)>| <= ||f||*||k(x,.)|| by the C-S inequality.
          Hence ||k(x,.)|| < Inf is a bounding constant C(x) because k(x,.) is in H.

  - Definition: A 'kernel' k(x,y) is a function on X x X for which there exists a
    (not necessarily reproducing) Hilbert space F (called the 'feature space')
    and a 'feature map' Phi: X -> F such that:
        k(x,y) = <Phi(x), Phi(y)>_F
    . Simple example: Let f1(x),...,fK(x) be real-valued functions on X.
      Let F = Reals^K with the Euclidean inner product <.,.>.
      Define Phi(x) = (f1(x),...,fK(x)),  X --> F.
      Then k(x,y) = <Phi(x), Phi(y)> = sum_k fk(x)*fk(y) defines a kernel.
    . More complex example: Let L2(Z,mu) be an L2 space on a domain Z with
      measure mu(dz).  Let h(x,z) be such that h(x,.) is in L2(Z,mu) for all x in X.
      Then x --> h(x,.),  X --> L2(Z,mu),  defines a feature map,
      and the associated kernel is
        k(x,y) = Integral h(x,z)*h(y,z) mu(dz)
    . Fact: Every reproducing kernel is a kernel.
      Proof: Set Phi(x) = k(x,.) and F = H, so that k(x,y) = <k(x,.), k(y,.)>_H.
    . We may also require that Phi(x1),...,Phi(xn) are lin.indep.
      for any distinct x1,...,xn and arbitrary n.
      This would be satisfied, e.g., if X=Reals and Phi(x)=(1,x,x^2,x^3,...).

  - Definition: A 'positive definite function' k(x,y) is a function that is
    symmetric, k(x,y) = k(y,x), and for which all matrices
    K = (k(xi,xj))_{i,j=1,...n} are positive semi-definite
    for all n and all choices x1,...,xn:
        sum_ij ai aj k(xi,xj) >= 0   for all (a1,...,an).
    . Fact: Every kernel obtained from a feature map Phi(x) is a pos.def. function.
      Proof: sum_ij ai aj k(xi,xj) = sum_ij ai aj <Phi(xi), Phi(xj)>_F
                                   = < sum_i ai Phi(xi), sum_j aj Phi(xj) >_F
                                   = ( || sum_i ai Phi(xi) ||_F )^2  >= 0  always.

  - THEOREM (Moore-Aronszajn): For any positive definite function k(x,y)
    there exists a unique RKHS with reproducing kernel k(x,y).
    Fact: H = topological completion of span{k(x,.): x in X}
    'Proof': Define H_0 := span{k(x,.): x in X}
      For f,g in H_0 we have:
        f(.) = sum_i ai k(xi,.)
        g(.) = sum_j bj k(yj,.)
      where both {xi} and {yj} are finite sequences of elements in X.
      Def.:  <f,g> := sum_ij ai bj k(xi,yj)
      Fact:  <f,g> = sum_j bj f(yj) = sum_i ai g(xi)
      Issue: At this point we don't know whether this is a coherent definition.
             What if f1 = f2 has two different representations with different
             locations x1i and x2i and different coefficients a1i and a2i?
             Could it then be that <f1,g> =/= <f2,g> ?
             It is not possible.  Check this:
               (1) <f1,g> = sum_j bj f1(yj)   (See above Fact)
               (2) <f2,g> = sum_j bj f2(yj)   (    "     "   )
             If f1(x)=f2(x) for all x, then (1) = (2).  QED
      Fact: <k(x,.), k(y,.)> = k(x,y)                    (Set {xi}={x}, {yj}={y})
      Def.: ||f||^2 := <f,f> = sum_ij ai aj k(xi,xj)
      Fact: ||k(x,.)||^2 = k(x,x)
      Fact: ||f||^2 >= 0                                 (k(x,y) is pos.def.)
      Fact: |<f,g>| <= ||f||*||g||                       (C-S, from previous line)
      Fact: <f, k(x,.)> = f(x)                           (reproducing property)
            Proof: <f, k(x,.)> = sum_i ai <k(xi,.), k(x,.)> = sum_i ai k(x,xi) = f(x)
            (A HIGHER TRIVIALITY, isn't it?)
      Fact: |f(x)| <= ||k(x,.)||*||f||                   (from f(x) = <f, k(x,.)> and C-S)
            (thus evaluation functionals are bounded)
      Fact: ||f|| = 0    ==>  f(x)=0 for all x           (see previous Fact)
      Fact: ||fn|| --> 0 ==>  fn(x) --> 0 pointwise      (from boundedness of eval.)
      Completion of the space H_0:
        Let fn be a Cauchy sequence wrt the norm: ||fn-fm|| --> 0 as n,m-->Inf
        Then, for each x, fn(x) is a Cauchy sequence of real numbers
        (see previous Fact), hence has a limit:  fn(x) --> f(x) for some function f.
        Let H be the space obtained by including all limits of Cauchy sequences in H_0.
        It can be shown that the inner product can be uniquely extended to H:
          <f,g> := lim <fn,gn>
        More details are needed...

  - Thus we have come full circle:
    . RKHSs have reproducing kernels.
    . Reproducing kernels are kernels (are representable with feature maps Phi(x)).
    . Kernels are positive definite functions.
    . Every positive definite function defines a unique RKHS.

* The Freedom of Kernelizing:

  - The feature representation of kernels is a great help in constructing RKHSs.

  - Kernel algebra: positive semi-definite kernels!
    . k(x,y) = c > 0
    . k(x,y) = f(x)*f(y) for arbitrary f
    . k(x,y) = k1(x,y) + k2(x,y)
    . k(x,y) = k1(x,y) * k2(x,y)
    . k(x,y) = lim kn(x,y) pointwise
    are all kernels.  Only the product case requires a proof:
    Given two pos. semi-def. nxn matrices K1 and K2, let their eigen-decomps be
        K1 = sum_i lambda1i * u1i u1i^T
        K2 = sum_j lambda2j * u2j u2j^T
    We will use "." for the componentwise product of matrices and vectors:
        K1 . K2 = sum_ij lambda1i lambda2j * (u1i u1i^T) . (u2j u2j^T)
                = sum_ij lambda1i lambda2j * (u1i . u2j) (u1i . u2j)^T
    Because lambda1i, lambda2j >= 0 and each (u1i . u2j)(u1i . u2j)^T is a
    positive semi-def. matrix of rank <= 1, the sum is again positive semi-def.  QED
    [Matt says there is a shorter proof via covariance matrices of random vectors]

  - Derived kernel rules: For any kernel k(x,y),
    . k(x,y)^n for n>=0
    . exp(k(x,y))
    are also kernels.

  - Kernels on R^K:
    . k(x,y) = <x,y>                 Euclidean inner product kernel on R^K
    . <x,y>^2
    . exp(-||x-y||^2)                Standard Gaussian Kernel
    . exp(-||x-y||^2/(2*sigma^2))    Gaussian Kernel with bandwidth sigma

* From functional analysis to linear algebra:

  - The Kimeldorf-Wahba observation (early 1970s):
    Theorem: Let H_n := span{k(x1,.),...,k(xn,.)}.
    The orthogonal complement of H_n is  { g in H: g(x1)=...=g(xn)=0 }
    Proof: g _|_ H_n  <==>  <g, k(xi,.)> = 0 for i=1...n  <==>  g(xi) = 0 for i=1...n

  - Typical statistical regularization problems look like this:
    Given a loss function sum_i l(yi|f(xi)) and a RKHS penalty ||f||^2,
    solve the following problem:
    --------------------------------------------------------------
    |  argmin_{f in H} [ sum_i l(yi|f(xi)) + lambda*||f||^2 ]    |   (*)
    --------------------------------------------------------------

  - Theorem: A solution of (*) can always be found in H_n = span{k(x1,.),...,k(xn,.)}.
    Proof: Decompose a given f as follows:
             f = fn + g   where fn in H_n and g _|_ H_n.
           Then:  ||f||^2 = ||fn||^2 + ||g||^2
                  f(xi) = fn(xi) + g(xi) = fn(xi)
           Thus:  ||fn||^2 <= ||f||^2
                  sum_i l(yi|f(xi)) = sum_i l(yi|fn(xi))

  - Reduction of (*) to linear algebra/n-dim minimization:
    . Notations:  yvec := (y1,...,yn)^T            n-vector
                  fvec := (fn(x1),...,fn(xn))^T    n-vector
                  K    := (k(xi,xj))_{i,j=1...n}   nxn matrix
    . Represent fn(.) in terms of k(xi,.):
        fn(.) = sum_i bi * k(xi,.)
      More notation:  b := (b1,...,bn)   n-vector
      Using the above notations fvec and K we get:
      --------------
      | fvec = K b |
      --------------
    . Loss:  L(yvec|fvec) := sum_i l(yi|f(xi))
             ------------------------
    . Fact:  | ||fn||^2 = b^T K b  |
             ------------------------
      Proof: ||fn||^2 = <fn, fn> = sum_ij bi*bj*<k(xi,.), k(xj,.)>
                      = sum_ij bi*bj*k(xi,xj) = b^T K b
    . Reduced version of problem (*):
      ---------------------------------------------------
      |  argmin_b [ L(yvec|K b) + lambda * b^T K b ]    |   (**)
      ---------------------------------------------------
      Back in terms of fvec = K b:  Assume K^{-1} exists, so b = K^{-1} fvec.
      ------------------------------------------------------------------
      |  argmin_fvec [ L(yvec|fvec) + lambda * fvec^T K^{-1} fvec ]    |   (***)
      ------------------------------------------------------------------
    . Connection between penalty matrices and RKHS kernel:
      ------------------
      | Omega = K^{-1} |
      ------------------
    . Solve normal equations for LS:  L(yvec|fvec) = |yvec - fvec|^2
      -----------------------------------------
      | fvec = (I + lambda * Omega)^{-1} yvec |
      -----------------------------------------
      This is old hat, the well-known formula for the smoother matrix S:
        S = (I + lambda * Omega)^{-1}
    . Extrapolation to locations x other than the observed xi:
      --------------------------------
      | b = K^{-1} fvec = Omega fvec |
      |                              |
      | f(.) = sum_i bi * k(xi,.)    |
      --------------------------------
    . Direct solution for b:
        argmin_b [ |yvec - K b|^2 + lambda * b^T K b ]
      1/2 gradient wrt b:
        - K yvec + K^2 b + lambda * K b = 0                     (+)
      To cancel one of the common factors K, we must constrain to the
      orthogonal complement Null(K)^{perp} = Range(K).
      To this end let P_K be the orthogonal projection onto Range(K):
        - P_K yvec + K b + lambda b = 0
      --------------------------------------
      | b = (lambda * I + K)^{-1} P_K yvec |
      --------------------------------------
      The inverse is well-defined on Range(K).
      This particular 'b' is one solution.  However, if Null(K) =/= {0},
      then any element of Null(K) can be added to b and it is still a solution to (+).

  - Examples:

y <- boston[,"MEDV"]
x <- boston[,"LSTAT"]
N <- length(x)

    . Gaussian kernel smoothing on the real line

# Constructing a Gaussian kernel:
sigma <- sd(x)/1    # To be tuned...
K <- exp( -outer(x, x, FUN="-")^2 / (2*sigma^2) )
# Solving for the coefficients 'b':
lambda <- .001      # Also to be tuned...
b <- solve(lambda * diag(N) + K) %*% y
f <- K %*% b
# The result:
plot(x, y, pch=16, xlab="LSTAT", ylab="MEDV")
ord <- order(x)
lines(x[ord], f[ord], lwd=3, col="red")

      Note: Gaussian kernels require two tuning parameters,
      . sigma,  the width of the Gaussian kernel,
      . lambda, the strength of the penalty.
      [If you hear someone give a talk presenting a Gaussian kernel example,
       ask how they chose the kernel width AND the penalty strength!]
      Results are mildly satisfying, but what's the downward bend on the left?
      These choices of sigma and lambda took some experimenting.

---------------- End of class

    . Linear regression with a linear kernel:

# Constructing a linear kernel:  Try both kernels.
K <- 0 + outer(x, x)   # Linear regression through the origin
K <- 1 + outer(x, x)   # Linear regression with intercept
# Solving for the coefficients 'b':
lambda <- .01          # Play with lambda: Crank it up/down to 1E6 and 1E-6
b <- solve(lambda * diag(N) + K) %*% y
f <- K %*% b
# The result:
plot(x, y, pch=16, xlab="LSTAT", ylab="MEDV")
ord <- order(x)
lines(x[ord], f[ord], lwd=2, col="red")
# Why the difference between the two kernels?
# Does it matter what non-zero constant you add?
# How does the penalty strength matter?

    . Quadratic regression with a quadratic kernel:

# Constructing a quadratic kernel:
K <- (1 + outer(x, x))^2   # Quadratic regression with intercept
# Solving for the coefficients 'b':
lambda <- 10               # Play with lambda: Crank it up/down to 1E6 and 1E-6
b <- solve(lambda * diag(N) + K) %*% y
f <- K %*% b
# The result:
plot(x, y, pch=16, xlab="LSTAT", ylab="MEDV")
ord <- order(x)
lines(x[ord], f[ord], lwd=2, col="red")
# Why the difference between the two kernels?
# Does it matter what non-zero constant you add?
# How does the penalty strength matter?
# Not bad!  Compare with the LS fit:
lines(x[ord], fitted(lm(y[ord] ~ x[ord] + I(x[ord]^2))), lwd=2, col="green")

    . Irresponsible step: Assuming that Omega = K^{-1} exists.
      Can't K have zero eigenvalues?  Examine:
      An eigenvector u=(u1...uN)^T for eigenvalue 0 would mean  K u = 0.
      In components:  sum_j uj k(xi,xj) = 0  for i=1...n
      This means the function sum_j uj k(xj,.) is orthogonal to k(xi,.) for all i.
      But the function is also in the span of the functions k(xi,.),
      hence it is zero:  sum_j uj k(xj,.) = 0
      But this does not mean the coefficients uj are zero!
      Consider the linear kernel k(x,y)=x*y:
      All functions k(x,.) are linearly dependent.

* Sobolev RKHS:

  - Define squared norms and inner products directly:
      ||f||^2 := Integral_[0,1] (f^(m))^2 dx
      <f,g>   := Integral_[0,1] f^(m) * g^(m) dx
    where f^(m) is the m'th derivative of f.
    ||.|| is only a semi-norm, though, because there exist non-zero functions f
    with ||f||=0, namely, the polynomials up to degree m-1.

  - What is the connection with RKHS?
    . Consider the functions with m derivatives s.th. f^(m) in L2([0,1],dx)
      SATISFYING THE BOUNDARY CONDITIONS
        f(0) = f^(1)(0) = ... = f^(m-1)(0) = 0.
    . This is obviously an L2 space isomorphic to L2([0,1],dx).  Call it H_1.
    . Its evaluation functionals are bounded:
        f(x) = Integral_[0,x] (x-t)^{m-1} / (m-1)! * f^(m)(t) dt
      hence |f(x)| <= C(x) * ||f^(m)|| by C-S.
      [The first line stems from repeated partial integration that leads to
       the Taylor expansion with m'th order remainder term:
       while increasing derivatives of f, increase the anti-derivatives of (x-t).
       This choice will ensure that the boundary values of (x-t)^l*f^{(k)}(t)
       evaluated at t=0 and t=x vanish: at t=0 by the assumed boundary conditions,
       at t=x because (x-t)^l vanishes for t=x.]
    . It follows that H_1 is a RKHS.
  - Heuristic observation: The integral above can be written as
        f(x) = Integral_[0,1] (x-t)_{+}^{m-1} / (m-1)! * f^(m)(t) dt
    Note the truncated spline functions:
        G(x,t) := (x-t)_{+}^{m-1} / (m-1)!                      (*)
        f(x) = Integral_[0,1] G(x,t) * f^(m)(t) dt              (**)

  - What is the kernel?
    A:  k(x,y) = Integral_[0,1] G(x,t) G(y,t) dt                (***)
    To prove: (1) k(x,.) is in H_1
              (2) <f, k(x,.)> = f(x) for f in H_1
    (1): Is the m'th derivative of k(x,.) in L2([0,1],dt)?
         Take m derivatives of y-->k(x,y) by taking m derivatives of y-->G(y,t)
         inside (***), which ends up being the Dirac one-point mass at t=y.
         This is heuristic, but one can also just reverse the partial integrations
         to get here.  Hence the m'th derivative in the second argument is:
             k(x,.)^{(m)} = G(x,.)
         This is in L2([0,1],dx).
    (2): Reproducing property:
             <k(x,.), f> = Integral_[0,1] k(x,t)^{(m)} * f^(m)(t) dt
                         = Integral_[0,1] G(x,t) * f^(m)(t) dt
                         = f(x)    by (**) above.

  - But how does this add up to a full space of fits w/o boundary conditions?
    . Expand the space H_1 by adding polynomials up to degree m-1.
      Call their space H_0.  They get zero penalty.
      So we have a mix of space-restricted LS and penalized LS:
        f = f0 + f1
      where f0 is a polynomial of degree m-1 and f1 is in the RKHS.
    . For computations you need to re-derive the normal equations for the
      mixed case that includes a space-restricted LS-component f0 and a
      penalized component f1.

* KERNEL PCA:

  - Kernel PCA is just an eigendecomposition, with the inner-product matrix of a
    multivariate dataset replaced by a kernel matrix thereof.
  - Classical PCA is obtained from the kernel k(x,y) = x^T y
    where x and y are p-vectors of variables.
    (A small sanity check of this claim follows the quadratic-kernel example below.)
  - You may obtain 'quadratic PCA' by using a kernel of the form
    k(x,y) = (c + x^T y)^2, as in the quadratic-kernel example below.

#----------------
# Infrastructure:
norm <- function(x) sqrt(sum(x^2))                   # Euclidean norm
normalize <- function(x) x/sqrt(sum(x^2))            # For mapping points to unit sphere
to.01 <- function(x) (x-min(x))/(max(x)-min(x))      # For mapping coords to rgb colors
dist.mat <- function(X) as.matrix(dist(X))           # Convert the distance structure to a matrix
kNN.crit <- function(dists.base,dists.rel,k=10) {    # Compare kNN neighborhoods of two dist matrices
  qtile <- k/ncol(dists.base)
  kNN.base <- apply(dists.base, 2, function(dcol) as.numeric(dcol <= quantile(dcol, qtile)))
  kNN.rel  <- apply(dists.rel,  2, function(dcol) as.numeric(dcol <= quantile(dcol, qtile)))
  apply(pmin(kNN.base, kNN.rel), 2, sum) / apply(kNN.base, 2, sum)
}
#----------------
#
# Two artificial data examples:
# * Circular data, equispaced, but 1-frac removed:
N <- 1000;  frac <- .5
tau <- sample(seq(0,2*pi, length=N+1)[-1][1:(N*frac)])   # Scrambled order
X <- cbind(cos(tau), sin(tau))
clrs <- rgb(red=to.01(X[,1]), green=to.01(X[,2]), blue=0)   # Colors
dim(X)
# Spacing, relevant for sigma:
2*pi/N
# * Spherical data, random, but 1-frac cap removed on last coordinate:
N <- 1000;  p <- 3;  frac <- 1
tmp <- t(apply(matrix(rnorm(N*p), ncol=p), 1, normalize))
X <- tmp[tmp[,p]<=quantile(tmp[,p], frac),]
clrs <- rgb(red=to.01(X[,p]), green=0, blue=0)   # Colors
dim(X)
# Plot original data:
windows()
lims <- range(X);  par(mfrow=c(1,1))
plot(as.data.frame(X), pch=16, cex=.8, col=clrs, xlim=lims, ylim=lims)
#----------------
# Kernels:
# * QUADRATIC KERNEL:
K <- (1000+tcrossprod(X))^2
# Then execute the code below headed "EIGENANALYSIS..." and "PLOTS..."
# Lessons:
# - The 1st eigenvector is a constant accounting for location.
# - The 2nd and 3rd eigenvectors reproduce the circle quite well.
# - Correspondingly there is one large eigenvalue for the constant
#   and two identical eigenvalues for the sine and cosine motion.
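# A small sanity check of the claim above that classical PCA corresponds to the
# linear kernel k(x,y) = x^T y (a sketch on the current data matrix X; signs and,
# in case of tied eigenvalues, rotations of individual vectors may differ):
Xc <- scale(X, center=TRUE, scale=FALSE)   # center the variables
K.lin <- tcrossprod(Xc)                    # N x N linear kernel matrix
K.lin.eig <- eigen(K.lin)
pc.scores <- prcomp(X, center=TRUE, scale.=FALSE)$x
round(cor(K.lin.eig$vectors[,1:2], pc.scores[,1:2]), 3)
# The leading eigenvectors of the linear kernel matrix should span the same
# directions as the classical PC scores (correlations near +-1 on the 'diagonal').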
# * GAUSSIAN KERNEL: dists <- dist.mat(X) # these are raw, NOT squared distances # What is the max of the m*N smallest distances? Relevant for kernel width 'sig': m <- 3; quantile(x=dists[col(dists) < row(dists)], prob=(m*N)/choose(N,2)) # Kernel width, must be chosen every time: sig <- 10 K <- exp(-dists^2/sig^2) # round(K[,1:6], 3) # checking bandedness, e.g. # Lessons? #---------------- # EIGENANALYSIS of kernel: K.eig <- eigen(K); K.eig$vectors <- t(t(K.eig$vectors)*sqrt(pmax(0,K.eig$values))) colnames(K.eig$vectors) <- paste("Evec",1:ncol(K.eig$vectors)) # PLOTS of a few eigen coordinates: rk <- 1:4 # embedding dimensions (ranks) shown lims <- range(K.eig$vectors[,rk]) lims <- range(K.eig$vectors[,rk[-1]]) pairs(K.eig$vectors[,rk], xlim=lims, ylim=lims, gap=0, oma=rep(2,4), col=clrs, pch=16, cex=.5) #pch=".") # EIGENVALUES, up to 20 largest: eval.ord <- 1:min(N,20) par(mfrow=c(3,1), mgp=c(1.8,.5,0), mar=c(3,3,1,1)) plot(x=eval.ord, y=K.eig$values[eval.ord], type="b", pch=16) # raw scale plot(x=eval.ord, y=sqrt(K.eig$values[eval.ord]), type="b", pch=16) # root scale plot(x=eval.ord, y=log(K.eig$values[eval.ord]), type="b", pch=16) # log scale round(K.eig$values[eval.ord], 8) # META-CRITERION of kNN overlap: first is base, second is relative to base rk <- 1:2 # dimensions used in kernel embedding mean(kNN.crit(dist.mat(X), dist.mat(K.eig$vectors[,rk]), k=10)) # recall mean(kNN.crit(dist.mat(K.eig$vectors[,rk]), dist.mat(X), k=10)) # precision # PLOTS of eigen coordinates against curve parameter tau for curve data such as circle: par(mfrow=c(2,2), mgp=c(1.8,.5,0), mar=c(3,3,1,1)) for(j in 1:4) plot(tau, K.eig$vectors[,j], pch=16, type="p", ylim=lims, ylab=paste("Eigenvector",j)) # PLOTS of eigen coordinates against original coordinates: par(mfrow=c(2,2), mgp=c(1.8,.5,0), mar=c(3,3,1,1)) plot(X[,1], K.eig$vectors[,1], pch=16, type="p") plot(X[,2], K.eig$vectors[,1], pch=16, type="p") plot(X[,1], K.eig$vectors[,2], pch=16, type="p") plot(X[,2], K.eig$vectors[,2], pch=16, type="p") ----------------------------------------------------------------
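#----------------
# A remark on centering: the eigenanalyses above were run on the raw kernel
# matrix, so one eigenvector gets 'used up' for location (the constant found
# with the quadratic kernel).  Standard kernel PCA instead double-centers the
# kernel matrix, which amounts to centering the implicit feature vectors
# Phi(x_i).  A minimal sketch of that double-centering:
center.kernel <- function(K) {
  N <- nrow(K)
  J <- matrix(1/N, N, N)                   # averaging operator
  K - J %*% K - K %*% J + J %*% K %*% J    # (I - J) K (I - J)
}
K.c <- center.kernel(K)                    # then rerun "EIGENANALYSIS..." on K.c
round(eigen(K.c)$values[1:6], 6)           # the constant direction is now gone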