################################################################ # # PCA (PRINCIPAL COMPONENT ANALYSIS) FOR DIMENSION REDUCTION # # PCA, unlike regression, does not single out a response variable. # Instead, PCA is a symmetric treatment of all variables one # decides to include. # Q: If there is no response to approximate (whose approximation # can be optimized), what could PCA try to achieve? # A: PCA tries to find linear combinations of the variables (all # of them) that have large variance. # However, this is an ill-defined goal because variance can # always be made to go to infinity by letting any coefficient # go to infinity. # ==> We need a constraint on the coefficients. # How would you constrain the coefficients? # (We assume you don't know PCA yet.) # Conventional answer: Constrain the Euclidean norm of the coeffs. # # Hence the optimization problem is: # # Var(X b) = max subject to |b|^2=1 # # Or, equivalently: # # Var(X b) # -------- = max_b (or: ... = stationary in b) # |b|^2 # # Assuming the columns of the data matrix X have been centered, # we have: # # Var(X b) = b' C b, # # where C = X'X/(N-1) is the matrix of covariance estimates from X. # # Obtain the stationarity conditions: # # d/db [ (b' C b) / |b|^2 ] = 0 # # ==> C b = lambda b, where lambda = (b' C b) / |b|^2 # # That is, the solutions are the eigenvectors of the covariance matrix, # and the eigenvalues are the variances of the lin.combs. defined by # the unit eigenvector. # # Keep in mind the interpretation of b' C b: # b' C b = Var(X b) = estimated variance of the lin.comb. X b # # Toy Example: # Assume X consists of two columns, both with m=0, sd=1 and cov = rho. # Q: What are the (2-D) principal components? # A: # # Think of a multivariate dataset as points in p-dim space. # If the variables are highly correlated, the point cloud # will be rather flat, that is, intrinsically approximately of # fewer dimensions than 'p'. PCA dimension reduction has # the goal of finding the few essential linear dimensions, # essentially by constructing a COORDINATE SYSTEM in p-space # whose 1st coordinate has the largest variance, # whose 2nd coordinate has the second larges variance (orth. to the 1st), # ... # # Remaining issue: Units of the columns of X. # If the units of column 1 are, say, miles, and we change them # to inches, we get much larger numbers with "much larger variance". # Q: How do we give each variable an equal say, independently of units? # A: By standardizing the columns to unit variance. # In other words, we don't perform an eigen analysis of the covariance # but of the correlation matrix of a data matrix! # (Exception: When all variables have the same units, # one might want to use the cov matrix.) # # To solve the PCA problem in practice, one simply performs an # eigen decomposition of the correlation matrix: eigen(cor(boston)) # Alternatively, there exists a PCA routine in the R package mva. # Here, however, we will use the following routine which has some nice # labeling of the results as well as some other niceties: pca <- function(x, scale=T, center=T, proj=T) { nobs <- nrow(x); nvars <- ncol(x) x <- scale(x, scale=scale, center=center) s <- svd(x/sqrt(nobs-1)) # svd() instead of eigen() pca.lst <- list(Evals=s$d^2, Loadings=t(t(s$v)*s$d)) if(proj) pca.lst$Projections <- t(t(s$u)*s$d)*sqrt(nobs-1) names(pca.lst$Evals) <- paste("Eval ",1:nvars,sep="") dimnames(pca.lst$Loadings) <- list(colnames(x),paste("Load ",1:nvars,sep="")) if(proj) dimnames(pca.lst$Projections) <- list(rownames(x),paste("Proj ",1:nvars,sep="")) for(j in 1:nvars) if(sum(pca.lst$l[,j]) < 0) # Sign change for interpretability { pca.lst$Loadings[,j] <- -pca.lst$Loadings[,j]; if(proj) pca.lst$Projections[,j] <- -pca.lst$Projections[,j] } pca.lst } # The function returns a list with a these components: # - e'values or principal variances that describe the # 'fatness' of the data along each PC direction # described by PC coefficients; # - a matrix of PC coefficients, also called 'loadings'; # the columns are coefficient vectors that define linear # combinations; the columns are normalized so their # squared lengths are the same as their respective # eigenvalues or variances (this is a fixed convention); # - a matrix of the same size as the data matrix containing # the data represented in the PC coordinate system; # the PC coordinate system is orthogonal and data # are uncorrelated in it, but the sd's should be the roots # of the e'values. # DATA EXAMPLE: Marketing survey data url.buja <- "http://stat.wharton.upenn.edu/~buja/STAT-541/" mktg <- read.table(header=T, file=paste(url.buja,"mktg.dat",sep="")) dim(mktg) names(mktg) pairs.plus(mktg, gap=0, cex=.1) # Reminder: pairs.plus() is from pairs.plus(mktg, gap=0, pch=".") # the above website. # The first three variables are quantitative, # the next three binary, # the last three multi-categorical. # In what follows we should only use the first 8 variables because # the last was generated from the others with k-means clustering. # PCA: mktg.pca <- pca(mktg[,1:8]) names(mktg.pca) # Here is how to use the output: # # 1) Eigenvalue Profile: length(mktg.pca$Eval) plot(mktg.pca$Eval, pch=16, type="b", ylab="Eigenvalues of Mktg Data", ylim=c(0,max(mktg.pca$Eval))) # One can also easily eyeball the values: round(mktg.pca$Eval, 3) eval 1 eval 2 eval 3 eval 4 eval 5 eval 6 eval 7 eval 8 2.640 1.689 1.034 0.812 0.673 0.527 0.422 0.203 # It shows that the largest principal components has a variance of 2.640, # the second largest a variance of 1.689,... # The sum of the variances equals the number of the variables: sum(mktg.pca$Eval) # (For those who know lin.alg.: an orthogonal coordinate transformation # preserves the trace of a linear map, here the correlation matrix # which has 'p' 1s in the diagonal.) # Therefore, the eigenvalue profile has a 'budget' of size 'p', which # it allocates to the PCs in descendig order. In the extreme case # of a totally uncorrelated set of input variables, all eigenvalues # are 1, and the eigenvalue profile is completely flat. # (Reason: the correlation matrix is the identity, which remains the # same under an orthogonal coordinate transformation.) # Eigenvalue profiles are often associated with cumulative variances: round(cumsum(mktg.pca$Eval)/sum(mktg.pca$Eval)*100, 1) eval 1 eval 2 eval 3 eval 4 eval 5 eval 6 eval 7 eval 8 33.0 54.1 67.0 77.2 85.6 92.2 97.5 100.0 # One says that the first PC accounts for 33% of the variance, # the first two PCs for 54.1% of the variances,... # # Since variance is squared standard deviation, the root of # the eigenvalues is a better reflection of the spread of the # data in the eigendirections: plot(sqrt(mktg.pca$Eval), pch=16, type="b", ylab="Root-Eigenvalues of Mktg Data", ylim=c(0,max(sqrt(mktg.pca$Eval)))) # So this look a little less dramatic: round(sqrt(mktg.pca$Eval), 2) eval 1 eval 2 eval 3 eval 4 eval 5 eval 6 eval 7 eval 8 1.62 1.30 1.02 0.90 0.82 0.73 0.65 0.45 # The ratio of largest to smallest standard deviation is only 1.62 / 0.45 [1] 3.6 # # 2) Eigenvectors: dim(mktg.pca$Load) round(mktg.pca$Load, 3) load 1 load 2 load 3 load 4 load 5 load 6 load 7 load 8 in. 0.713 0.429 -0.090 -0.003 0.133 0.162 -0.503 0.053 out 0.850 0.337 -0.102 0.001 0.020 -0.084 0.179 -0.337 rev 0.812 0.378 -0.076 -0.053 -0.072 -0.120 0.291 0.292 bus 0.401 -0.404 0.457 0.497 0.451 -0.125 0.019 0.028 reach 0.473 -0.482 -0.039 0.424 -0.574 0.180 -0.043 -0.002 kids 0.364 -0.628 -0.342 -0.261 0.296 0.426 0.136 0.014 age -0.378 0.672 0.184 0.330 0.060 0.478 0.175 -0.004 edu 0.301 -0.088 0.805 -0.454 -0.161 0.147 -0.001 -0.018 # The idea is to interpret the coefficients based on their # signs and magnitudes: # - The first PC direction (column vector) is strongly # driven by the first three variables, those that measure # overall usage volumn. To a lesser degree, it also relies # on the needs variables 'bus', 'reach', 'kids', then on # inverse age, and finally on education. Thus the first PC # is a direction that measure overall 'interestingness to # the company': this PC is large if the usage volumn is large, # if there are needs, if the household is young and well- # educated, all of which makes a customer household interesting. # - The second PC direction is a contrast between usage volumn # on the one hand and the other variables on the other hand. # This PC is large if usage is high but all the qualitative # variables are uninteresting. # - I wouldn't interpret the 3rd through 7th PC; # they are close to 1 and to each other, # which makes them ill-determined. # (If two eigenvalues are identical, # they have infinitely many eigenvectors.) # - The smallest PC is largely a difference between 'out' and # 'rev', which makes sense: if after standardization the # difference 'out-rev' is small, it means out ~ rev, # which is confirmed by plot(scale(mktg[,c(2,3)]), pch=16); abline(a=0,b=1) # # 3) PC Projections: windows(10,10); par(mgp=c(1.8,.5,), mar=c(3,3,2,1)) xlim <- ylim <- range(mktg.pca$Proj) pairs.plus(mktg.pca$Proj, pch=".", cex.lab=.5, xlim=xlim, ylim=ylim, gap=.1) # It is mandatory that the limits be fixed for all frames so that # the relative dispersions are visible! The smallest principal # components should show their small spreads as reflected by their # small variances (=eigenvalues). # Default-scaling of the projections according to their ranges # is a flaw of many PCA projection plots even in publications! # [If the data had the shape of a pancake in p-space, then the # first two dimensions would show the pancake 'top-down'. # The remaining dimensions would show it 'over the edge'. # The mktg data here are not very 'flat', hence not very # pancake-shaped.] # The striations in some of the pictures should not worry us: # they correspond to layers formed by the discrete variables # 'bus', 'reach', 'kids', 'age', 'edu'. # An interesting use of PCA for this dataset is to show the # nature of the segmentation: pairs.plus(mktg.pca$Proj, pch=".", cex=1, xlim=xlim, ylim=ylim, col=c("red","green","blue","orange")[mktg[,"seg"]], gap=.1) # Here is an enlarged version of the largest two PCs for # interpreting the segments: par(mfrow=c(1,1), mgp=c(1.8,.5,0), mar=c(3,3,1,1)) plot.plus(mktg.pca$Proj[,1:2], pch=16, cex=.5, xlim=xlim, ylim=ylim, col=c("red","green","blue","orange")[mktg[,"seg"]]) # It shows that the 'kmeans' clustering algorith used to form the # segments does not find separated clusters. Most surprisingly, # it does not find the 'natural' clusters formed by the levels # of the binary variables 'bus', 'reach', 'kids'. Rather, it # roughly carves up the 'pie' in the two largest dimensions # found by PCA. # Similarly, one can color business use, kids, reach needs, # even discretized usage: windows(wid=12, hei=6) par(mfrow=c(2,4), mar=c(1,1,2,2), mgp=c(1.5,.5,0)) xlim <- ylim <- c(-5,5) for(i in names(mktg)[1:8]) plot.plus(mktg.pca$Proj[,1:2], pch=16, cex=.2, xlim=xlim, ylim=ylim, main=i, xaxt="n", yaxt="n", xlab="", ylab="", col=c("red","green")[(mktg[,i]>median(mktg[,i]))+1] ) # In view of the tilted color patterns one would like to rotate the # PCA projections shown here. Users of PCA and other 'factor analysis' # techniques indeed do apply so-called 'factor rotations' to enhance # interpretability of the loadings and projections. # Some of these rotation methods are known under names such as # 'Varimax' and 'Quartimax', which refer to optimization criteria # for rotation. # We will be less systematic here and do an ad hoc rotation of the # first two PCs so the color distributions 'look right': windows(wid=12, hei=6) par(mfrow=c(2,4), mar=c(1,1,2,2), mgp=c(1.5,.5,0)) xlim <- ylim <- c(-5,5) ang <- -pi/6 rot <- cbind(c( cos(ang), sin(ang)), # 2x2 rotation matrix c(-sin(ang), cos(ang)) ) for(i in names(mktg)[1:8]) plot.plus(mktg.pca$Proj[,1:2] %*% rot, pch=16, cex=.2, xlim=xlim, ylim=ylim, main=i, xaxt="n", yaxt="n", xlab="", ylab="", col=c("red","green")[(mktg[,i]>median(mktg[,i]))+1] ) # Pretty good! The three volume variables are separating left-right, # while the remaining qualitative variables are separating vertically. # (Note that after such rotation the hor and vert axes are not longer # uncorrelated!) # Remaining question: Do the rotated loadings confirm the pictures? scale(mktg.pca$Load[,1:2],center=F) %*% rot # (Careful: The loading/eigenvecs are by convention not normalized # but scaled proportional to the principal sd's! # For rotation they must be equalized in length.) # Indeed, the first lin. comb. is now mostly an average of in./out/rev, # while the second lin.comb. is mostly an average of the # qualitative variables with reversal of 'age', as is meaningful, # and only weak contribution by 'edu'. # # Summary of concept: Rotation of factors for interpretation. ################ # # PCA -- a higher principle for deriving the PCA criterion # # Most people are happy to understand PCA as optimization of # the variance of lin. combs. subject to a unit norm constraint: # Var(X b)/|b|^2 = Var(X b/|b|) # Also, they will be happy with the reasoning that the unit # should be the sd, that is, the variables should be standardized. # Still, is there no more principled approach to the foundations # of PCA? To dig deeper, it would help if we had time to introduce # other multivariate methods, such as canonical correlation analysis # (CCA) and generalized canonical analysis (GCA), but in the absence # let's introduce a general principle and illustrate it with PCA alone # even though it could be used to derive all other multivariate # methods based on eigen analyses. # The principle has two postulates as follows: # - Multivariate methods iys constrained optimization of variance # of lin. combs. # - The constraint can be derived by re-calculating the variance # under the assumption of absent correlations. For PCA, the # assumption is that ALL correlations are absent (whereas for # CCA and GCA it would be only a subset of correlations). # Executing this principle is as follows: # The criterion is Var(X b) = Var(x1 b1 + x2 b2 + ... + xp bp) # The constraint is the same variance but assuming Cov(xk,xk)=0: # Var(x1 b1 + x2 b2 + ... + xp bp) # = Var(x1) b1^2 + Var(x2) b2^2 + ... + Var(xp) bp^2 # Thus the Rayleigh quotient is: # # Var(x1 b1 + x2 b2 + ... + xp bp) # ------------------------------------------------ # Var(x1) b1^2 + Var(x2) b2^2 + ... + Var(xp) bp^2 # # Is optimizing this ratio the same as correlation-based PCA? # Indeed it is: If the variables are standardized to unit variance, # the denominator becomes the squared Euclidean length. # # We can't develop the power of this principle here, but at least # gives an indication that the denominator is also a variance, # and PCA is all about comparing two variances: # the observed one (numerator) and one calculated under a null hypothesis # of absent correlations (denominator). # In this sense, PCA is a way to detect the most egregious # deviations from the null assumption of absent correlations. ################ # # Another data example: Swiss fertility swiss.pca <- pca(swiss) # # 1) eigenvalue profile: plot(swiss.pca$Eval, pch=16, type="b", ylab="Eigenvalues of Mktg Data") round(swiss.pca$Eval, 3) eval 1 eval 2 eval 3 eval 4 eval 5 eval 6 3.200 1.188 0.848 0.439 0.205 0.121 round(cumsum(swiss.pca$Eval)/sum(swiss.pca$Eval)*100, 1) eval 1 eval 2 eval 3 eval 4 eval 5 eval 6 53.3 73.1 87.3 94.6 98.0 100.0 # The first PC accounts for an astounding 53.3% of the variance! # Given the profile plot, one could say the data are largely # one-dimensional. # # 2) eigenvectors/loadings: round(swiss.pca$Load, 3) load 1 load 2 load 3 load 4 load 5 load 6 Fertility 0.817 0.351 -0.160 -0.355 0.173 0.164 Agriculture 0.759 -0.449 0.035 0.426 0.170 0.107 Examination -0.912 0.136 -0.084 0.036 0.368 -0.078 Education -0.813 0.195 0.490 0.065 -0.032 0.237 Catholic 0.626 0.159 0.743 -0.066 0.083 -0.140 Infant.Mortality 0.268 0.884 -0.147 0.349 -0.047 -0.026 # The first PC is a contrast of Fertility, Agriculture, # Catholicism versus Examination and Education. # This essentially reflects the positive correlations # between Fertility, Agriculture, Catholicism and the # negative corrs with Examination and Education. # # 3) PC projections: windows() par(mgp=c(1.8,.5,0), mar=c(3,3,2,1)) xlim <- ylim <- 1.1*range(swiss.pca$Proj) pairs.plus(swiss.pca$Proj, pch=16, cex=.5, gap=0, xlim=xlim, ylim=ylim) # Here the most interesting plot is Projection 1 versus 3, which # reveals natural clustering. plot(swiss.pca$Proj[,c(1,3)], xlim=xlim, ylim=ylim, pch=16) identify(swiss.pca$Proj[,c(1,3)], labels=rownames(swiss), cex=.8) # This reveals the clusters as basically protestant and catholic, # but Geneva all on its own in the top left, with Rive Gauche # and Rive Droite nearby. ################ # # Inference for PCA: bootstrap and permutation tests # # 1) A crude null hypothesis is that there is no correlation # among the variables, implying that the eigenvalue profile # is flat. This can be easily simulated with permutations # of the columns. Here is a spaghetti band for the null # eigenvalues: plot(swiss.pca$Eval, pch=16, type="b", ylab="Eigenvalues of Mktg Data") for(i in 1:100) lines(pca(apply(swiss, 2, sample))$Eval, col="gray") points(swiss.pca$Eval, type="b") # The band is instructive because it reveals the natural chance # capitalization of the eigenvalues even in the total absence # of correlation in the underlying population. The randomness # in the finite sample introduces small spurious correlations # which PCA exploits. We see that a largest PC eigenvalue of # about 1.5 or even 1.8 would be entirely compatible with the # complete absence of correlations among the variables. # 2) Bootstrap: boot <- function(x) x[sample(1:nrow(x),repl=T),] plot(swiss.pca$Eval, pch=16, type="b", ylab="Eigenvalues of Mktg Data", ylim=c(0,4)) for(i in 1:100) lines(pca(boot(swiss))$Eval, col="gray") points(swiss.pca$Eval, type="b") # Unfortunately, the band cannot be expected to contain the true # eigenvalue profile, only the expected values of the eigenvalues # for this sample size. The problem is that the largest eigenvalues # are biased upward and the smallest biased downward due to chance # capitalization. # Think about it! The mere fact that we sort random entities # makes the largest biased upward, the smallest biased downward. ################################################################