################################################################ # # K-MEANS CLUSTER ANALYSIS # # The K-means clustering algorithm tries to divide the data # into a pre-specified number of groups (K) in such a way # that the groups are somehow homogeneous and separated # from each other if possible. # # The algorithm works by placing K tentative cluster centers # into data space. The algorithm then moves the centers # around till a certain criterion is minimized. # The criterion is the sum of squared distances to the # nearest cluster center: # # sum_{i=1..N} min_{j=1..K} || x_i - m_j ||^2 = min_{m_1,...,m_K} # # where m_1,..., m_K are the cluster centers in p-space # and x_1,..., x_N are the N data vectors/rows/cases/units/records. # Obviously, the k-means clustering algorithm assumes quantitative # variables. # About the algorithm: How are the centers m_j moved around? # # Here some heuristics: Each data row x_i "belongs" to one of the # m_j, namely, the one that is nearest to x_i. # Now turn things around: m_j has a collection of x_i's that # "belong" to it because m_j is the nearest center. # # Now hold the collection of these x_i's fixed and ask: where should # m_j be located to minimize the criterion? Answer: m_j should be # the mean of "its" x_i's. That's because we are using a sums of # square criterion! # # So let's move m_j to the mean of "its" x_i's, do it for all # centers m_j. Having done this, we should wonder whether the # assignments of x_i's to nearest m_j's might not have changed. # In fact, in will have changed, so we should do a re-assignement # of the x_i's. # # Now you see how the game repeats: # Given centers m_j, assign the x_i's to nearest m_j. # Replace the m_j's with the means of "their" x_i's. # Having new centers m_j, re-assign the x_i's to nearest m_j. # ... till convergence. # # Is k-means a "clustering" algorithm? Maybe it is better to think # of it as a "partioning" algorithm. It partitions the data into # as many groups as you tell it to. Let's not ask the question # of the "correct number of clusters" yet. In fact, k-means is # often applied when there are no clusters at all. In marketing, # for example, clustering really means "segmentation", that is, # carving up into homogeneous segments, not natural clusters. # There may exist better methods for finding natural clusters # than k-means. # # Example in R: # Function: 'kmeans()' # Data: telecom marketing # We use a random selection of K=4 rows as initialization of # the cluster centers. # As in PCA, one should standardize the variables unless they # have the same units: dat <- scale(mktg[,-9]) # '-9': Use only the real variables. N <- nrow(dat) p <- ncol(dat) K <- 4 # decide on 4 clusters/segments mktg.km <- kmeans(dat, iter.max=1000, centers = dat[sample(N, size=K),] ) # Note that the number of clusters is implicit in the dimensions of # 'centers'; no need for a special argument for K. # # Examine the returned data structure: names(mktg.km) # The following lists the sizes of the 'clusters': mktg.km$size # The cluster centers are rows of the following matrix: mktg.km$centers # The assignment of the rows of 'mktg' to the clusters is in: mktg.km$cluster # But this grouping variable should be redundant with the size # component: table(mktg.km$cluster); mktg.km$size # Indeed. # # In what follows we will use a repackaged function that # picks K random (!) rows as initializations: url.buja <- "http://www-stat.wharton.upenn.edu/~buja/STAT-541/" source(paste(url.buja,"segs.R",sep="")) # Its call is 'k.means(x, K=4)'. (Look at the content.) # It has functions also align multiple clusterings choosing labels as # best matched as possible to each other. # # We'll look at the clusters in the first four principal component # coordinates computed earlier: windows(10,10); par(mgp=c(1.8,.5,0), mar=c(3,3,2,1)) xlim <- ylim <- range(mktg.pca$Proj) pairs(mktg.pca$Proj[,1:4], pch=16, cex=.2, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[mktg.km$cluster]) # our clustering:^^^^^^^^^^^^^^^ # Now this looks different from the original clustering: windows(10,10); par(mgp=c(1.8,.5,0), mar=c(3,3,2,1)) pairs(mktg.pca$Proj[,1:4], pch=16, cex=.2, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[mktg[,9]]) # the original clustering:^^^^^^^^ # Now let us compare a sequence of clusterings (we now omit the axes): par(mfrow=c(4,4), mar=rep(.5,4)) for(i in 1:16) { clust <- k.means(scale(mktg[,-9]), K=4)$cluster # random restarts plot(mktg.pca$Proj[,1:2], pch=16, cex=.2, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n", xlab="", ylab="", col=c("red","green","blue","brown")[clust]) } # There is a distraction in that the clusters are not matched in # colors. This can be helped: our 'k.means()' function can be given a # clustering vector, and the newly created solution will be # immediately aligned with the given clustering. This is done by # giving 'k.means()' the argument 'K=clustering.to.be.aligned.with'. # In fact, the full name of this argument is 'K.or.align'. If this # argument is given, then the number of clusters K is inferred from it. # # Here are the original clustering with 15 randomly initialized # clusterings aligned with the original clustering: par(mfrow=c(4,4), mar=rep(.5,4)) plot(mktg.pca$Proj[,1:2], pch=16, cex=.2, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n", xlab="", ylab="", col=c("red","green","blue","brown")[mktg[,9]]) # orig. seg. for(i in 1:15) { # alignment below... clust <- k.means(dat, K.or.align=mktg[,9])$cluster # align clusters!!! plot(mktg.pca$Proj[,1:2], pch=16, cex=.3, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n", xlab="", ylab="", col=c("red","green","blue","brown")[clust]) } # So there seems to exist a degree of dependence on the # initialization! Comparing the 16 plots, however, it appears that # there are several clusterings that look identical. So maybe there # are only a few discrete local solutions for the kmeans criterion? # # If the k-means algorithm finds different solutions, how do they # perform in terms of the k-means criterion? Let's collect a # list of 1000 clusterings and analyze the criterion values: mktg.km.lst <- list() for(i in (length(mktg.km.lst)+1):1000) { if(i%%100==0) cat(i,"") # Intermediate messages mktg.km.lst[[i]] <- k.means(dat, K=mktg[,9]) } # (Among many calls to kmeans() it can happen that one of them # crashes; rerun, and it will continue from where it crashed.) # # Peel off the criterion values: mktg.km.crit <- sapply(mktg.km.lst, function(x) sum(x$withinss)) # (Some useful functions to spare you writing loops: # sapply(), lapply(), tapply().) # Look at them with a histogram that uses very fine bins so we # can almost see individual values: windows() hist(mktg.km.crit, breaks=10000, col="gray", xlim=c(9100,9700)) # There are discrete sets of values that appear many times over!! # In fact, there are only a couple dozen different values of the # criterion: length(unique(mktg.km.crit)) [1] 25 # Let's collect them, sorted from smallest (best) to largest (worst): mktg.km.crit.uniq <- sort(unique(mktg.km.crit)) # and tabulate them: table(mktg.km.crit) 9240.92733734854 9313.91364263915 9313.91740353386 9313.97250010717 277 114 34 80 9314.0971039678 9319.76071481234 9319.8238068561 9319.91076889382 116 172 31 1 9336.89070097278 9336.90739225535 9336.9346563692 9370.29454375176 48 15 25 6 9370.4148403249 9377.17397780075 9377.31085261402 9449.83056805507 5 17 1 11 9450.00997410455 9450.41265146235 9487.3014103005 9487.32581863884 8 11 8 6 9488.15924588206 9491.312016306 9530.79434513218 9530.79668858917 1 2 5 4 9569.88679967423 9577.3159323069 1 1 # Turns out some of the values are nearly identical, such as the # 2nd, 3rd and 4th. # Could the values correspond to different geometric patterns of the # cluster centers? How could we confirm this hunch? Let's plot # in a 4x4 matrix layout again, but for one particular value # of the criterion at a time. Execute the following repeatedly, # but pick different values from the vector 'mktg.km.crit.uniq': icrit <- 1 # try for 1, 2, 3, ...; crit vals sorted, best first! crit <- mktg.km.crit.uniq[icrit] par(mfrow=c(4,4), mar=rep(.5,4)) xlim <- ylim <- range(mktg.pca$Proj[,1:2]) for(i in 1:16) { clust <- mktg.km.lst[mktg.km.crit==crit][[i]]$cluster plot(mktg.pca$Proj[,1:2], pch=16, cex=.2, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[clust], xaxt="n", yaxt="n", xlab="", ylab="") }; cat("icrit =",icrit," crit =",crit,"\n") # Interestingly, icrit=1, with crit = 9240.927, produces a different # pattern compared to the original segmentation in column 9 ! # A pattern like column 9 is found for icrit=2 with crit = 9313.914, # which is only second best. How come the analysts who produced # the segmentation of column 9 did not hit on the "best" clustering # in terms of the criterion? They probably did but preferred the # "second best" pattern anyway, most likely due to interpretability. # (In a moment we will see that the column 9 pattern is actually # worse: it belongs to the 5th largest criterion value...) # # Next question: are the cluster assignments identical for all # clusterings with the same criterion value? So far the plots sure # look like it. In the following code we look at each criterion value # at a time, and for each value we check whether the cluster # memberships are identical. Realize that this exercise makes sense # only because we aligned the clusterings. Without alignment, # identical clusterings would differ by a random permutation of the # cluster numberings. for(icrit in 1:length(mktg.km.crit.uniq)) { crit <- mktg.km.crit.uniq[icrit] # unique sorted crit values sel <- mktg.km.crit==crit # the solutions with criterion==crit if(sum(sel)>1) { # nothing to compare if only one clustering clust1 <- mktg.km.lst[sel][[1]]$cluster # first = reference for(i in 2:sum(sel)) # compare the others with the reference if(any(clust1 != mktg.km.lst[sel][[i]]$cluster)) cat("differences found for",icrit,":", # show if discrepant sum(clust1 != mktg.km.lst[sel][[i]]$cluster),"\n") }} # No complaint, all are identical clusterings in terms of case # assignments! Hence we can limit ourselves to one clustering per # criterion value: mktg.km.lst.red <- list() # initialize a reduced list for(icrit in 1:length(mktg.km.crit.uniq)) { crit <- mktg.km.crit.uniq[icrit] # For this criterion value... sel <- mktg.km.crit==crit # pick its clusterings and ... mktg.km.lst.red[[icrit]] <- mktg.km.lst[sel][[1]] # pick the 1st } # Check: length 30 (can vary with simulation) length(mktg.km.lst.red) # We lost the information about the frequency with which each # clustering appeared in the 1000 random restarts, but this is not # essential for what follows. From now on we're working with this # reduced set that has redundant clusterings removed. Here is a plot # of the first 16 of the 26 clusterings in the first 2 PCs: par(mfrow=c(4,4), mar=rep(.5,4)) xlim <- ylim <- range(mktg.pca$Proj[,1:2]) for(i in 1:16) { crit <- mktg.km.crit.uniq[i] clust <- mktg.km.lst.red[[i]]$cluster plot(mktg.pca$Proj[,1:2], pch=16, cex=.2, xlim=xlim, ylim=ylim, xlab="", ylab="", xaxt="n", yaxt="n", col=c("red","green","blue","brown")[clust]) text(xlim[2], ylim[2], adj=c(1,1), lab=paste("crit:",round(crit,4))) text(xlim[2], ylim[1], adj=c(1,0), lab=paste("freq:",sum(mktg.km.crit==crit))) } # ^^^^^^^^^^^^^^^^^^^^^^^ # freq of this crit value in multiple restarts # # Well, the best solution with criterion ~ 9241 is numerically a # league better than the others, but next few patterns look very # similar to each other # # Let us now check which of the 26 clusterings has the same partitions # as the colum 9 segmentation by counting the mismatches in the # cluster numberings. Recall that this works because mktg[,9] is the # reference for alignment of the numberings: mktg.km.mis.9 <- sapply(mktg.km.lst.red, function(x) sum(x$cluster != mktg[,9])) names(mktg.km.mis.9) <- round(mktg.km.crit.uniq,4) # crit vals = names mktg.km.mis.9 # So, it is only the 5th that is identical to the column 9 clustering! # Clusterings 2, 3, 4 are close, though, expecially in view of N=1926. # Let us get a more complete picture by forming a table of mismatch # counts of all pairs of clusterings: mktg.km.mis <- matrix(0, length(mktg.km.crit.uniq), length(mktg.km.crit.uniq)) for(i in 1:length(mktg.km.crit.uniq)) for(j in 1:length(mktg.km.crit.uniq)) mktg.km.mis[i,j] <- sum(mktg.km.lst.red[[i]]$cluster != mktg.km.lst.red[[j]]$cluster) # We can make out patterns by going down the diagonal where we notice # square blocks of small numbers that indicate groups of clusterings # that are very close to each other. Here is the 13x13 square from # the top left: mktg.km.mis[1:13,1:13] [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [1,] 0 810 809 806 814 791 794 823 821 821 836 835 [2,] 810 0 3 4 10 104 105 492 490 494 604 615 [3,] 809 3 0 7 7 101 102 491 489 493 603 614 [4,] 806 4 7 0 14 108 109 491 489 493 607 618 [5,] 814 10 7 14 0 94 95 490 488 492 600 611 [6,] 791 104 101 108 94 0 6 429 433 437 546 557 [7,] 794 105 102 109 95 6 0 424 428 432 541 552 [8,] 823 492 491 491 490 429 424 0 6 10 207 218 [9,] 821 490 489 489 488 433 428 6 0 4 205 216 [10,] 821 494 493 493 492 437 432 10 4 0 202 213 [11,] 836 604 603 607 600 546 541 207 205 202 0 11 [12,] 835 615 614 618 611 557 552 218 216 213 11 0 #---------------- # We see that the first and best pattern is nothing like anything else. # The next 4 patterns form a block and are all close to each other. # Then comes a block of 3 patterns that are very close, # then another block of 3 patterns that are very close. # Did you notice what we did? We clustered the clustering patterns... # In light of this table it shouldn't be a surprise that clusterings # 2-5 looked about the same. Actually, even clusterings 6-9 have # small mismatch numbers with clusterings 2-5 percentagewise, # and even they look all alike. The next group of clusterings, 9-11, # look alike but different from everything else. # Summary points so far: # 1) K-means analysis has an issue (not a problem) of local minima, # and one needs methodology to get an overview of them: # collecting and analyzing many multiple restarts. # The analysis we used consisted of # a) listing and sorting the unique values of the criterion # b) checking that all clusterings for a given criterion value # are identical # c) plotting the clusterings of each criterion value as # colors in the first two principal components of the data # d) computing a mismatch matrix for all pairs of clusterings # e) comparing the mismatch matrix with the similarities among # the colored PC projection plots # 2) It seems the "best" clustering is not always the most # interpretable. At least that's what is suggested by the fact that # the column 9 segmentation that comes with the data is not # numerically optimal. Yet, the marketers who decided in its favor # most likely had interpretability in mind in their "suboptimal # choice". # # 3) For comparing clusterings, one needs to align the numberings # of clusters across clusterings. But for alignment, one needs a # reference clustering. For the mktg data we used the column 9 # clustering as reference, but in other situations one may first # have to play with clusterings before one picks a reference for # aligning the other clusterings. ################ # # We continue with a systematic simulation study of the geometry of # kmeans clusterings and their (in)stabilities. The example above # gave us the heuristic idea that the action of k-means is in the # largest principal component subspaces, that is, in very few # dimensions that actually matter. Intuitively this makes sense # because an algorithm that should slice a pie is not going to slice # it over the edge, but more likely wielding a knife looking from # above. Hence to train intuitions we can play with k-means using few # variables, and imagine that these are the principal component # subspaces. # # Consider normal 2-D data with pre-defined correlations. # Plot the kmeans solutions for K=4 for varying correlations. N <- 20000 x1 <- rnorm(N); xa <- rnorm(N) # start with uncorrelated variables par(mfrow=c(6,6), mar=rep(.1,4))# for(co in seq(.00, .99, length=36)) { # loop over 36 correlations ## for(co in seq(.6, .7, length=36)) { # zoom in on switch of pattern xx <- scale(cbind(x1,co*x1+sqrt(1-co^2)*xa)) # xlim <- ylim <- 3.5*(c(-1,1)) clust <- kmeans(xx, xx[1:4,])$cluster plot(xx, xlim=xlim, ylim=ylim, pch=".", cex=0.5, col=c("red","green","blue","brown")[clust], xlab="", ylab="", xaxt="n", yaxt="n") text(x=-3.4, y=3.4, lab=round(co,8), adj=0, cex=.8) } # Something happens between correlation 0.6 and 0.7: # an abrupt switch from one geometric pattern to another. # One could think of the pattern for lower correlations # as slicing a pie, for higher correlations as slicing # a cigar. # # If in a practical situation the data are correlated # in a way that is conducive to multiple near-optimal patterns, # they can be found with multiple restarts, as we did with # the marketing data. This shows that these patterns are # indeed local minima of the kmeans criterion: co <- 0.648 # (near the switch of patterns for current x1 and xa) xx <- scale(cbind(x1,co*x1+sqrt(1-co^2)*xa)) for(irestart in 1:36) { clust <- k.means(xx, K=4)$cluster # our function plot(xx, xlim=xlim, ylim=ylim, pch=".", cex=0.5, col=c("red","green","blue","brown")[clust], xlab="", ylab="", xaxt="n", yaxt="n") } # # Q: What are the implications for statistical inference # for kmeans clustering? # Does bootstrap, for example, makes sense near correlation 0.6? # What is bootstrap going to tell you? # Let's try: pick a correlation near the switch point, and bootstrap. for(iboot in 1:36) { xb <- scale(xx[sample(N, replace=T),]) clust <- kmeans(xb, xb[1:4,])$cluster plot(xb, xlim=xlim, ylim=ylim, pch=".", cex=0.5, col=c("red","green","blue","brown")[clust], xlab="", ylab="", xaxt="n", yaxt="n") } # Indeed, bootstrap keeps flip-flopping between patterns, # with a strange attempt at a transition pattern at times. # Possible lesson: Yes, do bootstrap, but beware that the # multiplicity of solutions might be of a geometric nature. # # Come to think of it, we found multiple patterns in the # telecom marketing data also, just with multiple restarts! # The pattern had the colors blue and brown overlapping, # indicating that it needed a third dimension to accommodate # its four cluster centers: xlim <- ylim <- range(mktg.pca$Proj[,1:2]) icrit <- 1 clust <- mktg.km.lst.red[[icrit]]$cluster pairs.plus(mktg.pca$Proj[,1:4], pch=16, cex=.2, xlim=xlim, ylim=ylim, col=c("red","green","blue","brown")[clust]) # It appears that blue and brown are best separated in the positive # diagonal direction of PCs 3 and 4, a fact that is somewhat obscured # by the green and red clusters that are smudged over everything. # ##### # # Summary points: # # 1) Patterns in k-means clusterings have to do with the # geometry of the point scatter. If the point scatter is just a # multivariate normal blob, the geometry is fully described by the # covariance structure. One should therefore not be surprised if # there is a connection between PCA and k-means partitions. # Both respond to the geometry of the best fitting ellipsoid # round the data. # # 2) For normal (Gaussian) point scatters it is quite plausible # that the k-means solutions are fully determined by the # covariance structure. A little more surprising is that # this seems to hold for such instances as the marketing # data as well, where most of the variables are discrete. # Who would have expected a pie-slicing pattern in the # first two PCs very similar to the bivariate normal simulation # with correlations below 0.6. # 3) Note that we said "covariance structure". In fact, # k-means is a metric method in that it relies on # Euclidean distances. Depending on how the variables # are scaled, one can get very different k-means results. # The issue should become clear by imagining data with two # variables whose standard deviations are 1 and 100, respectively. # K-means will consider this as a cigar and slice it on the # second variable, never mind the correlation. If one wants the # correlation structure to drive the clustering, one should # scale all variables to unit variance, so covariance and # correlation are identical. In general, one should make # sure the variables are on comparable scales, and the # conventional 'equalization' of scales is by standardization # of the variables (z-scores: subtract from each variable its mean, # divide it by its stdev). #### # # Interpretation of cluster centers: # # Like regression coefficients and PCA loadings, # the kmeans centers are a major source for interpretation. # In what follows we plot cluster centers as profiles in # what is known as a "parallel coordinate plot": windows(wid=16, hei=12) par(mgp=c(1.8,.5,0), mar=c(2,2,1,1), mfcol=c(5,2)) for(icrit in 1:10) { # number of pattern; recall: icrit=5 ~ mktg[,9] crit <- mktg.km.crit.uniq[icrit] mktg.ctrs <- mktg.km.lst.red[[icrit]]$centers round(mktg.ctrs, 2) plot(x=c(.5,8.5), y=range(mktg.ctrs), type="n", xaxt="n", xlab=NA, ylab=NA) axis(side=1, at=1:ncol(mktg.ctrs), lab=colnames(mktg.ctrs)) lines(x=c(rbind(1:ncol(mktg.ctrs),1:ncol(mktg.ctrs),NA)), y=rep(c(par()$usr[3:4],NA),ncol(mktg.ctrs)), col="gray95") lines(par()$usr[1:2], c(0,0), col="gray95") for(i in 1:nrow(mktg.ctrs)) lines(mktg.ctrs[i,], col=c("red","green","blue","brown")[i]) text(x=par()$usr[2], y=par()$usr[3], lab=icrit, adj=c(1.5,-0.4)) text(x=par()$usr[2], y=par()$usr[4], lab=round(crit,3), adj=c(1.05,1.2)) } # The center coordinates indicate 'where' the cluster sits. # Judge their meaning in terms of: # - the first three variables: call volume # - the next three variables: dummies for needs/lifestyles # - age # - education # Which of these sets of centers is most interpretable? # Recall that we have natural groups of clusterings: # 1 by itself, 2-5 pretty much like mktg[,9], 6-8 also similar, # 9-10 different from all else. # (clustering 1 is interesting, and new to us: # red group by itself makes high volums, all else low vol; blue # high on business; green old age with low vol and no features) #### # # Summary points about K-means clustering: # # - K-means is more often than not used to segment # data that do NOT fall into natural groups. # Segmenting of ungrouped data can be meaningful. # BUT: It would be wrong and dishonest to say "We found groups". # We didn't find them; we made them up for convenience. # # - K-means arranges cluster centers in geometric patterns; # these patterns are local minima of the K-means criterion. # # - Which pattern solution one prefers depends not only on # the K-means criterion value but also on interpretability. # # - Flip-flopping between patterns can be an impediment # to statistical inference: one could conceive of # confidence regions for the cluster centers given # a fixed pattern, but the presence of multiple patterns # sabotages this idea. # # - PCA and K-means are naturally connected. In fact, there # exists theory according to which the K-means centers of # multivariate normally distributed data fall in PC # subspaces of large eigenvalues (Tarpey & Flury). # Therefore, plotting the first few PC projections with # color coded clusters may reveal the K-means patterns. # #### # # Open questions: # # We have not addressed the choice of 'K', the number of clusters. # There exists a literature on this topic, repleat with # quasi-F-tests and the like. I find these formal methods mostly # irrelevant or even misleading. If there are natural groups, you # should try to see them in a dimension reduction, such as a # scatterplot matrix of a few largest principal component # projections. If this doesn't work try a non-linear dimension # reduction method such as multidimensional scaling (MDS). [Ask # the instructor about MDS; he loves to talk about it.] # In soft sciences such as marketing and social sciences in # general, there are rarely natural clusters present, hence # clustering is really segmentation, and the question of 'correct # number of clusters' does not exist. If anything, one should # wonder whether the chosen number of clusters is compatible with # the geometry of the data cloud in p-space. For example, I find # K=5 generally not congenial for situations where the first two # PCs cover a majority of variance. K-means then carves an # ellipsoid in 2-D into 5 slices, which just doesn't seem right # because it violates the symmetry of an ellipsoid. If the data # projected onto the first two PCs looks not ellipsoidal but more # like a polygon with an odd number of corners, then slicing into # 3 or 5 might be sensible. # ################ # # * Other types of clustering: hierarchical methods # # A very different type of cluster analysis is based on the idea # of hierarchical trees. Their purpose is to find natural groups, # unlike K-means, which is more often used for segmentation, that is, # carving up data that don't have natural groups. # Hierarchical methods work in a bottom up fashion: # they recursively form partitions of the data by # joining groups that are near each other in some sense. # They are also called 'agglomerative' because they # recursively form unions of pairs of groups. # Here are the steps: # - The input is a distance matrix of size NxN. # Each entry is a proximity measure for a pair of points. # - One initializes the algorithm with the finest possible # partition in which every data point is its own group. # - One then joins the nearest two points to form the first # non-trivial group. # - Next one computes a distance measure of this two-point group # from all other one-point groups. This way one obtains an # (N-1)x(N-1) distance matrix. # - Now repeat: join the two nearest elements, which may be joining # another two points into a two-point group, or the existing # two-point group with another point into a three-point group. # - Which ever way, one again computes a distance of the newly # merged group from all others, resulting in an (N-2)x(N-2) # distance matrix. # - Recurse... # The output of this algorithms is represented as a hierarchical # tree which depicts all mergers of groups. # # Here is an example: again, one should scale the data first, then... swiss.dist <- dist(scale(swiss)) # see: help(dist) swiss.hclust <- hclust(swiss.dist) # see: help(hclust) plot(swiss.hclust) plot(swiss.hclust, hang=-1, main="", sub="", ylab="", xlab="") # (We choose the arguments for a less cluttered views.) # # The function 'hclust()' offers seven methods that # differ in how they form the reduced distance matrix after # each merger step: hclust.methods <- c("ward", "single", "complete", "average", "mcquitty", "median", "centroid") # We'll try them out before we give more explanations: windows(12,8) par(mfrow=c(3,3), mgp=c(1.8,.5,0), mar=c(2,3,2,1)) for(meth in hclust.methods) plot(hclust(swiss.dist, method=meth), hang=-1, xlab="", main=meth, cex=.45) # # Here is the explanation from 'help(hclust)': # - WARD''S MINIMUM VARIANCE method aims at finding compact, spherical # clusters. # - The COMPLETE LINKAGE method finds similar clusters. # - The SINGLE LINKAGE method (which is closely related to the minimal # spanning tree) adopts a 'friends of friends' clustering strategy. # - The other methods can be regarded as aiming for clusters with # characteristics somewhere between the single and complete link # methods. # Here are details for some of the methods: # for two existing clusters A and B, the methods compute the # distance d(A,B) as follows. # - SINGLE LINKAGE: the minimum distance between points in A and B # - COMPLETE LINKAGE: the maximum distance between points in A and B # - AVERAGE LINKAGE: the mean of distances between points in A and B # - WARD'S METHODS: the mean of the squared distances between all # points in the union of A and B # - MEDIAN LINKAGE: the median of distances between points in A and B # These methods, except for the last, have simple updating rules # that facilitate the recursion: # - SINGLE LINKAGE: d(AuB,C) = min(d(A,C),d(B,C)) # - COMPLETE LINKAGE: d(AuB,C) = max(d(A,C),d(B,C)) # - AVERAGE LINKAGE: d(AuB,C) = (N_A*d(A,C)+N_B*d(B,C))/(N_A+N_B) # - WARD'S METHODS: V(A) := SS(A)/N_A/(N_A-1)/2 # d(A,B) := V(AuB) # SS(AuB) = SS(A)+SS(B)+SS(A,B) # hence one keeps track of all SS(A)'s and # all SS(A,B)'s and collects them in a diagonal # Which method gives the most useful results depends on the data # and on the purpose. # # As mentioned, a major purpose of these methods is to # find natural groups in multivariate data. Whether # the 'clusters' found are natural or not should be # checked graphically. To this end, one needs a method # for cutting a clustering tree at a given level. The # function 'cutree()' serves for this purpose: it allows # us to cut at a given 'height' of the tree (on the vertical # axis of the tree plot), or for a given number of clusters. # We cut the 'swiss' fertility data into 6 clusters and # check what the various methods produce. # First we plot the tree once more: imeth <- 4 meth <- c("ward", "single", "complete", "average", "mcquitty", "median", "centroid")[imeth] # play with method swiss.hclust <- hclust(swiss.dist, method=meth) windows(8,8) plot(swiss.hclust, hang=-1, main=meth, xlab="", ylab="", sub="", cex=.75) # Cut the tree so we obtain six groups: swiss.grps <- cutree(swiss.hclust, k=6) # Tabulate so we know the sizes of the groups: table(swiss.grps) # Matrix-plot the data with color-coding of the groups: swiss.grps <- order(order(table(swiss.grps)))[swiss.grps] clrs <- c("black","brown","darkgreen","blue","red", "darkorange","magenta") windows(10,10) pairs.plus(swiss, col=clrs[swiss.grps], main=meth, cex=.5) # (The first line is arcane: it orders the groups so their # number id is matched to colors: small groups in dark colors, # large ones in bright colors:) table(swiss.grps) # # Some comments on our experiences with these methods on this data: # - Ward's method does not work; there are no spherical clusters. # - 'Complete linkage' does a reasonable job: it isolates Geneva # and splits both catholics and protestants into two groups. # - 'Single linkage' does an extremely meaningful job: it # separates catholics and protestants in two groups and # isolates 3 singletons and one pair. # Single linkage is known to 'suffer' from 'chaining', that is, # clusters it creates can consist of long strands of points. # Maybe this isn't a real disadvantage. # Remaining mystery: what makes La Vallee so special?... ##### # # Some artificial data with natural clusters, so we can find # out whether either K-means or hierarchical clustering finds them. # The following construction puts standard normal clusters at # 4 times the unit vectors (the vertices of the natural simplex): K <- 6 # number of natural clusters N <- K*100 # number of points per cluster = 100 p <- 6 # number of variables s <- 4 # cluster separation parameter: # center dists = s*sqrt(2)=5.66 clusdat <- matrix(rnorm(N*p), ncol=p) clusdat <- clusdat + s * ( ((row(clusdat)-1)%%K) == (col(clusdat)-1) ) pairs.plus(clusdat, cex=.3) # Make it clear that in each frame we are supposed to see two detached # clusters (top and right), and the four others plotted on top of each # other in the bottom left. # # - Does K-means find the clusters? clusdat.km <- k.means(clusdat, K=6) pairs.plus(clusdat, col=clrs[clusdat.km$cluster], cex=.3) # Great! K-means finds them, if we give it the proper cluster number # What if we give it the wrong number? clusdat.km <- k.means(clusdat, K=2) pairs.plus(clusdat, col=clrs[3+clusdat.km$cluster], cex=.3) # K=2: Lumps clusters together. # What about too many clusters? clusdat.km <- k.means(clusdat, K=7) pairs.plus(clusdat, col=clrs[clusdat.km$cluster], cex=.3) # some clusters are merged, others split... # K-means seems to be sensitive to choice of K. # # - Do hierarchical methods find the clusters? imeth <- 4 # play with method meth <- c("ward", "single", "complete", "average", "mcquitty", "median", "centroid")[imeth] clusdat.hc <- hclust(dist(clusdat), method=meth) #windows(8,8) plot(clusdat.hc, hang=-1, main=meth, xlab="", ylab="", sub="") # Indeed, they all do. Seems the problem was too easy. # We could reduce the separation of the centers, # which was large: 4*sqrt(2)=5.66 # At least there is comfort in the fact that clustering methods # do find the obvious natural clusters. # # ################################################################