######## # # Function that repackages 'kmeans()' so the initialization # is a random selection of points from the data matrix. # (There is also a slight jitter applied to the initialization # because 'kmeans()' barfs when there are exact ties, as # occur with non-zero probability when the data are bootstrap # samples.) # k.means <- function(x, K.or.align) { N <- nrow(x) p <- ncol(x) if(length(K.or.align)==1) { align <- NULL K <- K.or.align } else { align <- K.or.align K <- length(unique(align)) } km <- kmeans(x, iter.max=1000, centers = x[sample(N, size=K),] ) if(!is.null(align)) { # km <- align.kmeans(align, km) segs.u <- sort(union(align, km$cluster)) segs.n <- length(segs.u) seg <- km$clust tab <- table(factor(align, levels=segs.u), factor(km$clust, levels=segs.u)) ord <- rep(NA,segs.n) for(i in 1:segs.n) { max.tab <- max(tab) max.row <- row(tab)[tab==max.tab][1] max.col <- col(tab)[tab==max.tab][1] ord[max.col] <- max.row tab[max.row,] <- -Inf; tab[,max.col] <- -Inf } km$cluster <- ord[km$cluster] km$centers <- km$centers[order(ord),] # inverse mapping: order(ord) } return(km) } ######## # # Function to align new segmentations 'segs' with an old segmentation 'sego': # IMPORTANT: 'segs' can have multiple segmentations, one per ROW! # align.segments <- function(sego, segs, ret="seg") { # ret="ord", ret="both" if(is.null(dim(segs))) segs <- matrix(segs, nrow=1) segs <- as.matrix(segs) if(ncol(segs)!=length(sego)) { cat("align.segments(): incompatible sizes of segmentations \n"); stop() } # segs.u <- sort(union(sego, segs)) segs.n <- length(segs.u) nSegn <- nrow(segs) for(iseg in 1:nSegn) { seg <- segs[iseg,] tab <- table(factor(sego, levels=segs.u), factor(seg , levels=segs.u)) ord <- rep(NA,segs.n); names(ord) <- segs.u for(i in 1:segs.n) { max.tab <- max(tab) max.row <- row(tab)[tab==max.tab][1] max.col <- col(tab)[tab==max.tab][1] ord[max.col] <- max.row tab[max.row,] <- -Inf; tab[,max.col] <- -Inf } segs[iseg,] <- ord[seg] } if(ret=="seg") return(segs) else if(ret=="ord") return(ord) else return(list(seg=segs, ord=ord)) } # Experiments for debugging: #dat <- dat.all[,vars] #dat <- dat.all[,vars.seg] #sego <- segs.all #segs <- random.restart(dat, nsegs=7, nrestarts=1) #table(sego, segs) # original #table(sego, segs=align.segments(sego, segs)) # diagonally matched # # ######## # # Function to align list of kmeans outputs to a reference segmentation: # align.kmeans <- function(seg.ref, km.lst, verbose=200) { segs.u <- sort(unique(seg.ref)) segs.n <- length(segs.u) segs.f <- factor(seg.ref, levels=segs.u) if(!is.list(km.lst)) km.lst <- list(km.lst) for(iseg in 1:length(km.lst)) { seg <- km.lst[[iseg]]$clust tab <- table(segs.f, factor(seg, levels=segs.u)) ord <- rep(NA,segs.n); names(ord) <- segs.u for(i in 1:segs.n) { max.tab <- max(tab) max.row <- row(tab)[tab==max.tab][1] max.col <- col(tab)[tab==max.tab][1] ord[max.col] <- max.row tab[max.row,] <- -Inf; tab[,max.col] <- -Inf } km.lst[[iseg]]$clust <- ord[seg] km.lst[[iseg]]$cent <- km.lst[[iseg]]$cent[order(ord),] if(iseg%%verbose==0) cat(iseg," ") } cat("\n") if(length(km.lst)==1) return(km.lst[[1]]) else return(km.lst) } # try: # ctro <- mktg[sample(nrow(mktg),4),-9] # ctrn <- apply(ctro, 2, jitter, fac=.01) # as.matrix(dist(rbind(ctro, # align.centers(ctro, ctrn, ret="ctr"))))[1:K,1:K+K] # withinss <- function(dat, segs) { N <- nrow(dat) segs.u <- sort(unique(c(segs))) segs.n <- length(segs.u) centers <- matrix(NA, nrow=segs.n, ncol=ncol(dat)) withinss <- rep(NA, segs.n) sizes <- table(factor(segs, levels=segs.u)) for(i in 1:segs.n) { centers[i,] <- t(dat[segs==segs.u[i],]) %*% rep(1/sizes[i],sizes[i]) withinss[i] <- sum((t(dat[segs==segs.u[i],])-centers[i,])^2) } return(withinss) } ## sum(withinss(dat, mktg[,9])) ## sum(withinss(scale(mktg[,-9]), mktg.km.lst[[1]]$cluster)) ## sum(mktg.km.lst[[1]]$within) ################################################################