#--------------------------------------------------------------------------- # # Cluster Analysis: pretend we don't know anything else about wines # #--------------------------------------------------------------------------- # install.packages("gmodels", dependencies=T) # contingency table # install.packages("cluster", dependencies=T) # cluster plots # --- file references relative to this path setwd("~/courses/mich/text_analytics/") source("R_scripts/text_utils.R") require(stringr) require(tm) require(gmodels) require(cluster) # --- recover data range(slimFreq) slimTypes[1:10] slimTypes[length(slimTypes)-(0:9)] # --- 3 clusters set.seed(33) # random starting points k3 <- kmeans(as.matrix(slimDTM), centers=3, nstart=5); use.prop=F k3 <- kmeans( prop , centers=3, nstart=5); use.prop=T names(k3) length(k3$cluster) # which cluster dim(k3$centers) # cluster centroids (k x ncol) c(k3$totss, NA, k3$withinss, NA, k3$tot.withinss, k3$betweenss) table(k3$cluster) table(Wine$color, k3$cluster) CrossTable(Wine$color, k3$cluster, prop.t=F,chisq=T, format='SAS') clusplot(as.matrix(slimDTM), k3$cluster, color=TRUE, shade=TRUE, labels=4, lines=1) # --- 5 clusters # a red and a white cluster become apparent set.seed(555) k5 <- kmeans(as.matrix(slimDTM), centers=5, nstart=5); use.prop=F set.seed(555) k5 <- kmeans( prop , centers=5, nstart=5); use.prop=T matrix(c(k3$totss, NA, k3$withinss, NA, NA, NA, k3$tot.withinss, k3$betweenss, k5$totss, NA, k5$withinss, NA, k5$tot.withinss, k5$betweenss), nrow=2, byrow=T) table(k5$cluster) CrossTable(Wine$color, k5$cluster, prop.t=F,chisq=T, format='SAS') # labels if(use.prop) {iRed <- 2; iWht <- 5} else {iRed <- 4; iWht <- 2} redCluster <- k5$cluster == iRed whtCluster <- k5$cluster == iWht Wine[ which(redCluster)[1:10], "label"] Wine[ which(whtCluster)[1:10], "label"] # this visualization doesn't help clusplot(as.matrix(slimDTM), k5$cluster, color=TRUE, shade=TRUE, labels=4, lines=1) # means red <- k5$centers[iRed,] wht <- k5$centers[iWht,] ss <- sqrt(red^2+wht^2) i <- order(ss, decreasing=T)[1:25] plot(red, wht, col='gray', xlab="Red Centroid", ylab="White Centroid") text(red[i], wht[i], names(red[i]), cex=0.7) # --- 8 clusters # two clusters appear more distinct set.seed(853) k8 <- kmeans(as.matrix(slimDTM), centers=8, nstart=5); use.prop=F set.seed(853) k8 <- kmeans( prop , centers=8, nstart=5); use.prop=T matrix(c(k3$totss, k3$tot.withinss, k3$betweenss, k5$totss, k5$tot.withinss, k5$betweenss, k8$totss, k8$tot.withinss, k8$betweenss), nrow=3, byrow=T) table(k8$cluster) CrossTable(Wine$color, k8$cluster, prop.t=F,chisq=T, format='SAS', digits=2) if(use.prop) {iRed <- 8; iWht <- 5} else {iRed <- 3; iWht <- 2} redCluster <- k5$cluster == iRed whtCluster <- k5$cluster == iWht Wine[ which(redCluster)[1:10], c("color","label")] Wine[ which(whtCluster)[1:20], c("color","label")] # means look similar to 5 solution (with proportions) red <- k8$centers[iRed,] wht <- k8$centers[iWht,] ss <- sqrt(red^2+wht^2) i <- order(ss, decreasing=T)[1:25] plot(red, wht, col='gray', xlab="Red Centroid", ylab="White Centroid") text(red[i], wht[i], names(red[i]), cex=0.7) # lengths not terribly relevant between red and wht clusters boxplot(str_length(Wine$description) ~ as.factor(k8$cluster)) # ratings differ as well boxplot(Wine$points ~ as.factor(k8$cluster), xlab="Cluster", ylab="Rating Points") red <- k8$centers[iRed,] vg <- k8$centers[2,] # adjust ss <- sqrt(red^2+vg^2) i <- order(ss, decreasing=T)[1:25] plot(red, vg, col='gray', xlab="Red Centroid", ylab="Highly Rated Centroid") text(red[i], vg[i], names(red[i]), cex=0.7) # --- wordcloud using means require(wordcloud) range(red) par(mfrow=c(1,2)) wordcloud(names(red), floor(100*red/max(red)), min.freq=3) wordcloud(names(wht), floor(100*wht/max(wht)), min.freq=3) par(mfrow=c(1,1)) # time effect? plot(redCluster) plot(whtCluster) plot(Wine[,"color"]=="Red")