#---------------------------------------------------------------------------
#
#	Cluster Analysis: pretend we don't know anything else about wines
#
#---------------------------------------------------------------------------
# install.packages("gmodels", dependencies=T) # contingency table
# install.packages("cluster", dependencies=T) # cluster plots

# --- file references relative to this path
setwd("~/courses/mich/text_analytics/")
source("R_scripts/text_utils.R")

require(stringr)
require(tm)
require(gmodels)
require(cluster)

# --- recover data


range(slimFreq)
slimTypes[1:10]
slimTypes[length(slimTypes)-(0:9)]

# --- 3 clusters
set.seed(33)  		# random starting points
k3 <- kmeans(as.matrix(slimDTM), centers=3, nstart=5); use.prop=F
k3 <- kmeans(  prop            , centers=3, nstart=5); use.prop=T

names(k3)
length(k3$cluster)	# which cluster
dim(k3$centers)		# cluster centroids (k x ncol)
c(k3$totss, NA, k3$withinss, NA, k3$tot.withinss, k3$betweenss)

table(k3$cluster)
table(Wine$color, k3$cluster)

CrossTable(Wine$color, k3$cluster, prop.t=F,chisq=T, format='SAS')

clusplot(as.matrix(slimDTM), k3$cluster, color=TRUE, shade=TRUE, labels=4, lines=1)

# --- 5 clusters
#		a red and a white cluster become apparent
set.seed(555)
k5 <- kmeans(as.matrix(slimDTM), centers=5, nstart=5); use.prop=F
set.seed(555)
k5 <- kmeans(   prop           , centers=5, nstart=5); use.prop=T

matrix(c(k3$totss, NA, k3$withinss, NA, NA, NA, k3$tot.withinss, k3$betweenss,
         k5$totss, NA, k5$withinss,         NA, k5$tot.withinss, k5$betweenss),
       nrow=2, byrow=T)

table(k5$cluster)

CrossTable(Wine$color, k5$cluster, prop.t=F,chisq=T, format='SAS')
#		labels
if(use.prop) {iRed <- 2; iWht <- 5} else {iRed <- 4; iWht <- 2}
redCluster <- k5$cluster == iRed
whtCluster <- k5$cluster == iWht
Wine[ which(redCluster)[1:10], "label"]
Wine[ which(whtCluster)[1:10], "label"]

#		this visualization doesn't help
clusplot(as.matrix(slimDTM), k5$cluster, color=TRUE, shade=TRUE, labels=4, lines=1)

#		means
red <- k5$centers[iRed,]
wht <- k5$centers[iWht,]
ss  <- sqrt(red^2+wht^2)
i   <- order(ss, decreasing=T)[1:25]
plot(red, wht, col='gray', xlab="Red Centroid", ylab="White Centroid")
text(red[i], wht[i], names(red[i]), cex=0.7)

# --- 8 clusters
#		two clusters appear more distinct
set.seed(853)
k8 <- kmeans(as.matrix(slimDTM), centers=8, nstart=5); use.prop=F
set.seed(853)
k8 <- kmeans(   prop           , centers=8, nstart=5); use.prop=T

matrix(c(k3$totss,  k3$tot.withinss, k3$betweenss,
         k5$totss,  k5$tot.withinss, k5$betweenss,
         k8$totss,  k8$tot.withinss, k8$betweenss),
       nrow=3, byrow=T)

table(k8$cluster)

CrossTable(Wine$color, k8$cluster, prop.t=F,chisq=T, format='SAS', digits=2)

if(use.prop) {iRed <- 8; iWht <- 5} else {iRed <- 3; iWht <- 2}
redCluster <- k5$cluster == iRed
whtCluster <- k5$cluster == iWht
Wine[ which(redCluster)[1:10], c("color","label")]
Wine[ which(whtCluster)[1:20], c("color","label")]

#		means look similar to 5 solution (with proportions)
red <- k8$centers[iRed,]
wht <- k8$centers[iWht,]
ss  <- sqrt(red^2+wht^2)
i   <- order(ss, decreasing=T)[1:25]
plot(red, wht, col='gray', xlab="Red Centroid", ylab="White Centroid")
text(red[i], wht[i], names(red[i]), cex=0.7)

#		lengths not terribly relevant between red and wht clusters
boxplot(str_length(Wine$description) ~ as.factor(k8$cluster))

#		ratings differ as well
boxplot(Wine$points ~ as.factor(k8$cluster), xlab="Cluster", ylab="Rating Points")

red <- k8$centers[iRed,]
vg  <- k8$centers[2,]		# adjust
ss  <- sqrt(red^2+vg^2)
i   <- order(ss, decreasing=T)[1:25]
plot(red, vg, col='gray', xlab="Red Centroid", ylab="Highly Rated Centroid")
text(red[i], vg[i], names(red[i]), cex=0.7)

# --- wordcloud using means
require(wordcloud)
range(red)
par(mfrow=c(1,2))
	wordcloud(names(red), floor(100*red/max(red)), min.freq=3)
	wordcloud(names(wht), floor(100*wht/max(wht)), min.freq=3)
par(mfrow=c(1,1))

#		time effect?
plot(redCluster)
plot(whtCluster)

plot(Wine[,"color"]=="Red")