#---------------------------------------------------------------------------
#
# Spectral methods
#
#---------------------------------------------------------------------------

# install.packages("tm"     , dependencies=T)
# install.packages("gmodels", dependencies=T)   # CrossTable
# install.packages("AUC"    , dependencies=T)
# install.packages("tree"   , dependencies=T)
# install.packages("lsa"    , dependencies=T)   # latent semantic analysis (will not use)
# install.packages("glmnet" , dependencies=T)

setwd("~/courses/mich/text_analytics/")

source("R_scripts/text_utils.R")
source("R_scripts/missing_data.R")

load("R_scripts/Wine.Rdata")   ; dim(Wine)      # 20508    13
load("R_scripts/oovFreq.Rdata"); dim(oovFreq)   # 20508  2659

mean.na <- function(x) mean(x, na.rm=TRUE)

#---------------------------------------------------------------------------
#
# LSA analysis of wines
#
#---------------------------------------------------------------------------
# require(lsa)   # might as well roll our own

# --- Try the SVD on a sample of documents and see if R can handle it;
#     3000 rows is easily doable, just not very fast.

set.seed(37337)
i <- i.subset <- sort(sample(1:nrow(oovFreq), 3000))   # save those indices

m <- colSums(oovFreq[i,])   # some word types do not appear in the sample
sum(0 == m)

C <- oovFreq[i, m>0]; dim(C)   # 3000 x 2250
Wine.subset <- Wine[i,]        # matching subset of the full data set

# raw counts
udv <- svd(C)
plot(udv$d[1:400], log="",
     main="Spectrum of Raw Frequencies", xlab="Dimension", ylab="Singular Value")

# CCA scaling: divide each count by the square roots of its row and column totals
ni <- rowSums(C)
mj <- colSums(C)
C.cca <- C/sqrt(ni)
C.cca <- t( t(C.cca)/sqrt(mj) )
udv.cca <- svd(C.cca)
udv.cca$d[1:4]
plot(udv.cca$d[1:50], log="",
     main="Spectrum with CCA Scaling", xlab="Dimension", ylab="Singular Value")

# log tf-idf scaling
C.tfidf <- log(1+C)
df <- colSums(C>0)   # document frequency of each word type
C.tfidf <- t( t(C.tfidf) * log(1+nrow(C)/df) )
udv.tfidf <- svd(C.tfidf)
plot(udv.tfidf$d[1:600], log="xy",
     main="Spectrum with tf-idf Scaling", xlab="Dimension", ylab="Singular Value")

# probability scaling: each row becomes a distribution over word types
n <- rowSums(C)
C.prob <- C/n
rowSums(C.prob)[1:4]
udv.prob <- svd(C.prob)
plot(udv.prob$d[1:600], log="xy",
     main="Spectrum with Probability Scaling", xlab="Dimension", ylab="Singular Value")
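
# Aside (not part of the original analysis): a self-contained sanity check that
# the elementwise CCA recipe used above -- dividing each count by the square
# roots of its row and column totals -- matches the matrix form
# D_r^{-1/2} %*% C %*% D_c^{-1/2}.  Toy data only; nothing here feeds later code.
set.seed(1)
toy <- matrix(rpois(12, lambda=3) + 1, nrow=3)    # small, strictly positive count matrix
rs  <- rowSums(toy); cs <- colSums(toy)
elementwise <- t( t(toy/sqrt(rs)) / sqrt(cs) )    # same idiom as C.cca above
matrixform  <- diag(1/sqrt(rs)) %*% toy %*% diag(1/sqrt(cs))
all.equal(elementwise, matrixform)                # TRUE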
# --- What are the singular vectors?
len <- rowSums(C)
par(mfrow=c(1,2))
  plot(len, udv.cca$u[,1],   xlab="Number of Tokens", ylab="First Singular Vector", main="CCA")
  plot(len, udv.tfidf$u[,1], xlab="Number of Tokens", ylab="First Singular Vector", main="TF-IDF")
reset()

plot(len, udv.prob$u[,1], xlab="Number of Tokens", ylab="First Singular Vector", main="Prob")

plot(udv.cca$u[,2], udv.prob$u[,2])

# --- Look at other singular vectors
set.seed(253)
ii <- sample(1:nrow(C), 500)   # fewer points in the plot
pairs(udv.cca$u[ii,2:5],
      labels=c(expression("U"[2]), expression("U"[3]),   # subscripts in the plot labels
               expression("U"[4]), expression("U"[5])) )

color <- Wine.subset$color[ii]
col <- rep('white', length(color))
col[color=="Red"]   <- 'red'
col[color=="White"] <- 'gold'
pairs(udv.cca$u[ii,2:5],
      labels=c(expression("U"[2]), expression("U"[3]),
               expression("U"[4]), expression("U"[5])),
      col=col )

# identify color?  2526 out of 3000 have known color
Wine.subset$U2 <- udv.cca$u[,2]   # add the components to the data frame
Wine.subset$U3 <- udv.cca$u[,3]
Wine.subset$U4 <- udv.cca$u[,4]
Wine.subset$U5 <- udv.cca$u[,5]

# boxplots indicate we can use these to discriminate red from white
par(mfrow=c(1,2))
  boxplot(U4 ~ color, data=Wine.subset, ylim=c(min(Wine.subset$U4), 0.10), ylab="U4")
  abline(h=0, col='gray')
  boxplot(U5 ~ color, data=Wine.subset, ylim=c(min(Wine.subset$U5), 0.10), ylab="U5")
  abline(h=0, col='gray')
reset()

(tab <- table(Wine.subset$color, Wine.subset$U4<0 & Wine.subset$U5>0))

# see if logistic regression does better
Wine.subset$y <- Wine.subset$color == 'Red'
knowColor <- !is.na(Wine.subset$color)
summary(lr <- glm(y ~ U4 + U5, data=Wine.subset, subset=knowColor, family='binomial'))
fit <- predict(lr, type='response')
(tab <- table(Wine.subset$color[knowColor], fit<0.37))

# --- show words for leading components
V <- udv.cca$v[,1:25]; rownames(V) <- colnames(C)

plot(V[,2], V[,3], xlab=expression("V"[2]), ylab=expression("V"[3]), col='gray')
label <- 0.10 < sqrt(V[,2]^2 + V[,3]^2)
text(V[label,2], V[label,3], rownames(V)[label], cex=0.7, srt=45)

# as a function (plot_loadings is defined in text_utils.R)
plot_loadings(V, 2, 3, 0.08, cex=0.5)   # v2 ... v3 nose
plot_loadings(V, 4, 5, 0.08, cex=0.5)   # flavors
plot_loadings(V, 6, 7, 0.08, cex=0.5)   # flavors
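
# plot_loadings comes from text_utils.R.  If that file is not at hand, a minimal
# stand-in consistent with how it is called here (matrix, two column indices, a
# labeling threshold, extra args passed through to text) might look like this;
# the real definition may differ:
plot_loadings_sketch <- function(V, j1, j2, threshold, ...) {
  plot(V[,j1], V[,j2], col='gray',
       xlab=substitute("V"[j], list(j=j1)), ylab=substitute("V"[j], list(j=j2)))
  label <- threshold < sqrt(V[,j1]^2 + V[,j2]^2)   # label only the big loadings
  text(V[label,j1], V[label,j2], rownames(V)[label], srt=45, ...)
}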
# --- What are the clusters?
col <- rep('white', length(Wine.subset$color))   # placeholder
col[Wine.subset$color=="Red"]   <- 'red'
col[Wine.subset$color=="White"] <- 'gold'

# three evident clusters
plot(Wine.subset$U2, Wine.subset$U3,
     xlab=expression("U"[2]), ylab=expression("U"[3]), col=col)

par(mfrow=c(2,1))
  plot(i.subset, Wine.subset$U2, xlab="File Position", ylab=expression("U"[2]), col=col)
  plot(i.subset, Wine.subset$U3, xlab="File Position", ylab=expression("U"[3]), col=col)
reset()

# look at the text for the clusters
Wine$description[i.subset[Wine.subset$U2 >  0.01]][1:5]
Wine$description[i.subset[Wine.subset$U2 <  0.02 & Wine.subset$U3 >  0.05]][1:5]
Wine$description[i.subset[Wine.subset$U2 <  0.02 & Wine.subset$U3 < -0.02]][1:5]

# look at 2 more components... what happened?
plot(Wine.subset$U4, Wine.subset$U5, xlab=expression("U"[4]), ylab=expression("U"[5]))

par(mfrow=c(2,1))
  plot(i.subset, Wine.subset$U4, xlab="File Position", ylab=expression("U"[4]), col=col)
  plot(i.subset, Wine.subset$U5, xlab="File Position", ylab=expression("U"[5]), col=col)
reset()

Wine$description[i.subset[Wine.subset$U4 > 0.05 & Wine.subset$U5 > 0.05]][1:5]

# zoom in
par(mfrow=c(2,1))
  plot(i.subset, Wine.subset$U4, xlab="File Position", ylab=expression("U"[4]), xlim=c(9000,9600))
  abline(h=0, col='gray')
  plot(i.subset, Wine.subset$U5, xlab="File Position", ylab=expression("U"[5]), xlim=c(9000,9600))
  abline(h=0, col='gray')
reset()

Wine$description[i.subset[which.max(Wine.subset$U4)] + c(-1,0,1)]   # 9437
Wine$description[i.subset[which.max(Wine.subset$U5)] + c(-1,0,1)]   # 9360

# return to the source text to see what's happening...
Wine[9150:9200, "description"]
Wine[9150:9280, "date"]

#---------------------------------------------------------------------------
#
# LSA analysis of wines: foody period
#
#---------------------------------------------------------------------------

set.seed(2562)
i <- i.subset2 <- 12500:15500                # start with 3001 reviews
# big <- 1892; i <- i.subset2 <- i[-big]     # reduces from 3001 to 3000 reviews
# big <- 2149; i <- i.subset2 <- i[-big]     # now 2999

m <- colSums(oovFreq[i,])
sum(0 == m)

C <- oovFreq[i, m>0]; dim(C)
Wine.subset <- Wine[i,]

Wine.subset$description[sample(1:3001, 20)]

# raw counts
udv <- svd(C)
plot(udv$d[1:600], log="xy",
     main="Spectrum of Raw Frequencies", xlab="Dimension", ylab="Singular Value")

# CCA scaling
ni <- rowSums(C)
mj <- colSums(C)
C.cca <- C/sqrt(ni)
C.cca <- t( t(C.cca)/sqrt(mj) )
udv.cca <- svd(C.cca); udv.cca$d[1:4]
plot(udv.cca$d[1:600], log="xy",
     main="Spectrum with CCA Scaling", xlab="Dimension", ylab="Singular Value")

# length and first component
ni <- rowSums(C)
plot(ni, udv.cca$u[,1], xlab="Number of Tokens", ylab="First Singular Vector", main="CCA")

# --- leading singular vectors
U <- udv.cca$u[,1:20]
pairs(U[,2:5],
      labels=c(expression("U"[2]), expression("U"[3]),
               expression("U"[4]), expression("U"[5])) )

# --- more outliers?
length(big <- which(U[,3] < -0.5)); U[big,5]
big                            # 1392
Wine.subset$description[big]   # Yuck!  Let's remove this one and see what happens.

# --- another outlier (watch out for the <- inside the call)
length(big <- which(U[,5] > 0.4)); U[big,5]
big                            # 2798
Wine.subset$description[big]   # unusual words, but not out of character

# --- examine a few more singular vectors ... another big straggler
pairs(U[,6:9],
      labels=c(expression("U"[6]), expression("U"[7]),
               expression("U"[8]), expression("U"[9])) )

# --- several big ones
length(big <- which(U[,7] < -0.4 | U[,8] < -0.4)); U[big,6:8]
big
Wine.subset$description[big]   # Yuck!  Remove these too?

# dependent?  (use a copy so we don't clobber U, which is needed again below)
U5 <- udv.cca$u[,1:5]; round(t(U5) %*% U5, 5)
round(cor(U5))

# --- which words load on the components
V <- udv.cca$v[,1:25]; rownames(V) <- colnames(C)

j <- 3; hist(V[,j], breaks=50, main=substitute("Loadings V"[j], list(j=j)))
boxplot(V[,j], add=TRUE, horizontal=TRUE, at=0)
ii <- order(abs(V[,j]), decreasing=TRUE)[1:40]
V[ii[1:15], j]
text(V[ii[1:15],j], 25, rownames(V)[ii[1:15]], cex=0.7, srt=45)

colSums(C[,ii[1:15]])   # common, or used in just a few reviews?

# --- show words for leading components
V <- udv.cca$v[,1:25]; rownames(V) <- colnames(C)

plot(V[,2], V[,3], xlab=expression("V"[2]), ylab=expression("V"[3]), col='gray')
label <- 0.07 < sqrt(V[,2]^2 + V[,3]^2)
text(V[label,2], V[label,3], rownames(V)[label], cex=0.7, srt=45)

# as a function
plot_loadings(V,  2,  3, 0.08)
plot_loadings(V,  4,  5, 0.08)
sort(V[,5])[1:4]
plot_loadings(V,  6,  7, 0.08)
plot_loadings(V,  8,  9, 0.08)
plot_loadings(V, 10, 11, 0.08)
plot_loadings(V, 12, 13, 0.07)
plot_loadings(V, 14, 15, 0.07)
plot_loadings(V, 16, 17, 0.07)
plot_loadings(V, 18, 19, 0.07)
plot_loadings(V, 20, 21, 0.07)
plot_loadings(V, 22, 23, 0.07)
plot_loadings(V, 24, 25, 0.07)

# words
V <- udv.cca$v[,1:20]
plot(V[,7], V[,6], xlab=expression("V"[7]), ylab=expression("V"[6]), col='gray')
label <- 0.1 < sqrt(V[,7]^2 + V[,6]^2)
text(V[label,7], V[label,6], colnames(C)[label], cex=0.7, srt=45)

sum(big <- U[,9] < -0.4); U[big,9]
Wine.subset$description[big]

v9 <- udv.cca$v[,9]
hist(v9, breaks=50)
boxplot(v9, add=TRUE, horizontal=TRUE, at=0)
j <- order(abs(v9), decreasing=TRUE)[1:40]
v9[j[1:15]]
colSums(C[,j[1:15]])   # are they common words, or used in just a few reviews?
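
# Aside: the order(abs(...)) idiom above recurs whenever we ask which words
# drive a component.  A small hypothetical helper (not from text_utils.R)
# wrapping that idiom:
top_loading_words <- function(V, j, k=15) {
  ii <- order(abs(V[,j]), decreasing=TRUE)[1:k]   # k largest |loadings| on component j
  V[ii, j]                                        # named vector: word -> loading
}
V <- udv.cca$v[,1:25]; rownames(V) <- colnames(C)
top_loading_words(V, 9)                           # should echo the v9 inspection above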
#####################################################################################
#
#   Random projection alternative
#
#####################################################################################

# --- return to the original problem; first compute the SVD as in the original example
set.seed(37337)
i <- i.subset <- sort(sample(1:nrow(oovFreq), 3000))
m <- colSums(oovFreq[i,])      # some word types not in the sample
sum(0 == m)
C <- oovFreq[i, m>0]; dim(C)   # 3000 x 2250

udv <- svd(C)
plot(udv$d[1:100], ylab="Singular Values", main="Exact")   # most energy in the leading few

# --- compare to random projection (code in text_utils.R)
udv.rp <- random_projection_svd(C, 100, verbose=TRUE)
dim(udv.rp$u)
plot(udv.rp$d, ylab="Singular Values", main="Random Projection")

# compare the resulting singular values
x <- udv.rp$d; y <- udv$d[1:100]
plot(x, y, xlab="Singular Values, Random Projection", ylab="Singular Values")
abline( r <- lm(y~x) ); summary(r)$r.squared

# recovery of subspaces (impact of power iterations)
?cancor
cca <- cancor(udv$u[,1:50], udv.rp$u[,1:50])
names(cca)
plot(cca$cor[1:25], ylim=c(0.925,1), ylab="Canonical Correlation")

# role of power iterations (power.iter=1 by default)
udv.rp0 <- random_projection_svd(C, 100, power.iter=0, verbose=TRUE)
udv.rp2 <- random_projection_svd(C, 100, power.iter=2, verbose=TRUE)
cca.0 <- cancor(udv$u[,1:50], udv.rp0$u[,1:50])
points(cca.0$cor[1:25], pch=19, col='gray')
cca.2 <- cancor(udv$u[,1:50], udv.rp2$u[,1:50])
points(cca.2$cor[1:25], pch=19, col='red')

# --- build the SVD of the full data set; use it to model color and points
hasColor <- !is.na(Wine[,"color"]); sum(hasColor)

Data <- data.frame(Wine[hasColor, c("color","alcohol","vintage","price","points")])
Data <- fill.missing(Data)
dim(Data)   # 17336 x 9
colnames(Data)

set.seed(38)
test  <- sample(1:nrow(Data), 10000)
train <- (1:nrow(Data))[-test]

# do the random projection *before* the train/test split... transductive inference
C  <- oovFreq[hasColor,]
mj <- colSums(C)
C  <- C[, mj>0]
mj <- colSums(C)
ni <- rowSums(C)
C  <- C/sqrt(ni)
C  <- t( t(C)/sqrt(mj) )
dim(C)   # 17336 x 2601
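
# random_projection_svd also lives in text_utils.R.  A minimal sketch of the
# randomized SVD it implements (random test matrix, optional power iterations,
# QR, then an exact SVD of the small matrix -- in the style of Halko,
# Martinsson & Tropp 2011), written with the same assumed signature; the real
# version may differ in details:
random_projection_svd_sketch <- function(C, k, power.iter=1, verbose=FALSE) {
  C <- as.matrix(C)
  Omega <- matrix(rnorm(ncol(C)*k), ncol=k)   # random test matrix
  Y <- C %*% Omega                            # sample the column space of C
  for (p in seq_len(power.iter))              # power iterations sharpen the spectrum
    Y <- C %*% (t(C) %*% Y)
  Q <- qr.Q(qr(Y))                            # orthonormal basis, n x k
  if (verbose) cat("projected to", ncol(Q), "columns\n")
  B <- t(Q) %*% C                             # k x m; small enough for an exact svd
  udv <- svd(B)
  list(u=Q %*% udv$u, d=udv$d, v=udv$v)       # approximate SVD of C
}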
s <- rowSums(C)   # check for degenerate rows
any(is.na(s))
s[1:20]

udv.rp <- random_projection_svd(C, 200, power.iter=2, verbose=TRUE)
plot(udv.rp$d[1:50], log="",
     main="RP Spectrum with CCA Scaling", xlab="Dimension", ylab="RP Singular Value")

Data <- data.frame(Data, U=udv.rp$u)   # 17336 x 207
colnames(Data)

# look at the principal components
plot(Data[,"U.2"])

# --- Fit a regression for points
#     10 components get R^2 up to 60%; 50 gets 64%; 100 gets 65%
sum(Data[,"Miss.points"])

nDim <- 100
vars <- paste(paste("U", 1:nDim, sep="."), collapse="+")
formula <- as.formula(paste("points ~", vars))
regr <- lm(formula, data=Data[train,])
summary(regr)

# --- use the lasso with all columns from the data frame
require(glmnet)

Y <- Data[,"color"]                             # glmnet uses explicit X, Y, not a formula
model <- as.formula(" ~ .")                     # everything
X <- model.matrix(model, data=Data[,-1])[,-1]   # drop Y from the data, drop the intercept
dim(X)   # 17336 x 206

lasso.regr <- glmnet(X[train,], Y[train], family='binomial', alpha=1)
plot(lasso.regr)

# inspect the coefficients
names(lasso.regr)

coef <- predict(lasso.regr, s=0.001, exact=TRUE, type="coefficients")   # some zero; s is the penalty
c <- as.vector(coef); m <- matrix(c, ncol=1); rownames(m) <- attributes(coef)$Dimnames[[1]]
as.matrix(m[c!=0,])

coef <- predict(lasso.regr, s=0.010, exact=TRUE, type="coefficients")   # most zero; larger penalty
c <- as.vector(coef); m <- matrix(c, ncol=1); rownames(m) <- attributes(coef)$Dimnames[[1]]
as.matrix(m[c!=0,])

coef <- predict(lasso.regr, s=0.100, exact=TRUE, type="coefficients")   # all but U4, U6 are 0
c <- as.vector(coef); m <- matrix(c, ncol=1); rownames(m) <- attributes(coef)$Dimnames[[1]]
as.matrix(m[c!=0,])

# use 10-fold CV to pick the tuning parameter lambda (aka s)
set.seed(6)
cv.out <- cv.glmnet(X[train,], Y[train], family='binomial', alpha=1)
plot(cv.out)
(cv.out$lambda.min)
(bestlam <- cv.out$lambda.1se)

# work with the relatively sparse model
coef <- predict(lasso.regr, s=bestlam, exact=TRUE, type="coefficients")
c <- as.vector(coef); m <- matrix(c, ncol=1); rownames(m) <- attributes(coef)$Dimnames[[1]]
as.matrix(m[c!=0,])
sum(as.vector(c) != 0)   # many are not 0

# high probabilities... calibrated?
lasso.pred <- predict(lasso.regr, s=bestlam, newx=X[test,], type="response")
y01 <- ifelse(Data[test,"color"]=="White", 1, 0)   # glmnet models Pr of the second factor level, White
plot(lasso.pred, y01, col='gray', xlab="Estimated Pr(White)", ylab="True Color")
o <- order(lasso.pred)
x <- lasso.pred[o]
y <- y01[o]
r <- lm(y ~ poly(x,6))
lines(x, fitted(r), col='red'); abline(a=0, b=1, col='gray')
abline(h=0.5, col='gray', lty=3)
i <- which(0.80 < x & x < 0.85); length(i)
points(mean(x[i]), mean(y[i]))

# ROC
require(AUC)
tmp <- roc(lasso.pred, as.factor(Data[test,"color"]))
auc(tmp)   # 0.995
plot(tmp)

table(Data[test,"color"], lasso.pred>0.5)
confusion_summary(Data[test,"color"]=="White", lasso.pred>0.5)
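
# confusion_summary is defined in text_utils.R.  A stand-in based only on how it
# is called above (logical truth, logical prediction) -- assumed behavior, the
# real helper may report more:
confusion_summary_sketch <- function(truth, pred) {
  tab <- table(truth, pred)                 # 2 x 2 confusion table
  acc <- mean(truth == pred, na.rm=TRUE)    # overall agreement rate
  list(table=tab, accuracy=acc)
}
# confusion_summary_sketch(Data[test,"color"]=="White", lasso.pred>0.5)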