#---------------------------------------------------------------------------
#
#	Regression problems
#
#---------------------------------------------------------------------------

# install.packages("tm"   , dependencies=T)
# install.packages("leaps", dependencies=T)					# if want to use stepwise
# install.packages("glmnet"  , dependencies=T)
# install.packages("wordcloud", dependencies=T)

require (tm)

setwd("~/courses/mich/text_analytics/")
source("R_scripts/text_utils.R")								
source("R_scripts/missing_data.R")							# fill.missing

load("R_scripts/Wine.Rdata")		;  dim(Wine)
load("R_scripts/oovFreq.Rdata")	;  dim(oovFreq)			# common first, 20508  2645

#---------------------------------------------------------------------------
#
#	Predicting wine rating
#
#---------------------------------------------------------------------------

# --- build unified data frame for modeling

head(colSums(oovFreq))
oovFreq[1:5,1:10]

lengths<- rowSums(oovFreq)						# number tokens in each document

props  <- oovFreq / lengths						# or sqrt length (Poisson)
rowSums(props[1:4,])   							# each row sums to 1
dim(props)  									# 20508  2645


# Fill missing data before join to text or include response
sum(is.na(Wine[,"points"]))						# missing 179
hasPoints <- !is.na(Wine[,"points"])
Data <- data.frame(Wine[hasPoints,c("points","alcohol","vintage","price", "color")])
apply(Data,2,function(x) sum(is.na(x)))

# Special fix for color to avoid have 3 "colors" (Red, White, Missing)
head(Data$color)
ifelse(head(Data$color)=="Red",1,0)
Data$color <- ifelse(Data$color=="Red", 1, 0)
head(Data$color)
# Still missing cases of color, but now will treat as numerical
apply(Data,2,function(x) sum(is.na(x)))

Data <- fill.missing(Data)
apply(Data,2,function(x) sum(is.na(x)))				# more columns, none missing
dim(Data)										# 20329 x 9

# Add the proportions and lengths
Data <- data.frame(Data, lengths=lengths[hasPoints], props[hasPoints,])
dim(Data)										# 20329 x 2654
colnames(Data)[c(1:15)]
round(Data[1:10,10:20],2)						# lengths, then proportions


# --- split into test and train data (make harder by saving a lot for testing!)
set.seed(38)
test  <- sample(1:nrow(Data), 5000)
train <- (1:nrow(Data))[-test]


# --- regression highly significant without words
others  <- "alcohol+vintage+price+lengths+color+Miss.alcohol+Miss.vintage+Miss.price+Miss.color"

model  <- as.formula(paste("points ~",others,collapse=""))
model
summary(lr <- lm(model, data=Data[train,]))			# about 1/3 of variation

# --- fit on common words instead has about same explained variation, and gets
#	 better and better as continue to add features.
n.words <- 20 									# 20 then 40
colnames(oovFreq)[1:n.words]
(words  <- paste(colnames(oovFreq)[1:n.words],collapse="+"))
model  <- as.formula(paste("points ~",words,collapse=""))
model
summary(lr <- lm(model, data=Data[train,]))

# --- Much better with length (since others are proportions)
words <- str_c("lengths+",words)
model  <- as.formula(paste("points ~ ",words,collapse=""))
summary(lr <- lm(model, data=Data[train,]))

# --- Better still with the other factors (collinear, so subadditive)
model <- as.formula(paste("points ~",others,"+",words,collapse=""))
summary(lr <- lm(model, data=Data[train,]))

# --- Keep adding more words and the fit improves
#     Problem: R has problems manipulating formulas longer than 500 characters
#	 Soln: build the data frame for the regression differently.
n.words <- 80
colnames(Data)[1:10]				# variables that are *not* word proportions
RegrData <- Data[,1:(9+n.words)]		
summary(lr <- lm("points ~ .", data=RegrData[train,]))

# --- predict points in test... meet claimed fit?
ols.pred <- predict(lr, newdata=RegrData[test,])
olspred[1:10]

cor(olspred, Data[test,"points"])^2		# R^2 out of sample
sd( ols.pred-Data[test,"points"] )			# SD of prediction errors out of sample

# --- calibration
plot(ols.pred, Data[test,"points"], col='gray', xlab="Estimated Points", ylab="True Points")
o <- order(ols.pred)
x <- pred[o]
y <- Data[test[o],"points"]
r <- loess(y ~ x)
lines(x,fitted(r),col='red'); abline(a=0,b=1,col='gray')


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# --- Lasso for feature selection points example
require(glmnet)
?glmnet

Y <- Data[,"points"]					# glmnet wants explict X, Y -- not formula

#		first model with 500 words as in logistic  (then hundreds)
n.words <- 500
colnames(Data)[1:10]						# variables that are *not* word proportions
X <- as.matrix(Data[,2:(10+n.words)]	)	# Note 2 leaves out the Y variable	
dim(X)								# 20329 x 89    (9 + n.words)
lasso.regr <- glmnet(X[train,], Y[train], family='gaussian', alpha=1) # alpha=1 means lasso
plot(lasso.regr)

#		inspect coefficients
names(lasso.regr)
 
predict(lasso.regr, s=0, exact=T,type="coefficients")	# coefficients, s = lambda

predict(lasso.regr,s=.010,exact=T,type="coefficients")	# some zeroed out
predict(lasso.regr,s=.400,exact=T,type="coefficients")	# more zeroed
predict(lasso.regr,s=.800,exact=T,type="coefficients")	# only price and length
predict(lasso.regr,s=.650,exact=T,type="coefficients")	# some words appear

#		use 10-fold CV to pick tuning parameter lambda (aka, s)
#		chooses lots of terms
set.seed(6)									
cv.out<-cv.glmnet(X[train,], Y[train], family='gaussian', alpha=1)
	plot(cv.out)
	(bestlam<-cv.out$lambda.min)
	(bestlam<-cv.out$lambda.1se)	  # work with the relatively sparse model
(c<-predict(lasso.regr,s=bestlam,exact=T,type="coefficients"))
(sum(as.vector(c) != 0))

# Predictive?
lasso.pred <- predict(lasso.regr, s=bestlam, newx=X[test,])

cor(lasso.pred, Data[test,"points"])^2		# R^2 out of sample
sd( lasso.pred-Data[test,"points"] )		# SD of prediction errors out of sample

# calibrated?
lasso.pred <- predict(lasso.regr, s=bestlam, newx=X[test,])
plot(lasso.pred, Data[test,"points"], col='gray', xlab="Estimated Points", ylab="Actual Points")
o <- order(lasso.pred)
x <- lasso.pred[o]
y <- Data[test[o],"points"]
r <- loess(y ~ x)
lines(x,fitted(r),col='red'); abline(a=0,b=1,col='gray')

# --- compare to ls estimates
mat <- cbind(Data[test,"points"], ols.pred, lasso.pred)
colnames(mat) <- c("Actual", "OLS", "Lasso")
pairs(mat)

# --- look at coefficients (cloud)
require(wordcloud)
require(stringr)
c <- predict(lasso.regr, s=bestlam, exact=T, type="coefficients")
typeof(c)
str(c)
attributes(c)
coef_cloud <- function(coefs, color=F) {
	v <- as.vector(coefs)	
	nm <- attributes(coefs)$Dimnames[[1]]
	wordcloud(nm[-1], abs(v[-1])/max(abs(v[-1])) * 100, random.color=color)
}
coef_cloud(c, TRUE)