#--------------------------------------------------------------------------- # # Regression problems # #--------------------------------------------------------------------------- # install.packages("tm" , dependencies=T) # install.packages("leaps", dependencies=T) # if want to use stepwise # install.packages("glmnet" , dependencies=T) # install.packages("wordcloud", dependencies=T) require (tm) setwd("~/courses/mich/text_analytics/") source("R_scripts/text_utils.R") source("R_scripts/missing_data.R") # fill.missing load("R_scripts/Wine.Rdata") ; dim(Wine) load("R_scripts/oovFreq.Rdata") ; dim(oovFreq) # common first, 20508 2645 #--------------------------------------------------------------------------- # # Predicting wine rating # #--------------------------------------------------------------------------- # --- build unified data frame for modeling head(colSums(oovFreq)) oovFreq[1:5,1:10] lengths<- rowSums(oovFreq) # number tokens in each document props <- oovFreq / lengths # or sqrt length (Poisson) rowSums(props[1:4,]) # each row sums to 1 dim(props) # 20508 2645 # Fill missing data before join to text or include response sum(is.na(Wine[,"points"])) # missing 179 hasPoints <- !is.na(Wine[,"points"]) Data <- data.frame(Wine[hasPoints,c("points","alcohol","vintage","price", "color")]) apply(Data,2,function(x) sum(is.na(x))) # Special fix for color to avoid have 3 "colors" (Red, White, Missing) head(Data$color) ifelse(head(Data$color)=="Red",1,0) Data$color <- ifelse(Data$color=="Red", 1, 0) head(Data$color) # Still missing cases of color, but now will treat as numerical apply(Data,2,function(x) sum(is.na(x))) Data <- fill.missing(Data) apply(Data,2,function(x) sum(is.na(x))) # more columns, none missing dim(Data) # 20329 x 9 # Add the proportions and lengths Data <- data.frame(Data, lengths=lengths[hasPoints], props[hasPoints,]) dim(Data) # 20329 x 2654 colnames(Data)[c(1:15)] round(Data[1:10,10:20],2) # lengths, then proportions # --- split into test and train data (make harder by saving a lot for testing!) set.seed(38) test <- sample(1:nrow(Data), 5000) train <- (1:nrow(Data))[-test] # --- regression highly significant without words others <- "alcohol+vintage+price+lengths+color+Miss.alcohol+Miss.vintage+Miss.price+Miss.color" model <- as.formula(paste("points ~",others,collapse="")) model summary(lr <- lm(model, data=Data[train,])) # about 1/3 of variation # --- fit on common words instead has about same explained variation, and gets # better and better as continue to add features. n.words <- 20 # 20 then 40 colnames(oovFreq)[1:n.words] (words <- paste(colnames(oovFreq)[1:n.words],collapse="+")) model <- as.formula(paste("points ~",words,collapse="")) model summary(lr <- lm(model, data=Data[train,])) # --- Much better with length (since others are proportions) words <- str_c("lengths+",words) model <- as.formula(paste("points ~ ",words,collapse="")) summary(lr <- lm(model, data=Data[train,])) # --- Better still with the other factors (collinear, so subadditive) model <- as.formula(paste("points ~",others,"+",words,collapse="")) summary(lr <- lm(model, data=Data[train,])) # --- Keep adding more words and the fit improves # Problem: R has problems manipulating formulas longer than 500 characters # Soln: build the data frame for the regression differently. n.words <- 80 colnames(Data)[1:10] # variables that are *not* word proportions RegrData <- Data[,1:(9+n.words)] summary(lr <- lm("points ~ .", data=RegrData[train,])) # --- predict points in test... meet claimed fit? ols.pred <- predict(lr, newdata=RegrData[test,]) olspred[1:10] cor(olspred, Data[test,"points"])^2 # R^2 out of sample sd( ols.pred-Data[test,"points"] ) # SD of prediction errors out of sample # --- calibration plot(ols.pred, Data[test,"points"], col='gray', xlab="Estimated Points", ylab="True Points") o <- order(ols.pred) x <- pred[o] y <- Data[test[o],"points"] r <- loess(y ~ x) lines(x,fitted(r),col='red'); abline(a=0,b=1,col='gray') # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # --- Lasso for feature selection points example require(glmnet) ?glmnet Y <- Data[,"points"] # glmnet wants explict X, Y -- not formula # first model with 500 words as in logistic (then hundreds) n.words <- 500 colnames(Data)[1:10] # variables that are *not* word proportions X <- as.matrix(Data[,2:(10+n.words)] ) # Note 2 leaves out the Y variable dim(X) # 20329 x 89 (9 + n.words) lasso.regr <- glmnet(X[train,], Y[train], family='gaussian', alpha=1) # alpha=1 means lasso plot(lasso.regr) # inspect coefficients names(lasso.regr) predict(lasso.regr, s=0, exact=T,type="coefficients") # coefficients, s = lambda predict(lasso.regr,s=.010,exact=T,type="coefficients") # some zeroed out predict(lasso.regr,s=.400,exact=T,type="coefficients") # more zeroed predict(lasso.regr,s=.800,exact=T,type="coefficients") # only price and length predict(lasso.regr,s=.650,exact=T,type="coefficients") # some words appear # use 10-fold CV to pick tuning parameter lambda (aka, s) # chooses lots of terms set.seed(6) cv.out<-cv.glmnet(X[train,], Y[train], family='gaussian', alpha=1) plot(cv.out) (bestlam<-cv.out$lambda.min) (bestlam<-cv.out$lambda.1se) # work with the relatively sparse model (c<-predict(lasso.regr,s=bestlam,exact=T,type="coefficients")) (sum(as.vector(c) != 0)) # Predictive? lasso.pred <- predict(lasso.regr, s=bestlam, newx=X[test,]) cor(lasso.pred, Data[test,"points"])^2 # R^2 out of sample sd( lasso.pred-Data[test,"points"] ) # SD of prediction errors out of sample # calibrated? lasso.pred <- predict(lasso.regr, s=bestlam, newx=X[test,]) plot(lasso.pred, Data[test,"points"], col='gray', xlab="Estimated Points", ylab="Actual Points") o <- order(lasso.pred) x <- lasso.pred[o] y <- Data[test[o],"points"] r <- loess(y ~ x) lines(x,fitted(r),col='red'); abline(a=0,b=1,col='gray') # --- compare to ls estimates mat <- cbind(Data[test,"points"], ols.pred, lasso.pred) colnames(mat) <- c("Actual", "OLS", "Lasso") pairs(mat) # --- look at coefficients (cloud) require(wordcloud) require(stringr) c <- predict(lasso.regr, s=bestlam, exact=T, type="coefficients") typeof(c) str(c) attributes(c) coef_cloud <- function(coefs, color=F) { v <- as.vector(coefs) nm <- attributes(coefs)$Dimnames[[1]] wordcloud(nm[-1], abs(v[-1])/max(abs(v[-1])) * 100, random.color=color) } coef_cloud(c, TRUE)