#---------------------------------------------------------------------------
#
# Spectral methods
#
#---------------------------------------------------------------------------

# install.packages("tm"     , dependencies=T)
# install.packages("gmodels", dependencies=T)   # CrossTable
# install.packages("AUC"    , dependencies=T)
# install.packages("tree"   , dependencies=T)
# install.packages("lsa"    , dependencies=T)   # latent semantic analysis (will not use)
# install.packages("glmnet" , dependencies=T)

setwd("~/courses/mich/text_analytics/")

source("R_scripts/text_utils.R")
source("R_scripts/missing_data.R")

load("R_scripts/Wine.Rdata")   ; dim(Wine)      # 20508    13
load("R_scripts/oovFreq.Rdata"); dim(oovFreq)   # 20508  2659

mean.na <- function(x) mean(x, na.rm=TRUE)

#---------------------------------------------------------------------------
#
# LSA analysis of wines
#
#---------------------------------------------------------------------------
# require(lsa)   # might as well roll our own

# --- Try the SVD on a sample of documents and see if R can handle it;
#     3000 rows is easily doable, just not very fast.

set.seed(37337)
i <- i.subset <- sort(sample(1:nrow(oovFreq), 3000))   # save those indices

m <- colSums(oovFreq[i,])   # some word types do not appear in the sample
sum(0 == m)

C <- oovFreq[i, m>0]; dim(C)   # 3000 x 2250
Wine.subset <- Wine[i,]        # matching subset of the full data set

# raw counts
udv <- svd(C)
plot(udv$d[1:400], log="",
     main="Spectrum of Raw Frequencies", xlab="Dimension", ylab="Singular Value")

# CCA scaling: divide each count by the square roots of its row and column totals
ni <- rowSums(C)
mj <- colSums(C)
C.cca <- C/sqrt(ni)
C.cca <- t( t(C.cca)/sqrt(mj) )
udv.cca <- svd(C.cca)
udv.cca$d[1:4]
plot(udv.cca$d[1:50], log="",
     main="Spectrum with CCA Scaling", xlab="Dimension", ylab="Singular Value")

# log tf-idf scaling
C.tfidf <- log(1+C)
df <- colSums(C>0)   # document frequency of each word type
C.tfidf <- t( t(C.tfidf) * log(1+nrow(C)/df) )
udv.tfidf <- svd(C.tfidf)
plot(udv.tfidf$d[1:600], log="xy",
     main="Spectrum with tf-idf Scaling", xlab="Dimension", ylab="Singular Value")

# probability scaling: each row becomes a distribution over word types
n <- rowSums(C)
C.prob <- C/n
rowSums(C.prob)[1:4]
udv.prob <- svd(C.prob)
plot(udv.prob$d[1:600], log="xy",
     main="Spectrum with Probability Scaling", xlab="Dimension", ylab="Singular Value")
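
# Aside (not part of the original analysis): a self-contained sanity check that
# the elementwise CCA recipe used above -- dividing each count by the square
# roots of its row and column totals -- matches the matrix form
# D_r^{-1/2} %*% C %*% D_c^{-1/2}.  Toy data only; nothing here feeds later code.
set.seed(1)
toy <- matrix(rpois(12, lambda=3) + 1, nrow=3)    # small, strictly positive count matrix
rs  <- rowSums(toy); cs <- colSums(toy)
elementwise <- t( t(toy/sqrt(rs)) / sqrt(cs) )    # same idiom as C.cca above
matrixform  <- diag(1/sqrt(rs)) %*% toy %*% diag(1/sqrt(cs))
all.equal(elementwise, matrixform)                # TRUE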
# --- What are the singular vectors?
len <- rowSums(C)
par(mfrow=c(1,2))
  plot(len, udv.cca$u[,1],   xlab="Number of Tokens", ylab="First Singular Vector", main="CCA")
  plot(len, udv.tfidf$u[,1], xlab="Number of Tokens", ylab="First Singular Vector", main="TF-IDF")
reset()

plot(len, udv.prob$u[,1], xlab="Number of Tokens", ylab="First Singular Vector", main="Prob")

plot(udv.cca$u[,2], udv.prob$u[,2])

# --- Look at other singular vectors
set.seed(253)
ii <- sample(1:nrow(C), 500)   # fewer points in the plot
pairs(udv.cca$u[ii,2:5],
      labels=c(expression("U"[2]), expression("U"[3]),   # subscripts in the plot labels
               expression("U"[4]), expression("U"[5])) )

color <- Wine.subset$color[ii]
col <- rep('white', length(color))
col[color=="Red"]   <- 'red'
col[color=="White"] <- 'gold'
pairs(udv.cca$u[ii,2:5],
      labels=c(expression("U"[2]), expression("U"[3]),
               expression("U"[4]), expression("U"[5])),
      col=col )

# identify color?  2526 out of 3000 have known color
Wine.subset$U2 <- udv.cca$u[,2]   # add the components to the data frame
Wine.subset$U3 <- udv.cca$u[,3]
Wine.subset$U4 <- udv.cca$u[,4]
Wine.subset$U5 <- udv.cca$u[,5]

# boxplots indicate we can use these to discriminate red from white
par(mfrow=c(1,2))
  boxplot(U4 ~ color, data=Wine.subset, ylim=c(min(Wine.subset$U4), 0.10), ylab="U4")
  abline(h=0, col='gray')
  boxplot(U5 ~ color, data=Wine.subset, ylim=c(min(Wine.subset$U5), 0.10), ylab="U5")
  abline(h=0, col='gray')
reset()

(tab <- table(Wine.subset$color, Wine.subset$U4<0 & Wine.subset$U5>0))

# see if logistic regression does better
Wine.subset$y <- Wine.subset$color == 'Red'
knowColor <- !is.na(Wine.subset$color)
summary(lr <- glm(y ~ U4 + U5, data=Wine.subset, subset=knowColor, family='binomial'))
fit <- predict(lr, type='response')
(tab <- table(Wine.subset$color[knowColor], fit<0.37))

# --- show words for leading components
V <- udv.cca$v[,1:25]; rownames(V) <- colnames(C)

plot(V[,2], V[,3], xlab=expression("V"[2]), ylab=expression("V"[3]), col='gray')
label <- 0.10 < sqrt(V[,2]^2 + V[,3]^2)
text(V[label,2], V[label,3], rownames(V)[label], cex=0.7, srt=45)

# as a function (plot_loadings is defined in text_utils.R)
plot_loadings(V, 2, 3, 0.08, cex=0.5)   # v2 ... v3 nose
plot_loadings(V, 4, 5, 0.08, cex=0.5)   # flavors
plot_loadings(V, 6, 7, 0.08, cex=0.5)   # flavors
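
# plot_loadings comes from text_utils.R.  If that file is not at hand, a minimal
# stand-in consistent with how it is called here (matrix, two column indices, a
# labeling threshold, extra args passed through to text) might look like this;
# the real definition may differ:
plot_loadings_sketch <- function(V, j1, j2, threshold, ...) {
  plot(V[,j1], V[,j2], col='gray',
       xlab=substitute("V"[j], list(j=j1)), ylab=substitute("V"[j], list(j=j2)))
  label <- threshold < sqrt(V[,j1]^2 + V[,j2]^2)   # label only the big loadings
  text(V[label,j1], V[label,j2], rownames(V)[label], srt=45, ...)
}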
# --- What are the clusters?
col <- rep('white', length(Wine.subset$color))   # placeholder
col[Wine.subset$color=="Red"]   <- 'red'
col[Wine.subset$color=="White"] <- 'gold'

# three evident clusters
plot(Wine.subset$U2, Wine.subset$U3,
     xlab=expression("U"[2]), ylab=expression("U"[3]), col=col)

par(mfrow=c(2,1))
  plot(i.subset, Wine.subset$U2, xlab="File Position", ylab=expression("U"[2]), col=col)
  plot(i.subset, Wine.subset$U3, xlab="File Position", ylab=expression("U"[3]), col=col)
reset()

# look at the text for the clusters
Wine$description[i.subset[Wine.subset$U2 >  0.01]][1:5]
Wine$description[i.subset[Wine.subset$U2 <  0.02 & Wine.subset$U3 >  0.05]][1:5]
Wine$description[i.subset[Wine.subset$U2 <  0.02 & Wine.subset$U3 < -0.02]][1:5]

# look at 2 more components... what happened?
plot(Wine.subset$U4, Wine.subset$U5, xlab=expression("U"[4]), ylab=expression("U"[5]))

par(mfrow=c(2,1))
  plot(i.subset, Wine.subset$U4, xlab="File Position", ylab=expression("U"[4]), col=col)
  plot(i.subset, Wine.subset$U5, xlab="File Position", ylab=expression("U"[5]), col=col)
reset()

Wine$description[i.subset[Wine.subset$U4 > 0.05 & Wine.subset$U5 > 0.05]][1:5]

# zoom in
par(mfrow=c(2,1))
  plot(i.subset, Wine.subset$U4, xlab="File Position", ylab=expression("U"[4]), xlim=c(9000,9600))
  abline(h=0, col='gray')
  plot(i.subset, Wine.subset$U5, xlab="File Position", ylab=expression("U"[5]), xlim=c(9000,9600))
  abline(h=0, col='gray')
reset()

Wine$description[i.subset[which.max(Wine.subset$U4)] + c(-1,0,1)]   # 9437
Wine$description[i.subset[which.max(Wine.subset$U5)] + c(-1,0,1)]   # 9360

# return to the source text to see what's happening...
Wine[9150:9200, "description"]
Wine[9150:9280, "date"]

#---------------------------------------------------------------------------
#
# LSA analysis of wines: foody period
#
#---------------------------------------------------------------------------

set.seed(2562)
i <- i.subset2 <- 12500:15500                # start with 3001 reviews
# big <- 1892; i <- i.subset2 <- i[-big]     # reduces from 3001 to 3000 reviews
# big <- 2149; i <- i.subset2 <- i[-big]     # now 2999

m <- colSums(oovFreq[i,])
sum(0 == m)

C <- oovFreq[i, m>0]; dim(C)
Wine.subset <- Wine[i,]

Wine.subset$description[sample(1:3001, 20)]

# raw counts
udv <- svd(C)
plot(udv$d[1:600], log="xy",
     main="Spectrum of Raw Frequencies", xlab="Dimension", ylab="Singular Value")

# CCA scaling
ni <- rowSums(C)
mj <- colSums(C)
C.cca <- C/sqrt(ni)
C.cca <- t( t(C.cca)/sqrt(mj) )
udv.cca <- svd(C.cca); udv.cca$d[1:4]
plot(udv.cca$d[1:600], log="xy",
     main="Spectrum with CCA Scaling", xlab="Dimension", ylab="Singular Value")

# length and first component
ni <- rowSums(C)
plot(ni, udv.cca$u[,1], xlab="Number of Tokens", ylab="First Singular Vector", main="CCA")

# --- leading singular vectors
U <- udv.cca$u[,1:20]
pairs(U[,2:5],
      labels=c(expression("U"[2]), expression("U"[3]),
               expression("U"[4]), expression("U"[5])) )

# --- more outliers?
length(big <- which(U[,3] < -0.5)); U[big,5]
big                            # 1392
Wine.subset$description[big]   # Yuck!  Let's remove this one and see what happens.

# --- another outlier (watch out for the <- inside the call)
length(big <- which(U[,5] > 0.4)); U[big,5]
big                            # 2798
Wine.subset$description[big]   # unusual words, but not out of character

# --- examine a few more singular vectors ... another big straggler
pairs(U[,6:9],
      labels=c(expression("U"[6]), expression("U"[7]),
               expression("U"[8]), expression("U"[9])) )

# --- several big ones
length(big <- which(U[,7] < -0.4 | U[,8] < -0.4)); U[big,6:8]
big
Wine.subset$description[big]   # Yuck!  Remove these too?

# dependent?  (use a copy so we don't clobber U, which is needed again below)
U5 <- udv.cca$u[,1:5]; round(t(U5) %*% U5, 5)
round(cor(U5))

# --- which words load on the components
V <- udv.cca$v[,1:25]; rownames(V) <- colnames(C)

j <- 3; hist(V[,j], breaks=50, main=substitute("Loadings V"[j], list(j=j)))
boxplot(V[,j], add=TRUE, horizontal=TRUE, at=0)
ii <- order(abs(V[,j]), decreasing=TRUE)[1:40]
V[ii[1:15], j]
text(V[ii[1:15],j], 25, rownames(V)[ii[1:15]], cex=0.7, srt=45)

colSums(C[,ii[1:15]])   # common, or used in just a few reviews?

# --- show words for leading components
V <- udv.cca$v[,1:25]; rownames(V) <- colnames(C)

plot(V[,2], V[,3], xlab=expression("V"[2]), ylab=expression("V"[3]), col='gray')
label <- 0.07 < sqrt(V[,2]^2 + V[,3]^2)
text(V[label,2], V[label,3], rownames(V)[label], cex=0.7, srt=45)

# as a function
plot_loadings(V,  2,  3, 0.08)
plot_loadings(V,  4,  5, 0.08)
sort(V[,5])[1:4]
plot_loadings(V,  6,  7, 0.08)
plot_loadings(V,  8,  9, 0.08)
plot_loadings(V, 10, 11, 0.08)
plot_loadings(V, 12, 13, 0.07)
plot_loadings(V, 14, 15, 0.07)
plot_loadings(V, 16, 17, 0.07)
plot_loadings(V, 18, 19, 0.07)
plot_loadings(V, 20, 21, 0.07)
plot_loadings(V, 22, 23, 0.07)
plot_loadings(V, 24, 25, 0.07)

# words
V <- udv.cca$v[,1:20]
plot(V[,7], V[,6], xlab=expression("V"[7]), ylab=expression("V"[6]), col='gray')
label <- 0.1 < sqrt(V[,7]^2 + V[,6]^2)
text(V[label,7], V[label,6], colnames(C)[label], cex=0.7, srt=45)

sum(big <- U[,9] < -0.4); U[big,9]
Wine.subset$description[big]

v9 <- udv.cca$v[,9]
hist(v9, breaks=50)
boxplot(v9, add=TRUE, horizontal=TRUE, at=0)
j <- order(abs(v9), decreasing=TRUE)[1:40]
v9[j[1:15]]
colSums(C[,j[1:15]])   # are they common words, or used in just a few reviews?
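
# Aside: the order(abs(...)) idiom above recurs whenever we ask which words
# drive a component.  A small hypothetical helper (not from text_utils.R)
# wrapping that idiom:
top_loading_words <- function(V, j, k=15) {
  ii <- order(abs(V[,j]), decreasing=TRUE)[1:k]   # k largest |loadings| on component j
  V[ii, j]                                        # named vector: word -> loading
}
V <- udv.cca$v[,1:25]; rownames(V) <- colnames(C)
top_loading_words(V, 9)                           # should echo the v9 inspection above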
#####################################################################################
#
#   Random projection alternative
#
#####################################################################################

# --- return to the original problem; first compute the SVD as in the original example
set.seed(37337)
i <- i.subset <- sort(sample(1:nrow(oovFreq), 3000))
m <- colSums(oovFreq[i,])      # some word types not in the sample
sum(0 == m)
C <- oovFreq[i, m>0]; dim(C)   # 3000 x 2250

udv <- svd(C)
plot(udv$d[1:100], ylab="Singular Values", main="Exact")   # most energy in the leading few

# --- compare to random projection (code in text_utils.R)
udv.rp <- random_projection_svd(C, 100, verbose=TRUE)
dim(udv.rp$u)
plot(udv.rp$d, ylab="Singular Values", main="Random Projection")

# compare the resulting singular values
x <- udv.rp$d; y <- udv$d[1:100]
plot(x, y, xlab="Singular Values, Random Projection", ylab="Singular Values")
abline( r <- lm(y~x) ); summary(r)$r.squared

# recovery of subspaces (impact of power iterations)
?cancor
cca <- cancor(udv$u[,1:50], udv.rp$u[,1:50])
names(cca)
plot(cca$cor[1:25], ylim=c(0.925,1), ylab="Canonical Correlation")

# role of power iterations (power.iter=1 by default)
udv.rp0 <- random_projection_svd(C, 100, power.iter=0, verbose=TRUE)
udv.rp2 <- random_projection_svd(C, 100, power.iter=2, verbose=TRUE)
cca.0 <- cancor(udv$u[,1:50], udv.rp0$u[,1:50])
points(cca.0$cor[1:25], pch=19, col='gray')
cca.2 <- cancor(udv$u[,1:50], udv.rp2$u[,1:50])
points(cca.2$cor[1:25], pch=19, col='red')

# --- build the SVD of the full data set; use it to model color and points
hasColor <- !is.na(Wine[,"color"]); sum(hasColor)

Data <- data.frame(Wine[hasColor, c("color","alcohol","vintage","price","points")])
Data <- fill.missing(Data)
dim(Data)   # 17336 x 9
colnames(Data)

set.seed(38)
test  <- sample(1:nrow(Data), 10000)
train <- (1:nrow(Data))[-test]

# do the random projection *before* the train/test split... transductive inference
C  <- oovFreq[hasColor,]
mj <- colSums(C)
C  <- C[, mj>0]
mj <- colSums(C)
ni <- rowSums(C)
C  <- C/sqrt(ni)
C  <- t( t(C)/sqrt(mj) )
dim(C)   # 17336 x 2601
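
# random_projection_svd also lives in text_utils.R.  A minimal sketch of the
# randomized SVD it implements (random test matrix, optional power iterations,
# QR, then an exact SVD of the small matrix -- in the style of Halko,
# Martinsson & Tropp 2011), written with the same assumed signature; the real
# version may differ in details:
random_projection_svd_sketch <- function(C, k, power.iter=1, verbose=FALSE) {
  C <- as.matrix(C)
  Omega <- matrix(rnorm(ncol(C)*k), ncol=k)   # random test matrix
  Y <- C %*% Omega                            # sample the column space of C
  for (p in seq_len(power.iter))              # power iterations sharpen the spectrum
    Y <- C %*% (t(C) %*% Y)
  Q <- qr.Q(qr(Y))                            # orthonormal basis, n x k
  if (verbose) cat("projected to", ncol(Q), "columns\n")
  B <- t(Q) %*% C                             # k x m; small enough for an exact svd
  udv <- svd(B)
  list(u=Q %*% udv$u, d=udv$d, v=udv$v)       # approximate SVD of C
}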
s <- rowSums(C)   # check for degenerate rows
any(is.na(s))
s[1:20]

udv.rp <- random_projection_svd(C, 200, power.iter=2, verbose=TRUE)
plot(udv.rp$d[1:50], log="",
     main="RP Spectrum with CCA Scaling", xlab="Dimension", ylab="RP Singular Value")

Data <- data.frame(Data, U=udv.rp$u)   # 17336 x 207
colnames(Data)

# look at the principal components
plot(Data[,"U.2"])

# --- Fit a regression for points
#     10 components get R^2 up to 60%; 50 gets 64%; 100 gets 65%
sum(Data[,"Miss.points"])

nDim <- 100
vars <- paste(paste("U", 1:nDim, sep="."), collapse="+")
formula <- as.formula(paste("points ~", vars))
regr <- lm(formula, data=Data[train,])
summary(regr)

# --- use the lasso with all columns from the data frame
require(glmnet)

Y <- Data[,"color"]                             # glmnet uses explicit X, Y, not a formula
model <- as.formula(" ~ .")                     # everything
X <- model.matrix(model, data=Data[,-1])[,-1]   # drop Y from the data, drop the intercept
dim(X)   # 17336 x 206

lasso.regr <- glmnet(X[train,], Y[train], family='binomial', alpha=1)
plot(lasso.regr)

# inspect the coefficients
names(lasso.regr)

coef <- predict(lasso.regr, s=0.001, exact=TRUE, type="coefficients")   # some zero; s is the penalty
c <- as.vector(coef); m <- matrix(c, ncol=1); rownames(m) <- attributes(coef)$Dimnames[[1]]
as.matrix(m[c!=0,])

coef <- predict(lasso.regr, s=0.010, exact=TRUE, type="coefficients")   # most zero; larger penalty
c <- as.vector(coef); m <- matrix(c, ncol=1); rownames(m) <- attributes(coef)$Dimnames[[1]]
as.matrix(m[c!=0,])

coef <- predict(lasso.regr, s=0.100, exact=TRUE, type="coefficients")   # all but U4, U6 are 0
c <- as.vector(coef); m <- matrix(c, ncol=1); rownames(m) <- attributes(coef)$Dimnames[[1]]
as.matrix(m[c!=0,])

# use 10-fold CV to pick the tuning parameter lambda (aka s)
set.seed(6)
cv.out <- cv.glmnet(X[train,], Y[train], family='binomial', alpha=1)
plot(cv.out)
(cv.out$lambda.min)
(bestlam <- cv.out$lambda.1se)

# work with the relatively sparse model
coef <- predict(lasso.regr, s=bestlam, exact=TRUE, type="coefficients")
c <- as.vector(coef); m <- matrix(c, ncol=1); rownames(m) <- attributes(coef)$Dimnames[[1]]
as.matrix(m[c!=0,])
sum(as.vector(c) != 0)   # many are not 0

# high probabilities... calibrated?
lasso.pred <- predict(lasso.regr, s=bestlam, newx=X[test,], type="response")
y01 <- ifelse(Data[test,"color"]=="White", 1, 0)   # glmnet models Pr of the second factor level, White
plot(lasso.pred, y01, col='gray', xlab="Estimated Pr(White)", ylab="True Color")
o <- order(lasso.pred)
x <- lasso.pred[o]
y <- y01[o]
r <- lm(y ~ poly(x,6))
lines(x, fitted(r), col='red'); abline(a=0, b=1, col='gray')
abline(h=0.5, col='gray', lty=3)
i <- which(0.80 < x & x < 0.85); length(i)
points(mean(x[i]), mean(y[i]))

# ROC
require(AUC)
tmp <- roc(lasso.pred, as.factor(Data[test,"color"]))
auc(tmp)   # 0.995
plot(tmp)

table(Data[test,"color"], lasso.pred>0.5)
confusion_summary(Data[test,"color"]=="White", lasso.pred>0.5)
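
# confusion_summary is defined in text_utils.R.  A stand-in based only on how it
# is called above (logical truth, logical prediction) -- assumed behavior, the
# real helper may report more:
confusion_summary_sketch <- function(truth, pred) {
  tab <- table(truth, pred)                 # 2 x 2 confusion table
  acc <- mean(truth == pred, na.rm=TRUE)    # overall agreement rate
  list(table=tab, accuracy=acc)
}
# confusion_summary_sketch(Data[test,"color"]=="White", lasso.pred>0.5)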