#--------------------------------------------------------------------------- # # Bigrams and word embedding # #--------------------------------------------------------------------------- require (tm) require (stringr) setwd("~/courses/mich/text_analytics/") source("R_scripts/text_utils.R") load("R_scripts/Wine.Rdata"); dim(Wine) # 20508 x 10 load("R_scripts/oovFreq.Rdata"); dim(oovFreq) # 20508 x 2645 ############################################################################ compute_bigram <- function(corpus, vocab) { m <- length(vocab) B <- matrix(0, nrow=m, ncol=m) colnames(B) <- rownames(B) <- vocab n <- length(corpus) for(i in 1:n) { wrds <- str_split(corpus[i]," ")[[1]] indx <- match(wrds, vocab) for(j in 2:length(indx)) B[indx[j-1],indx[j]] <- B[indx[j-1],indx[j]]+1 } return(B) } corpus <- c("the quick brown fox", "the brown rabbit dug a hole", " brown dog dug") vocab <- sort(c("the", "quick", "brown", "fox", "rabbit", "dug", "a", "hole", "dog")) compute_bigram(corpus, vocab) # --- build bigram matrix (some remove stopwords as well or retain punct) corpus <- Corpus(VectorSource(Wine$description)) corpus[[1]]$content corpus <- tm_map(corpus, content_transformer(tolower)) corpus[[1]]$content corpus <- tm_map(corpus, content_transformer(removeNumbers)) corpus <- tm_map(corpus, content_transformer(removePunctuation)) corpus[[1]]$content DTM <- DocumentTermMatrix(corpus) # Note: might want to remove rare words/code OOV dim(DTM) # see methods in wine_data.R for OOV subst vocab <- colnames(DTM) typeFreq <- colSums(as.matrix(DTM)) o <- order(typeFreq, decreasing=T) # put common terms first typeFreq <- typeFreq[o] vocab <- vocab[o] B <- compute_bigram(corpus, vocab) dim(B) # 6527 x 6527 B[1:6,1:6] # --- compute SVD via random projection after scaling B <- B/sqrt(typeFreq) B <- t( t(B)/sqrt(typeFreq) ) types <- rownames(B) udv <- random_projection_svd(B, 50, power.iter=2) plot(udv$d) dim(udv$u) plot(udv$u[,1], udv$u[,2], col='gray') size <- sqrt(rowSums(udv$u[,1:2]^2)); big <- size>0.1 text(udv$u[big,1], udv$u[big,2], types[big], cex=0.7) j1 <- 3; j2 <- 6 plot(udv$u[,j1], udv$u[,j2], col='gray') size <- sqrt(rowSums(udv$u[,c(j1,j2)]^2)); big <- size>0.1 text(udv$u[big,j1], udv$u[big,j2], types[big], cex=0.7) draw_u <- function(j1,j2) { plot(udv$u[,j1], udv$u[,j2], col='gray', xlab=substitute("U"[j1], list(j1=j1)), ylab=substitute("U"[j2], list(j2=j2))) size <- sqrt(rowSums(udv$u[,c(j1,j2)]^2)); big <- size>0.1 text(udv$u[big,j1], udv$u[big,j2], types[big], cex=0.7) } draw_u(j1,j2)