#---------------------------------------------------------------------------
#
#	Bigrams and word embedding
#
#---------------------------------------------------------------------------
require (tm)
require (stringr)

setwd("~/courses/mich/text_analytics/")
source("R_scripts/text_utils.R")


load("R_scripts/Wine.Rdata"); dim(Wine)	# 20508 x 10
load("R_scripts/oovFreq.Rdata"); dim(oovFreq)	# 20508 x 2645

############################################################################

compute_bigram <- function(corpus, vocab) {
	m <- length(vocab)
	B <- matrix(0, nrow=m, ncol=m)
	colnames(B) <- rownames(B) <- vocab
	n <- length(corpus)
	for(i in 1:n) {
		wrds <- str_split(corpus[i]," ")[[1]]
		indx <- match(wrds, vocab)
		for(j in 2:length(indx)) B[indx[j-1],indx[j]] <- B[indx[j-1],indx[j]]+1
	}
	return(B)
}

corpus <- c("the quick brown fox", "the brown rabbit dug a hole", " brown dog dug")
vocab  <- sort(c("the", "quick", "brown", "fox", "rabbit", "dug", "a", "hole", "dog"))
compute_bigram(corpus, vocab)


# --- build bigram matrix  (some remove stopwords as well or retain punct)

corpus <- Corpus(VectorSource(Wine$description))
	corpus[[1]]$content

corpus <- tm_map(corpus, content_transformer(tolower))
	corpus[[1]]$content

corpus <- tm_map(corpus, content_transformer(removeNumbers))

corpus <- tm_map(corpus, content_transformer(removePunctuation))
	corpus[[1]]$content

DTM <- DocumentTermMatrix(corpus)		# Note: might want to remove rare words/code OOV
dim(DTM)									#		see methods in wine_data.R for OOV subst

vocab <- colnames(DTM)
typeFreq <- colSums(as.matrix(DTM))

o <- order(typeFreq, decreasing=T)		# put common terms first
typeFreq  <- typeFreq[o]
vocab <- vocab[o]

B <- compute_bigram(corpus, vocab)
dim(B)		# 6527 x 6527
B[1:6,1:6]


# --- compute SVD via random projection after scaling

B <- B/sqrt(typeFreq)
B <- t( t(B)/sqrt(typeFreq) )
types <- rownames(B)

udv <- random_projection_svd(B, 50, power.iter=2)
	plot(udv$d)

	dim(udv$u)
	
	plot(udv$u[,1], udv$u[,2], col='gray')
	size <- sqrt(rowSums(udv$u[,1:2]^2)); big <- size>0.1
	text(udv$u[big,1], udv$u[big,2], types[big], cex=0.7)

	j1 <- 3; j2 <- 6
	
	plot(udv$u[,j1], udv$u[,j2], col='gray')
	size <- sqrt(rowSums(udv$u[,c(j1,j2)]^2)); big <- size>0.1
	text(udv$u[big,j1], udv$u[big,j2], types[big], cex=0.7)


draw_u <- function(j1,j2) {
	plot(udv$u[,j1], udv$u[,j2], col='gray', 
		xlab=substitute("U"[j1], list(j1=j1)), ylab=substitute("U"[j2], list(j2=j2)))
	size <- sqrt(rowSums(udv$u[,c(j1,j2)]^2)); big <- size>0.1
	text(udv$u[big,j1], udv$u[big,j2], types[big], cex=0.7)
	}

	draw_u(j1,j2)