##################################################################################
#
# (Extracted from topic_models_3.R)
#
# This version uses the Blei-McAuliffe sLDA model.
#
# Given α and K topics with word distributions β_{1:K}, regression coefficients η,
# and variance σ2, generate a document as follows:
#   1. Draw topic proportions θ | α ∼ Dir(α).
#   2. For each word w_n:
#      (a) Draw topic assignment z_n | θ ∼ Mult(θ); z_n indicates the topic.
#      (b) Draw word w_n | z_n, β_{1:K} ∼ Mult(β_{z_n}).
#   3. Draw the response variable
#        y | z_{1:N}, η, σ2 ∼ N(η'z-bar, σ2),
#      where z-bar is the average of the z's for the document: the average topic
#      assignment, i.e. the topic composition of all words in the document
#      (proportions of topics).  The regressors are empirical rather than
#      expected topic proportions.
#   (A minimal single-document sketch of these steps appears below, after the
#   helper functions.)
#
# For more information see:
#   topicmodels: An R Package for Fitting Topic Models (JSS paper)
#
##################################################################################

setwd("~/courses/mich/text_analytics/")

source("R_scripts/text_utils.R")

# --- preliminary functions

rdirichlet <- function(a) {
    y <- rgamma(length(a), a, 1)
    return(y / sum(y))
}

round(rdirichlet(rep( .04, 10)), 3)   # small alpha: very concentrated in one category
round(rdirichlet(rep( .40, 10)), 3)
round(rdirichlet(rep(4.00, 10)), 3)   # large alpha: more equally distributed

# --- examples of entropy (a measure of uncertainty)

entropy <- function(p) {
    p <- p[p > 0]
    p <- p / sum(p)
    return(-sum(p * log(p)))
}

entropy(c(1, 0, 0, 0))           # no entropy, no uncertainty
entropy(c(.5, 0, .5, 0))
entropy(c(.25, .25, .25, .25))   # high entropy, high uncertainty
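# --- a minimal sketch of the generative steps described at the top of the file, for a
#     single toy document (K.toy, V.toy, beta.toy, eta.toy, sigma.toy are made-up
#     illustrative values and are not used anywhere else in this script)
K.toy     <- 3                                 # topics
V.toy     <- 20                                # vocabulary size
beta.toy  <- t(sapply(1:K.toy, function(k) rdirichlet(rep(0.1, V.toy))))      # topic word dists
theta.toy <- rdirichlet(rep(0.4, K.toy))       # 1.  topic proportions θ | α ∼ Dir(α)
N.toy     <- 50                                # words in this document
z.toy     <- sample(1:K.toy, N.toy, replace=TRUE, prob=theta.toy)             # 2a. topic assignments
w.toy     <- sapply(z.toy, function(k) sample(1:V.toy, 1, prob=beta.toy[k,])) # 2b. words
zbar.toy  <- tabulate(z.toy, nbins=K.toy) / N.toy    # empirical topic proportions (z-bar)
eta.toy   <- c(1, 0, -1);  sigma.toy <- 0.5    # regression coefficients and sd (illustrative)
y.toy     <- rnorm(1, sum(eta.toy * zbar.toy), sigma.toy)    # 3.  y ∼ N(η'z-bar, σ2)
c(y=y.toy, zbar.toy)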
##################################################################################
#
# P holds the topic distributions over words (rows).
# Sort P and Z (observed counts) to put the largest first.
#
##################################################################################
#
# Simulate data from a topic model: explore the word counts in the documents.
# An integer represents each unique word type.
#
# The topic model works with K = 5 topics, alpha.P = 0.050, avg len = 250;
# not so clear with alpha.P = 0.100.
#
n.outlier.words <- 0       # reserved for outlier listing (optional)
n.vocab         <- 1000    # total vocabulary
K               <- 5       # number of topics
P <- matrix(0, nrow=K, ncol=n.vocab-n.outlier.words)   # dist over words for each topic; pad with 0

# --- generate and plot topic distributions
alpha.P <- 0.10            # small alpha implies high skew, less overlap [ 0.05 0.10 ]
set.seed(6382)
for(i in 1:K) P[i,] <- rdirichlet(rep(alpha.P, n.vocab-n.outlier.words))
P <- P[, order(colSums(P), decreasing=TRUE)]           # sort so types with big total prob come first
P <- cbind(P, matrix(0, nrow(P), n.outlier.words))
round(P[,1:5], 3)

plot(P[1,], xlab="Vocabulary", ylab="Probability")     # weights on specific words
plot(sqrt(P[1,]), sqrt(P[2,]),                         # disjoint if alpha = 0.01, some common if 0.1
     xlab=expression("P"[1]), ylab=expression("P"[2]))

# --- label low-entropy words with the appropriate topic (used later in plots)
ent     <- apply(P, 2, entropy)
low.ent <- ent < 0.5                                   # low entropy -> predictable
wordTypeTopic          <- rep("Mix", ncol(P))
wordTypeTopic[low.ent] <- paste("Topic ", apply(P[,low.ent], 2, which.max))
# check the assignment
plot(ent, xlab="word types", ylab="entropy")
abline(h=0.5, col='gray')
wordTypeTopic[1:5]
round(P[,1:5], 2)

# --- generate samples from a mixture of topics
alpha.T <- rep(0.4, K)                 # mix of topics within documents
n       <- 5000                        # documents
theta   <- matrix(0, nrow=n, ncol=K)   # expected topic mix in each document
Z       <- matrix(0, nrow=n, ncol=K)   # realized number of words from each topic
avg.len <- 250                         # average document length
set.seed(67983)
doc.len <- sort(rpois(n, avg.len), decreasing=TRUE)
for(i in 1:n) {
    theta[i,] <- rdirichlet(alpha.T)
    Z[i,]     <- as.vector(rmultinom(1, doc.len[i], theta[i,]))
}
# plot the mix of the K topics in one document
plot(theta[1,], xlab="Topic", ylab="Share of Vocabulary", main="Topic Mix for One Document")

# --- generate the document/term matrix W and its expected value
W    <- matrix(0, nrow=n, ncol=n.vocab)
W.ev <- W                              # expected value of W
for(i in 1:n) {
    W.ev[i,] <- doc.len[i] * theta[i,] %*% P    # prob distribution over words for each doc
    for(k in 1:K) if(Z[i,k] > 0) W[i,] <- W[i,] + rmultinom(1, Z[i,k], P[k,])
}

# --- check that the W counts are centered on their means
plot(rowSums(W.ev), rowSums(W))        # perfect correlation: both row sums equal the Poisson doc lengths

# --- Zipf?  Right initial shape, but not nearly as steep as needed (|slope| < 1)
freq      <- apply(W, 2, sum)
sort.freq <- sort(freq[freq>0], decreasing=TRUE)
lf <- log(sort.freq);  lr <- log(1:length(sort.freq))
plot(sort.freq, log="xy", xlab="Word Rank", ylab="Frequency")
lf <- lf[1:100];  lr <- lr[1:100]
regr <- lm(lf ~ lr);  coefficients(summary(regr));  lines(exp(predict(regr)), col="red")
cat(sum(freq==0), " unused words\n")

##################################################################################
#
# Analyze the term/document counts from the topic model
#
##################################################################################
dim(W)

# --- marginals (leave the zeros in place for outlier insertion/handling)
fj <- colSums(W)
ni <- rowSums(W)                       # = doc.len

# --- LSA analysis, raw counts
# udv <- svd(W); U <- udv$u; V <- udv$v
# plot(udv$d[1:100], log="xy", xlab="Component", ylab="Singular Value",
#      main="Raw Frequencies")

# --- LSA analysis, CCA scaled
#     a gap in the singular values is evident... but this will be a problem later
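# --- the CCA scaling below divides each count by sqrt(row total x column total),
#     i.e. W.cca = D_n^{-1/2} W D_f^{-1/2}.  A compact equivalent, shown only as a
#     check (W.cca.check is just a name for this sketch; zero marginals map to 1,
#     matching recip.sqrt below):
W.cca.check <- W * outer(ifelse(ni > 0, 1/sqrt(ni), 1),
                         ifelse(fj > 0, 1/sqrt(fj), 1))
# all.equal(W.cca.check, W.cca)         # should be TRUE once W.cca is computed below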
recip.sqrt <- function(n) { if(n > 0) return(1/sqrt(n)) else return(1) }

W.cca <- sapply(ni, recip.sqrt) * W
W.cca <- t( sapply(fj, recip.sqrt) * t(W.cca) )

udv.cca <- svd(W.cca)
U.cca   <- udv.cca$u
V.cca   <- udv.cca$v
plot(udv.cca$d[1:100], log="xy", xlab="Component", ylab="Singular Value",
     main="CCA Normalization")

# --- leading singular vectors u_1, v_1 are determined by word frequency
# par(mfrow=c(1,2))
# plot(fj, udv$v[,1], xlab=expression("f"["j"]), ylab=expression("V"[1]))
# plot(ni, udv$u[,1], xlab=expression("n"["i"]), ylab=expression("U"[1]))
# reset()

# CCA produces a 'mathematical' fit for the FIRST component U and its loadings V
par(mfrow=c(1,2))
plot(sqrt(ni), -udv.cca$u[,1], xlab=expression(sqrt("n"["i"])), ylab=expression("U"[1]))
plot(sqrt(fj), -udv.cca$v[,1], xlab=expression(sqrt("f"["j"])), ylab=expression("V"[1]))
reset()

# --- structure of the singular vectors; loadings highlight those with large overall magnitude
#     CCA components have less 'pointy' loadings;
#     the topics are quite evident in the CCA-weighted LSA analysis
j <- 2;  k <- j+1
# plot(V[,j], V[,k], xlab=paste0("V_raw(",j,")"), ylab=paste0("V_raw(",k,")"))
plot(V.cca[,j], V.cca[,k], xlab=paste0("V_cca(",j,")"), ylab=paste0("V_cca(",k,")"))
length(V.cca[,j])

rownames(V.cca) <- wordTypeTopic
plot_loadings(V.cca, 2, 3, 0.07, cex=0.7)
plot_loadings(V.cca, 4, 5, 0.07, cex=0.7)

##################################################################################
#
# topicmodels package: first run using the simulated data from above
#
##################################################################################
require(tm)
require(stringr)
require(topicmodels)

# --- convert the integer counts stored in the simulation above into strings of "words"
dim(W)
min(rowSums(W))
sum(0 == colSums(W))

# convert integers to strings, keeping each token at least 3 characters long for tm;
# word type j is coded as the "word" 100+j
rep(1:3, c(1,0,2))                     # how rep() expands counts into repeated values
num_to_text <- function(x) {
    paste(as.character(rep(101:(100+length(x)), x)), collapse=" ")
}

sum(W[1,])                             # number of words in the first doc
W[1,1:10]
w <- num_to_text(W[1,])
w
length(str_split(w, " ")[[1]])         # number of words in the first doc

phonyText <- apply(W, 1, num_to_text)
length(phonyText)

# --- convert to a corpus, and then to a DTM
corpus <- Corpus(VectorSource(phonyText))
DTM    <- DocumentTermMatrix(corpus, control=list(minWordLength = 3, removeNumbers = FALSE))
DTM
sum(colSums(W) > 0)                    # matches the term count in the DTM
sum(W > 0)                             # matches the non-zero entry count in the DTM

# --- run the LDA code (latent Dirichlet allocation)
?LDA
n.topics <- 5
lda <- LDA(DTM, method="VEM", n.topics, control = list(seed = 3746))
lda
names(lda)
typeof(lda)                            # "S4": an S4 class of object
names(attributes(lda))
plot(lda)                              # ???
summary(lda)                           # Hmm?

attributes(lda)$class
attributes(lda)$control
attributes(lda)$alpha                  # close to the simulated value?
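# the documents were simulated with alpha.T = rep(0.4, K), so the fitted alpha above can be
# compared with 0.4 directly; slot() reads the same S4 slot (it is used again further below)
slot(lda, "alpha")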
attributes(lda)$k                      # number of topics
attributes(lda)$terms                  # word types
dim(attributes(lda)$beta)              # prob dist over word types, P (not normalized)
dim(attributes(lda)$gamma)             # mix of topics over documents
attributes(lda)$gamma[1:3,]            # for the first 3 documents
round(theta[1:3,], 4)                  # actual simulated topic mix for those documents
rowSums(attributes(lda)$gamma[1:3,])   # sum to 1
dim(attributes(lda)$wordassignments)   # reproduction of the DTM
attributes(lda)$wordassignments
attributes(lda)$Dim                    # dimensions of the DTM
attributes(lda)$n                      # number of tokens
length(attributes(lda)$loglikelihood)  # one for each document
attributes(lda)$iter

top <- t(topics(lda, 2));  top[1:10,]  # most likely topics for each document (rows)
round(theta[1:10,], 2)                 # true topic mix (may be a permutation, so see the next step)
ter <- terms(lda, 20);  ter            # most likely terms for each topic???

# --- look up the actual word/topic associations
#     subtract 100 to index into the vector of word types
wordTypeTopic[1:10]
as.numeric(ter[,1]) - 100              # "Topic 1" is Topic 5
wordTypeTopic[as.numeric(ter[,1]) - 100]
wordTypeTopic[as.numeric(ter[,2]) - 100]   # "Topic 2" is Topic 3

##################################################################################
#
# topicmodels: run using the example data that comes with the package
#
##################################################################################
data("AssociatedPress", package = "topicmodels")
dim(AssociatedPress)
str(AssociatedPress)

lda     <- LDA(AssociatedPress[1:20,], method="VEM", control = list(alpha = 0.1), k = 2)
lda_inf <- posterior(lda, AssociatedPress[21:30,])

##################################################################################
#
# topicmodels: run using the wine tasting data
#
##################################################################################
setwd("~/courses/mich/text_analytics/")

load("R_scripts/oovDTM.Rdata")
oovDTM                                 # documents: 20508, terms: 2645

# --- run the LDA for a varying number of topics
n.topics <- 5
lda <- LDA(oovDTM, method="VEM", n.topics, control = list(seed = 3746))

slot(lda, "alpha")                     # large values indicate diffuse topics
attributes(lda)$alpha

top <- topics(lda, 2)                  # find the top two topics for each document
dim(top)
top[,1:15]

ter <- terms(lda, 20)                  # most likely terms for each topic
dim(ter)
ter[,1:2]

logLik(lda)
#   k =  2   'log Lik.' -3942118 (df=5319)
#   k =  5   'log Lik.' -3943946 (df=13296)
#   k = 10   'log Lik.' -3945909 (df=26591)
#   k = 15   'log Lik.' -3947288 (df=39886)

posterior(lda)
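# --- the log-likelihoods listed above were evidently obtained by refitting with different
#     numbers of topics; a small loop in the same spirit (a sketch only: each refit of the
#     full wine DTM is slow, so it is left commented out)
# for(k in c(2, 5, 10, 15)) {
#     fit <- LDA(oovDTM, k, method="VEM", control = list(seed = 3746))
#     cat("k =", k, " logLik =", logLik(fit), " df =", attr(logLik(fit), "df"), "\n")
# }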