##################################################################################
#
# (Extracted from topic_models_3.R)
#
# This version uses the Blei-McAuliffe sLDA model.
#
# Given α and K topics with word distributions β_{1:K}, regression coefficients η,
# and variance σ2, generate a document as follows:
#   1. Draw topic proportions θ | α ∼ Dir(α).
#   2. For each word w_n:
#      (a) Draw topic assignment z_n | θ ∼ Mult(θ); z_n indicates the topic.
#      (b) Draw word w_n | z_n, β_{1:K} ∼ Mult(β_{z_n}).
#   3. Draw the response variable
#        y | z_{1:N}, η, σ2 ∼ N(η'z-bar, σ2),
#      where z-bar is the average of the z's for the document: the average topic
#      assignment, i.e. the topic composition of all words in the document
#      (proportions of topics).  The regressors are empirical rather than
#      expected topic proportions.
#   (A minimal single-document sketch of these steps appears below, after the
#   helper functions.)
#
# For more information see:
#   topicmodels: An R Package for Fitting Topic Models (JSS paper)
#
##################################################################################

setwd("~/courses/mich/text_analytics/")

source("R_scripts/text_utils.R")

# --- preliminary functions

rdirichlet <- function(a) {
    y <- rgamma(length(a), a, 1)
    return(y / sum(y))
}

round(rdirichlet(rep( .04, 10)), 3)   # small alpha: very concentrated in one category
round(rdirichlet(rep( .40, 10)), 3)
round(rdirichlet(rep(4.00, 10)), 3)   # large alpha: more equally distributed

# --- examples of entropy (a measure of uncertainty)

entropy <- function(p) {
    p <- p[p > 0]
    p <- p / sum(p)
    return(-sum(p * log(p)))
}

entropy(c(1, 0, 0, 0))           # no entropy, no uncertainty
entropy(c(.5, 0, .5, 0))
entropy(c(.25, .25, .25, .25))   # high entropy, high uncertainty
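# --- a minimal sketch of the generative steps described at the top of the file, for a
#     single toy document (K.toy, V.toy, beta.toy, eta.toy, sigma.toy are made-up
#     illustrative values and are not used anywhere else in this script)
K.toy     <- 3                                 # topics
V.toy     <- 20                                # vocabulary size
beta.toy  <- t(sapply(1:K.toy, function(k) rdirichlet(rep(0.1, V.toy))))      # topic word dists
theta.toy <- rdirichlet(rep(0.4, K.toy))       # 1.  topic proportions θ | α ∼ Dir(α)
N.toy     <- 50                                # words in this document
z.toy     <- sample(1:K.toy, N.toy, replace=TRUE, prob=theta.toy)             # 2a. topic assignments
w.toy     <- sapply(z.toy, function(k) sample(1:V.toy, 1, prob=beta.toy[k,])) # 2b. words
zbar.toy  <- tabulate(z.toy, nbins=K.toy) / N.toy    # empirical topic proportions (z-bar)
eta.toy   <- c(1, 0, -1);  sigma.toy <- 0.5    # regression coefficients and sd (illustrative)
y.toy     <- rnorm(1, sum(eta.toy * zbar.toy), sigma.toy)    # 3.  y ∼ N(η'z-bar, σ2)
c(y=y.toy, zbar.toy)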
##################################################################################
#
# P holds the topic distributions over words (rows).
# Sort P and Z (observed counts) to put the largest first.
#
##################################################################################
#
# Simulate data from a topic model: explore the word counts in the documents.
# An integer represents each unique word type.
#
# The topic model works with K = 5 topics, alpha.P = 0.050, avg len = 250;
# not so clear with alpha.P = 0.100.
#
n.outlier.words <- 0       # reserved for outlier listing (optional)
n.vocab         <- 1000    # total vocabulary
K               <- 5       # number of topics
P <- matrix(0, nrow=K, ncol=n.vocab-n.outlier.words)   # dist over words for each topic; pad with 0

# --- generate and plot topic distributions
alpha.P <- 0.10            # small alpha implies high skew, less overlap [ 0.05 0.10 ]
set.seed(6382)
for(i in 1:K) P[i,] <- rdirichlet(rep(alpha.P, n.vocab-n.outlier.words))
P <- P[, order(colSums(P), decreasing=TRUE)]           # sort so types with big total prob come first
P <- cbind(P, matrix(0, nrow(P), n.outlier.words))
round(P[,1:5], 3)

plot(P[1,], xlab="Vocabulary", ylab="Probability")     # weights on specific words
plot(sqrt(P[1,]), sqrt(P[2,]),                         # disjoint if alpha = 0.01, some common if 0.1
     xlab=expression("P"[1]), ylab=expression("P"[2]))

# --- label low-entropy words with the appropriate topic (used later in plots)
ent     <- apply(P, 2, entropy)
low.ent <- ent < 0.5                                   # low entropy -> predictable
wordTypeTopic          <- rep("Mix", ncol(P))
wordTypeTopic[low.ent] <- paste("Topic ", apply(P[,low.ent], 2, which.max))
# check the assignment
plot(ent, xlab="word types", ylab="entropy")
abline(h=0.5, col='gray')
wordTypeTopic[1:5]
round(P[,1:5], 2)

# --- generate samples from a mixture of topics
alpha.T <- rep(0.4, K)                 # mix of topics within documents
n       <- 5000                        # documents
theta   <- matrix(0, nrow=n, ncol=K)   # expected topic mix in each document
Z       <- matrix(0, nrow=n, ncol=K)   # realized number of words from each topic
avg.len <- 250                         # average document length
set.seed(67983)
doc.len <- sort(rpois(n, avg.len), decreasing=TRUE)
for(i in 1:n) {
    theta[i,] <- rdirichlet(alpha.T)
    Z[i,]     <- as.vector(rmultinom(1, doc.len[i], theta[i,]))
}
# plot the mix of the K topics in one document
plot(theta[1,], xlab="Topic", ylab="Share of Vocabulary", main="Topic Mix for One Document")

# --- generate the document/term matrix W and its expected value
W    <- matrix(0, nrow=n, ncol=n.vocab)
W.ev <- W                              # expected value of W
for(i in 1:n) {
    W.ev[i,] <- doc.len[i] * theta[i,] %*% P    # prob distribution over words for each doc
    for(k in 1:K) if(Z[i,k] > 0) W[i,] <- W[i,] + rmultinom(1, Z[i,k], P[k,])
}

# --- check that the W counts are centered on their means
plot(rowSums(W.ev), rowSums(W))        # perfect correlation: both row sums equal the Poisson doc lengths

# --- Zipf?  Right initial shape, but not nearly as steep as needed (|slope| < 1)
freq      <- apply(W, 2, sum)
sort.freq <- sort(freq[freq>0], decreasing=TRUE)
lf <- log(sort.freq);  lr <- log(1:length(sort.freq))
plot(sort.freq, log="xy", xlab="Word Rank", ylab="Frequency")
lf <- lf[1:100];  lr <- lr[1:100]
regr <- lm(lf ~ lr);  coefficients(summary(regr));  lines(exp(predict(regr)), col="red")
cat(sum(freq==0), " unused words\n")

##################################################################################
#
# Analyze the term/document counts from the topic model
#
##################################################################################
dim(W)

# --- marginals (leave the zeros in place for outlier insertion/handling)
fj <- colSums(W)
ni <- rowSums(W)                       # = doc.len

# --- LSA analysis, raw counts
# udv <- svd(W); U <- udv$u; V <- udv$v
# plot(udv$d[1:100], log="xy", xlab="Component", ylab="Singular Value",
#      main="Raw Frequencies")

# --- LSA analysis, CCA scaled
#     a gap in the singular values is evident... but this will be a problem later
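# --- the CCA scaling below divides each count by sqrt(row total x column total),
#     i.e. W.cca = D_n^{-1/2} W D_f^{-1/2}.  A compact equivalent, shown only as a
#     check (W.cca.check is just a name for this sketch; zero marginals map to 1,
#     matching recip.sqrt below):
W.cca.check <- W * outer(ifelse(ni > 0, 1/sqrt(ni), 1),
                         ifelse(fj > 0, 1/sqrt(fj), 1))
# all.equal(W.cca.check, W.cca)         # should be TRUE once W.cca is computed below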
recip.sqrt <- function(n) { if(n > 0) return(1/sqrt(n)) else return(1) }

W.cca <- sapply(ni, recip.sqrt) * W
W.cca <- t( sapply(fj, recip.sqrt) * t(W.cca) )

udv.cca <- svd(W.cca)
U.cca   <- udv.cca$u
V.cca   <- udv.cca$v
plot(udv.cca$d[1:100], log="xy", xlab="Component", ylab="Singular Value",
     main="CCA Normalization")

# --- leading singular vectors u_1, v_1 are determined by word frequency
# par(mfrow=c(1,2))
# plot(fj, udv$v[,1], xlab=expression("f"["j"]), ylab=expression("V"[1]))
# plot(ni, udv$u[,1], xlab=expression("n"["i"]), ylab=expression("U"[1]))
# reset()

# CCA produces a 'mathematical' fit for the FIRST component U and its loadings V
par(mfrow=c(1,2))
plot(sqrt(ni), -udv.cca$u[,1], xlab=expression(sqrt("n"["i"])), ylab=expression("U"[1]))
plot(sqrt(fj), -udv.cca$v[,1], xlab=expression(sqrt("f"["j"])), ylab=expression("V"[1]))
reset()

# --- structure of the singular vectors; loadings highlight those with large overall magnitude
#     CCA components have less 'pointy' loadings;
#     the topics are quite evident in the CCA-weighted LSA analysis
j <- 2;  k <- j+1
# plot(V[,j], V[,k], xlab=paste0("V_raw(",j,")"), ylab=paste0("V_raw(",k,")"))
plot(V.cca[,j], V.cca[,k], xlab=paste0("V_cca(",j,")"), ylab=paste0("V_cca(",k,")"))
length(V.cca[,j])

rownames(V.cca) <- wordTypeTopic
plot_loadings(V.cca, 2, 3, 0.07, cex=0.7)
plot_loadings(V.cca, 4, 5, 0.07, cex=0.7)

##################################################################################
#
# topicmodels package: first run using the simulated data from above
#
##################################################################################
require(tm)
require(stringr)
require(topicmodels)

# --- convert the integer counts stored in the simulation above into strings of "words"
dim(W)
min(rowSums(W))
sum(0 == colSums(W))

# convert integers to strings, keeping each token at least 3 characters long for tm;
# word type j is coded as the "word" 100+j
rep(1:3, c(1,0,2))                     # how rep() expands counts into repeated values
num_to_text <- function(x) {
    paste(as.character(rep(101:(100+length(x)), x)), collapse=" ")
}

sum(W[1,])                             # number of words in the first doc
W[1,1:10]
w <- num_to_text(W[1,])
w
length(str_split(w, " ")[[1]])         # number of words in the first doc

phonyText <- apply(W, 1, num_to_text)
length(phonyText)

# --- convert to a corpus, and then to a DTM
corpus <- Corpus(VectorSource(phonyText))
DTM    <- DocumentTermMatrix(corpus, control=list(minWordLength = 3, removeNumbers = FALSE))
DTM
sum(colSums(W) > 0)                    # matches the term count in the DTM
sum(W > 0)                             # matches the non-zero entry count in the DTM

# --- run the LDA code (latent Dirichlet allocation)
?LDA
n.topics <- 5
lda <- LDA(DTM, method="VEM", n.topics, control = list(seed = 3746))
lda
names(lda)
typeof(lda)                            # "S4": an S4 class of object
names(attributes(lda))
plot(lda)                              # ???
summary(lda)                           # Hmm?

attributes(lda)$class
attributes(lda)$control
attributes(lda)$alpha                  # close to the simulated value?
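# the documents were simulated with alpha.T = rep(0.4, K), so the fitted alpha above can be
# compared with 0.4 directly; slot() reads the same S4 slot (it is used again further below)
slot(lda, "alpha")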
attributes(lda)$k                      # number of topics
attributes(lda)$terms                  # word types
dim(attributes(lda)$beta)              # prob dist over word types, P (not normalized)
dim(attributes(lda)$gamma)             # mix of topics over documents
attributes(lda)$gamma[1:3,]            # for the first 3 documents
round(theta[1:3,], 4)                  # actual simulated topic mix for those documents
rowSums(attributes(lda)$gamma[1:3,])   # sum to 1
dim(attributes(lda)$wordassignments)   # reproduction of the DTM
attributes(lda)$wordassignments
attributes(lda)$Dim                    # dimensions of the DTM
attributes(lda)$n                      # number of tokens
length(attributes(lda)$loglikelihood)  # one for each document
attributes(lda)$iter

top <- t(topics(lda, 2));  top[1:10,]  # most likely topics for each document (rows)
round(theta[1:10,], 2)                 # true topic mix (may be a permutation, so see the next step)
ter <- terms(lda, 20);  ter            # most likely terms for each topic???

# --- look up the actual word/topic associations
#     subtract 100 to index into the vector of word types
wordTypeTopic[1:10]
as.numeric(ter[,1]) - 100              # "Topic 1" is Topic 5
wordTypeTopic[as.numeric(ter[,1]) - 100]
wordTypeTopic[as.numeric(ter[,2]) - 100]   # "Topic 2" is Topic 3

##################################################################################
#
# topicmodels: run using the example data that comes with the package
#
##################################################################################
data("AssociatedPress", package = "topicmodels")
dim(AssociatedPress)
str(AssociatedPress)

lda     <- LDA(AssociatedPress[1:20,], method="VEM", control = list(alpha = 0.1), k = 2)
lda_inf <- posterior(lda, AssociatedPress[21:30,])

##################################################################################
#
# topicmodels: run using the wine tasting data
#
##################################################################################
setwd("~/courses/mich/text_analytics/")

load("R_scripts/oovDTM.Rdata")
oovDTM                                 # documents: 20508, terms: 2645

# --- run the LDA for a varying number of topics
n.topics <- 5
lda <- LDA(oovDTM, method="VEM", n.topics, control = list(seed = 3746))

slot(lda, "alpha")                     # large values indicate diffuse topics
attributes(lda)$alpha

top <- topics(lda, 2)                  # find the top two topics for each document
dim(top)
top[,1:15]

ter <- terms(lda, 20)                  # most likely terms for each topic
dim(ter)
ter[,1:2]

logLik(lda)
#   k =  2   'log Lik.' -3942118 (df=5319)
#   k =  5   'log Lik.' -3943946 (df=13296)
#   k = 10   'log Lik.' -3945909 (df=26591)
#   k = 15   'log Lik.' -3947288 (df=39886)

posterior(lda)
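# --- the log-likelihoods listed above were evidently obtained by refitting with different
#     numbers of topics; a small loop in the same spirit (a sketch only: each refit of the
#     full wine DTM is slow, so it is left commented out)
# for(k in c(2, 5, 10, 15)) {
#     fit <- LDA(oovDTM, k, method="VEM", control = list(seed = 3746))
#     cat("k =", k, " logLik =", logLik(fit), " df =", attr(logLik(fit), "df"), "\n")
# }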