### ANES text

# tm supplies VCorpus, tm_map, DocumentTermMatrix and the other corpus helpers
# used below; load it before anything else runs.
library(tm)

# Shrink plot margins and set the panel layout.
#   mfrow: c(rows, cols) of the plotting grid; defaults to a single panel.
# Returns whatever par() returns for the settings it changed.
reset <- function(mfrow = c(1, 1)) {
  par(mfrow = mfrow, mgp = c(2, 1, 0), mar = c(3, 3, 2, 1)) # bottom left top right
}
reset()

# --- read text, each line has a document (text of person's response)
Text <- read.table("~/data/text/anes/joined.txt", header = TRUE)

# --- copy from array, trim (gsub is also useful)
# Base-R trimws() keeps the script free of a stringr dependency; as.character()
# guards against the column having been read as a factor.
name <- "Pelosi"
text <- trimws(as.character(Text[, name]))

# --- define corpus
(corpus <- VCorpus(VectorSource(text)))
inspect(corpus[1:2])
meta(corpus[[2]])
meta(corpus[[2]], "id")

# --- do some processing (downcase, remove punctuation and digits -- optional)
# content_transformer maintains data types for R functions like tolower
corpus <- tm_map(corpus, content_transformer(tolower))
inspect(corpus[1:2])
corpus <- tm_map(corpus, removePunctuation)
inspect(corpus[1:2])
# corpus <- tm_map(corpus, removeNumbers)
# corpus <- tm_map(corpus, removeWords, stopwords("english"))
# corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)
inspect(corpus[1:2])

# --- construct document/type matrix
# use dictionary to restrict to certain words
dtm <- DocumentTermMatrix(corpus)
dim(dtm)
inspect(dtm[1:3, 1:10])

# --- Zipf distribution
freq <- colSums(as.matrix(dtm))
hist(freq, breaks = 100)
freq <- sort(freq, decreasing = TRUE)
# log rank vs. log frequency; seq_along() is safe even if freq is empty
lx <- log(seq_along(freq))
lf <- log(freq)
plot(lx, lf, xlab = "log rank", ylab = "log frequency")
regr <- lm(lf ~ lx)
abline(regr, col = "red")
summary(regr)

# --- find types that occur more often
findFreqTerms(dtm, 5)

# --- remove those that appear in few documents (max allowed sparsity)
dtm.sparse <- removeSparseTerms(dtm, 0.99)
dim(dtm.sparse)

# --- R trick
# A vector multiplied into a matrix is recycled down the columns, so a
# length-nrow vector scales rows; to scale columns, transpose first.
(m <- matrix(1:12, nrow = 4))
(1:4) * m       # scales rows
m * (1:3)       # does not scale columns
t(t(m) * 1:3)   # multiplies columns

# --- svd of dtm (normalize?)
# Dense count matrix: rows are documents, columns are the terms kept in
# dtm.sparse.
X <- as.matrix(dtm.sparse)

# --- weights (optional)
# Map a vector of totals to 1 / sqrt(total), using weight 0 where the total is
# 0. Documents left with no terms after removeSparseTerms() have a row sum of
# 0; an unguarded 1 / sqrt(0) * 0 is NaN, and svd() refuses matrices that
# contain non-finite values.
inv_sqrt_weight <- function(totals) {
  ifelse(totals > 0, 1 / sqrt(totals), 0)
}

# A length-nrow vector recycles down the columns, so this scales each row.
X <- inv_sqrt_weight(rowSums(X)) * X
# Column totals are taken after the row scaling; transpose so the length-ncol
# vector lines up with columns, then transpose back.
X <- t(t(X) * inv_sqrt_weight(colSums(X)))

udv <- svd(X)
names(udv)

# --- plot singular values
plot(udv$d)

# --- plot principal components
# Show at most the first four left singular vectors.
n_show <- min(4L, ncol(udv$u))
pairs(udv$u[, seq_len(n_show), drop = FALSE])

# --- labels of main components
# Terms with the largest absolute weight on the second right singular vector;
# head() avoids NA indices when fewer than 20 terms remain.
j <- head(order(abs(udv$v[, 2]), decreasing = TRUE), 20L)
data.frame(term = colnames(dtm.sparse)[j], v2 = udv$v[j, 2])