### ANES text

# shrink plot margins
# Set the panel layout and tighten the figure margins of the active device.
#   mfrow: c(rows, cols) of the plotting grid; defaults to a single panel.
# The value of the par() call is returned unchanged.
reset <- function (mfrow=c(1,1)) {
  tight <- list(
    mfrow = mfrow,
    mgp   = c(2, 1, 0),
    mar   = c(3, 3, 2, 1)   # bottom, left, top, right
  )
  par(tight)
}
reset()

# --- read text, each line has a document (text of person's response)
# NOTE(review): read.table's defaults treat ' and " as quote characters and
# '#' as a comment character; confirm joined.txt parses as intended for
# free-text responses.
Text <- read.table("~/data/text/anes/joined.txt", header=TRUE)

# --- copy one column from the table and trim surrounding whitespace
#     (gsub is also useful for further clean-up).  Base trimws() is used
#     because the visible script never loads a package providing str_trim().
name <- "Pelosi"
stopifnot(name %in% names(Text))   # fail early on a misspelled column name
text <- trimws(as.character(Text[[name]]))

# --- load the tm library and define corpus
library(tm)

# Build a corpus from the character vector `text`; the outer parentheses
# make the assignment print its value at top level.
(corpus <- VCorpus(VectorSource(text)))
inspect(corpus[1:2])
meta(corpus[[2]])
meta(corpus[[2]], tag = "id")

# --- clean-up passes (downcase, strip punctuation; digits etc. are optional)
#     content_transformer() wraps a plain R function such as tolower so the
#     documents keep their data type.
corpus <- tm_map(corpus, content_transformer(tolower))
inspect(corpus[1:2])

corpus <- tm_map(corpus, removePunctuation)
inspect(corpus[1:2])

# Optional passes, disabled by default:
# corpus <- tm_map(corpus, removeNumbers)
# corpus <- tm_map(corpus, removeWords, stopwords("english") )
# corpus <- tm_map(corpus, stemDocument)

# Run last so that gaps left by the earlier passes are collapsed.
corpus <- tm_map(corpus, stripWhitespace)
inspect(corpus[1:2])

# --- construct document/type matrix
#     (a dictionary can be used to restrict to certain words)
dtm <- DocumentTermMatrix(corpus)
dim(dtm)
inspect(dtm[1:3, 1:10])

# --- Zipf distribution: total count of each type over all documents
# NOTE(review): as.matrix() converts the whole matrix to a dense one;
# presumably fine for this corpus, confirm if the vocabulary grows.
freq <- colSums(as.matrix(dtm))
hist(freq, breaks = 100)

# Order types from most to least frequent and compute the logs once, so the
# scatterplot and the regression are guaranteed to use the same values.
freq <- sort(freq, decreasing = TRUE)
lx <- log(seq_along(freq))   # log rank
lf <- log(freq)              # log frequency
plot(lx, lf, xlab = "log(rank)", ylab = "log(frequency)")

# Least-squares line on the log-log scale, drawn over the scatterplot.
regr <- lm(lf ~ lx)
abline(regr, col = "red")
summary(regr)

# --- list the types that occur at least 5 times
findFreqTerms(dtm, lowfreq = 5)

# --- drop types that appear in few documents (0.99 = max allowed sparsity)
dtm.sparse <- removeSparseTerms(dtm, sparse = 0.99)
dim(dtm.sparse)

# --- R trick: a vector multiplied with a matrix is recycled down the columns
(m <- matrix(1:12, nrow = 4, ncol = 3))
(1:4) * m          # length nrow(m): element i multiplies row i
m * (1:3)          # length ncol(m): still recycled down columns, so this
                   # does NOT scale the columns
t(t(m) * (1:3))    # transpose, scale rows, transpose back: scales columns

# --- svd of dtm (normalize?)
X <- as.matrix(dtm.sparse)

# --- weights (optional): scale each row, then each column, by one over the
#     square root of its total.  A document left with no terms has a total
#     of 0; 1/sqrt(0) is Inf and Inf * 0 is NaN, which makes svd() stop with
#     an error.  Such rows/columns get a weight of 0 instead, so they stay
#     all-zero and the dimensions of X are unchanged.
inv_sqrt <- function(total) ifelse(total > 0, 1 / sqrt(total), 0)
X <- inv_sqrt(rowSums(X)) * X         # recycled down columns: scales rows
X <- t(t(X) * inv_sqrt(colSums(X)))   # transpose trick: scales columns

udv <- svd(X)
names(udv)

# --- plot singular values
plot(udv$d)
# --- plot principal components (first four left singular vectors)
pairs(udv$u[, 1:4])

# --- labels of main components: the (up to) 20 types with the largest
#     absolute weight in the second right singular vector.  head() is used
#     rather than [1:20] so that a vocabulary of fewer than 20 types does
#     not pad the index with NA and produce NA rows.
j <- head(order(abs(udv$v[, 2]), decreasing = TRUE), 20)
data.frame(term = colnames(dtm.sparse)[j], loading = udv$v[j, 2])