### -------------------------------------------------  ###
###                Analysis of tweets                  ###
### -------------------------------------------------  ###

# See Twitter for instructions to obtain the key/secret and OAuth access tokens:
#   https://dev.twitter.com/oauth/overview
#   https://dev.twitter.com/oauth/overview/application-owner-access-tokens

### -------------------------------------------------  ###
###         Access twitter data using account          ###
### -------------------------------------------------  ###

rm(list=ls())                       # clean slate

require(twitteR)
require(httr)

consumerKey    <- "xxxxxxxxxx"      # originals in twitter_setup.R
consumerSecret <- "xxxxxxxxxx"

# Get access tokens by clicking the button on the Twitter page for the app to test OAuth
accessToken  <- "xxxxxxxxx"
accessSecret <- "xxxxxxxxx"

setup_twitter_oauth(consumerKey, consumerSecret, accessToken, accessSecret)
# reply 1 (Yes) when asked about caching OAuth credentials in a local file

# check it out
tweets <- searchTwitter('#DonaldTrump', n=500); head(tweets)

crantastic <- getUser('crantastic'); crantastic$getDescription()
crantastic$getFollowersCount()
crantastic$getFriends(n=5)

# --- recover saved data
load("~/courses/mich/text_analytics/data/raw_trump_tweets_20Jul15.Rsav"); ls()
length(tweets)

# --- could remove retweets, but will keep all
# tweets <- strip_retweets(tweets, strip_manual=TRUE, strip_mt=TRUE)

# --- make data frame and explore
Data <- twListToDF(tweets); dim(Data)
colnames(Data)

# statusSource holds an HTML anchor; strip the closing </a> first, then the
# greedy <.+> removes the opening <a ...> tag, leaving the client name
Data$statusSource[1:5]
temp <- gsub("</a>", "", Data$statusSource)
temp <- gsub("<.+>", "", temp)
counts <- table(temp); counts[counts > 2]

sum(Data$isRetweet)
plot(Data[Data$isRetweet, "retweetCount"])

# --- clean source text: drop retweets, newlines, quotes, URLs, @mentions, entities
text <- Data[!Data$isRetweet, "text"]
text <- gsub("\\n", "", text)
text <- gsub('\\"', "", text)
text <- gsub('http[^[:blank:]]+', "", text)
text <- gsub('@[^[:blank:]]*', "", text)
text <- gsub('&amp;', "", text)                  # tweets HTML-encode & as &amp;
Encoding(text) <- "latin1"
text <- iconv(text, "latin1", "ASCII", sub="")   # drop non-ASCII characters

# --- convert vector to tm corpus
library(tm)
library(SnowballC)   # without it you may get an error when stripping white space

twts <- VCorpus(VectorSource(text)); twts    # writeCorpus saves to file
inspect(twts[1:3])

twts <- tm_map(twts, stripWhitespace)        # add mc.cores=1 if parallel calls fail
twts <- tm_map(twts, content_transformer(tolower))
twts <- tm_map(twts, removePunctuation)
twts <- tm_map(twts, removeWords, stopwords())
# twts <- tm_map(twts, removeNumbers, mc.cores=1)
# twts <- tm_map(twts, removeWords, c("donald", "trump"))

# example of a custom substitution transformer
# toString <- content_transformer(function(x, from, to) gsub(from, to, x))
# twts     <- tm_map(twts, toString, "ccc ddd", "HIT")

# optional stemming
# twts <- tm_map(twts, stemDocument, mc.cores=1)

library(wordcloud)
wordcloud(twts)

# --- build document-term matrix for better control
dtm <- DocumentTermMatrix(twts)    # bag of words, sparse representation
dtm <- DocumentTermMatrix(twts, control=list(weighting=weightTfIdf))   # rebuild with tf-idf weights

dtm[1:3, 1:6]             # summary of the sub-matrix
inspect(dtm[1:3, 1:6])    # the entries themselves

# with tf-idf weighting these cutoffs apply to total weight, not raw counts
findFreqTerms(dtm, 10)
findFreqTerms(dtm, 5)

# --- Zipf?
freq <- colSums(as.matrix(dtm))
length(freq); head(freq)

regr <- zipf_plot(freq, n.label=4, n.fit=300)   # course helper; sketch at end of file

# --- word associations (weighting matters)
findAssocs(dtm, "apologize", 0.5)
findAssocs(dtm, "mexican", 0.5)
findAssocs(dtm, "foxnews", 0.5)

# --- reduced matrix
sdtm <- removeSparseTerms(dtm, 0.99)   # drop terms missing from more than 99% of tweets
sdtm

mat <- as.matrix(sdtm); dim(mat)
freq <- colSums(mat)
j    <- order(freq, decreasing=TRUE)
freq <- freq[j]
mat  <- mat[, j]
colnames(mat)[1:5]

wordcloud(colnames(mat)[-(1:2)], freq[-(1:2)])   # drop the dominant terms
wordcloud(colnames(mat)[-(1:4)], freq[-(1:4)])
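
# --- Appendix: creating the saved snapshot ----------------------------
# The load() above reads a cached search result.  A sketch of how such a
# snapshot is presumably written (file name taken from the load() call
# above; adjust the path for your machine):
# tweets <- searchTwitter('#DonaldTrump', n=500)
# save(tweets, file="~/courses/mich/text_analytics/data/raw_trump_tweets_20Jul15.Rsav")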
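
# --- Appendix: sketch of the zipf_plot helper --------------------------
# zipf_plot() is called above but defined elsewhere in the course code.
# A minimal sketch, assuming it plots log frequency against log rank,
# labels the n.label most frequent terms, fits least squares to the
# n.fit top-ranked terms, and returns the fitted regression.
zipf_plot <- function(freq, n.label=5, n.fit=length(freq)) {
    freq <- sort(freq[freq > 0], decreasing=TRUE)   # drop zeros before taking logs
    lr   <- log(seq_along(freq))                    # log rank
    lf   <- log(freq)                               # log frequency
    plot(lr, lf, xlab="Log Rank", ylab="Log Frequency")
    text(lr[1:n.label], lf[1:n.label], names(freq)[1:n.label], pos=4)
    regr <- lm(lf[1:n.fit] ~ lr[1:n.fit])           # Zipf's law: slope near -1
    abline(regr, col="red")
    regr
}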
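
# --- Appendix: barplot companion to the wordclouds ---------------------
# A hypothetical addition: the ordered freq vector built above lends
# itself to a barplot of the top terms, which is easier to read off than
# a wordcloud.  With the tf-idf weighting used above, freq holds total
# weights rather than raw counts.
barplot(freq[1:20], las=2, cex.names=0.8, ylab="Total tf-idf weight")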