setwd("~/courses/mich/text_analytics/") source("R_scripts/text_utils.R") #-------------------------------------------------------------------------------------- # # Wine sentiment analysis # #-------------------------------------------------------------------------------------- require(stringr) require(tm) # require(gmodels) # --- recover data (20508 x 10) load("R_scripts/Wine.Rdata"); dim(Wine) # --- basic sentiment analysis: does the score correlate with points? # check out the WordNet package to find other synonyms, antonyms, etc # See the CRAN documentation for information pos.words <- scan("data/positive-words.txt", what=character()); pos.words[1:10]; length(pos.words) neg.words <- scan("data/negative-words.txt", what=character()); neg.words[1:10]; length(neg.words) (s <- str_replace_all("this? is text, ,with:punct!" ,"[.,;:?!]"," ")) score.text <- function(s, count=T) { # quick and dirty if(is.na(s)|(str_length(s)==0)) return(c(NA,NA,NA)) s <- str_replace(str_to_lower(s),"[.,;:?!]"," ") wrds <- unlist(str_split(s, "\\s+")) # split on space pos <- wrds %in% pos.words neg <- wrds %in% neg.words if (count) return(c(sum(pos), sum(neg), length(wrds))) else return(list(pos=wrds[pos], neg=wrds[neg])) } Wine$description[1] score.text(Wine$description[1]) score.text(Wine$description[1], count=F) # is lemon really bad? Wine$description[2] score.text(Wine$description[2], count=F) scores <- sapply(Wine$description[1:10], score.text) # eek! labels by full text scores <- sapply(Wine$description[1:10], score.text, USE.NAMES=F) # better scores <- sapply(Wine$description, score.text, USE.NAMES=F) scores <- cbind(t(scores)) colnames(scores) <- c("pos", "neg", "n") rownames(scores) <- NULL head(scores) summary(scores) # --- related to points, price dif <- scores[,"pos"]-scores[,"neg"] plot(dif, Wine$points, cex=0.5,xlab="Sentiment Score", ylab="Points") dither <- function (x, s = 0.05) { x + s*sd(x, na.rm=T)*rnorm(length(x)) } plot(dither(dif), dither(Wine$points), cex=0.5, xlab="Pos-Neg Sentiment Score", ylab="Points") summary(regr <- lm(Wine$points ~ dif)); abline(regr, col="red") # 14% mean.na <- function(x) mean(x,na.rm=T) y <- tapply(Wine$points,as.factor(dif),mean.na, simplify=T) x <- as.numeric(names(y)) lines(x,y,lty=2,col='blue', lwd=3) i <- sample(1:nrow(scores), 2000) plot(dither(scores[i,"pos"]), dither(Wine$points[i]), cex=0.5, xlab="Positive Count", ylab="Points") summary(regr <- lm(Wine$points ~ scores[,"pos"])); abline(regr, col="red") # 15% plot(dither(scores[i,"neg"]), dither(Wine$points[i]), cex=0.5, xlab="Negative Count", ylab="Points") summary(regr <- lm(Wine$points ~ scores[,"neg"])); abline(regr, col="red") # 1% # --- multiple regression posCount <- scores[,"pos"] negCount <- scores[,"neg"] totCount <- scores[,"n" ] summary(regr <- lm(Wine$points ~ posCount + negCount)); # 15% cor(posCount,negCount) summary(regr <- lm(Wine$points ~ posCount + negCount + totCount)); # 25% cor(cbind(posCount,negCount,totCount)) i <- sample(1:length(totCount), 1000) plot(totCount[i], dither(posCount[i])) # --- use proportions? (heteroscedasticity) posProp <- posCount/totCount negProp <- negCount/totCount summary(regr <- lm(Wine$points ~ posProp + negProp + totCount));