# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#
# Amazon ratings example, Blitzer XML format
#
# Build time series of sentiments for product ratings
# in selected category
#
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
require(stringr)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Build first data frame using the R XML package
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
require(tm)
require(XML)
# need a wrapper for XML files (recursive structure needs root node)
xmlTreeParse(" bob stine jane doe ", asText=T)
xmlTreeParse(" bob stine jane doe ", asText=T)
# --- sample file with ≈1000 lines from all.review
fileName <- "~/data/text/blitzer/sorted_data/automotive/sample.txt"
# & has special meaning in XML (it introduces entities such as &apos;)
xmlTree <- xmlTreeParse(fileName) # leaves the apostrophe coded in a way we don't want
xmlTree <- xmlParse(fileName)     # better: returns an internal document
root <- xmlRoot(xmlTree)          # extract the root node
xmlName(root)
xmlSize(root) # number of children (reviews)
root[[1]] # first review
xmlName(root[[1]]) # review
xmlSize(root[[1]]) # 11 elements
sapply(xmlChildren(root[[1]]), xmlName) # recursive data structure
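# a hedged sketch (not in the original flow): XPath can pull one field directly,
# assuming the child nodes are named 'rating' as they appear in the data frame below
xpathSApply(root, "//rating", xmlValue)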
df <- xmlToDataFrame(xmlTree, stringsAsFactors=F)  # also xmlToList
df <- xmlToDataFrame(fileName, stringsAsFactors=F) # or skip the parsed object entirely
dim(df)
names(df)
head(df) # padded by \n
df$asin
df$rating # will need to convert to numbers
df$date
# --- Move to complete file, direct to data frame
# clean.txt required some manual editing to remove odd characters,
# such as "snake eyes", that R and the XML package would not read correctly.
# The XML format does not allow certain characters in the text. See the Makefile.
cleanFileName <- "~/data/text/blitzer/sorted_data/automotive/clean.txt"
df <- xmlToDataFrame(cleanFileName, stringsAsFactors=F) # direct to data frame
dim(df)
names(df)
df$rating <- as.numeric(df$rating) # happy
table(df$rating) # polarized!
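# a quick sketch: proportions make the polarization explicit
round(prop.table(table(df$rating)), 2)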
d <- as.Date(df$date)           # fails: dates like "January 1, 2007" are not a standard format
d <- df$date
d <- str_replace(d,'\n','')     # removes only the first newline
d <- str_replace_all(d,'\n','') # removes all of them
d <- as.Date(d)                 # still fails without a format string
df$date <- as.Date(d,format="%B %d, %Y")
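# sanity check the conversion (a quick sketch): count failures, inspect the range
sum(is.na(df$date))
range(df$date, na.rm=TRUE)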
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Alternative is to "roll your own" parsing routines
# Might be easier than learning special package such as
# XML and is also more flexible.
# Will you need it again? What else does package do?
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
rawFileName <- "~/data/text/blitzer/sorted_data/automotive/all.review"
readLines(rawFileName, encoding="UTF-8", n=100)
# --- use file connection to read (don't need here, but would with a really big file)
fl <- file(rawFileName, open='r', encoding="UTF-8")
showConnections()
scan(fl,what='character',nlines=2)            # tokens from the first two lines
scan(fl,what='character',nlines=10)           # next 10 lines, items split on whitespace
scan(fl,what='character',nlines=10, sep='\n') # next 10 lines, one item per line
close(fl)
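# --- hedged sketch: with a file too big for memory, process it in chunks over the
#     connection instead of reading everything at once (not needed for this file)
fl <- file(rawFileName, open='r', encoding="UTF-8")
nLines <- 0
while (length(chunk <- readLines(fl, n=5000)) > 0) {
	nLines <- nLines + length(chunk)   # replace with real per-chunk processing
}
close(fl)
nLines # should match length(txtData) computed next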
txtData <- readLines(rawFileName)
length(txtData) # 26602
"abc" == "abc"
"abc" == "adc"
length(iRating <- which(txtData == "<rating>")) # lines holding the opening rating tag
rating <- txtData[iRating+1]
length(rating) # sanity check
rating <- as.numeric(rating)
mean(rating)
table(rating)
extract_text <- function(tag) { # leave big text array as a global to avoid copying
	fileTag <- paste0("<",tag,">")
	i <- which(txtData == fileTag)
	cat(length(i), "lines match", fileTag, "\n")
	return(txtData[i+1])
}
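# hedged variant (illustrative, not in the original): collect every line between the
# opening and closing tags, in case a field such as review_text spans several lines;
# assumes each tag sits alone on its own line with matching open/close pairs
extract_block <- function(tag) {
	i <- which(txtData == paste0("<",tag,">"))
	j <- which(txtData == paste0("</",tag,">"))
	mapply(function(a,b) paste(txtData[(a+1):(b-1)], collapse=" "), i, j)
}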
names <- c("date", "rating", "review_text")
data <- lapply(names, extract_text)
Data <- data.frame(data)
colnames(Data) <- names
# --- be very careful converting to numeric from factor
str(Data) # check format
Data[,2] <- as.numeric(Data[,2])
table(Data[,2]) # What happened to the 5's?
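# what went wrong, in miniature: as.numeric on a factor returns the internal
# level codes, not the labels; go through as.character first (a quick sketch)
f <- factor(c("5","1","5","4"))
as.numeric(f)                # level codes: 3 1 3 2
as.numeric(as.character(f))  # actual values: 5 1 5 4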
# --- keep strings as strings
Data <- data.frame(data, stringsAsFactors = FALSE)
colnames(Data) <- names
str(Data)
Data[,2] <- as.numeric(Data[,2])
table(Data[,2]) # The fives are back!
Data[1,1]
as.Date(Data[1,1])
as.Date(Data[1,1],format="%B %d, %Y")
Data[53,1] # Hmmm... missing?
Data[53,1] <- "August 1, 2001" # patch with a placeholder so as.Date succeeds
Data[,1] <- as.Date(Data[,1],format="%B %d, %Y")
any(is.na(Data))
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Basic exploratory data analysis
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
hist(Data$date)            # needs explicit breaks for dates
hist(Data$date, breaks=12)
# --- plot ratings on date
plot(rating ~ date, data=Data)
fit <- loess(rating ~ date, data=Data) # trouble: loess wants a numeric predictor
Data$days <- as.numeric(Data$date)     # days since Jan 1, 1970
fit <- loess(rating ~ days, data=Data)
lines(fit)                             # oops! lines() does not know what to do with a loess object
lines(Data$date, fitted.values(fit))   # draws the line, but the points are not in time order
plot(rating ~ date, data=Data)
o <- order(fit$x)
lines(fit$x[o], fit$fitted[o])
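# a hedged extra: rough 95% pointwise bands from the loess standard errors
p <- predict(fit, se=TRUE)
lines(fit$x[o], (p$fit + 2*p$se.fit)[o], lty=3)
lines(fit$x[o], (p$fit - 2*p$se.fit)[o], lty=3)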
# --- plot ratings on length
Data$length <- str_length(Data$review_text) # str_length is vectorized; no sapply needed
plot(rating ~ length, data=Data)
# --- plot length on date
plot(length ~ date, data=Data, log='y')
Data$logLength <- log(Data$length)
fit <- loess(logLength ~ days, data=Data, span=0.3)
o <- order(fit$x)
lines(fit$x[o], exp(fit$fitted[o]), col='red', lwd=2)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# NLP tools
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
require(tm)
require(NLP)
require(openNLP)
require(openNLPmodels.en)
Data[1,"review_text"]
ann <- annotate(Data[1,"review_text"],
                list(Maxent_Sent_Token_Annotator(),
                     Maxent_Word_Token_Annotator(),
                     Maxent_Entity_Annotator(kind='organization'))) # did not recognize any organizations
ann
ann <- annotate(Data[1,"review_text"],
                list(Maxent_Sent_Token_Annotator(),
                     Maxent_Word_Token_Annotator(),
                     Maxent_POS_Tag_Annotator())) # add part-of-speech tags
ann
unique(ann$type)
sent <- subset(ann, type=="sentence")
sapply(sent$features, str_length) # interpret components
sent$features[1]
# see doc for AnnotatedPlainTextDocument for further information
doc <- AnnotatedPlainTextDocument(Data[1,"review_text"], ann)
sents(doc)
tagged_sents(doc)
tagged_sents(doc, map = Universal_POS_tags_map) # easier to recognize
tagged_words(doc)
parsed_sents(doc) # errors: we never ran a parse annotator
# distribution of POS tags for word tokens
wrds <- subset(ann, type == "word"); wrds # compare to text
tags <- sapply(wrds$features, function(x) return(x[["POS"]]) )
table(tags)
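# a quick sketch: sorted bars make the tag distribution easier to read
barplot(sort(table(tags), decreasing=TRUE), las=2, cex.names=0.8)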
# --- plot sentiment of text on time, vs rating
require(tm.lexicon.GeneralInquirer)
corpus <- VCorpus(VectorSource(df$review_text))
pos.score <- tm_term_score(TermDocumentMatrix(corpus, control = list(removePunctuation = TRUE)),
terms_in_General_Inquirer_categories("Positiv"))
neg.score <- tm_term_score(TermDocumentMatrix(corpus, control = list(removePunctuation = TRUE)),
terms_in_General_Inquirer_categories("Negativ"))
plot(pos.score, neg.score)
hist(pos.score-neg.score)
df$sentiment <- pos.score-neg.score # normalize?
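# one hedged way to answer "normalize?": scale the score by the token count
nTokens <- str_count(df$review_text, "\\S+")
hist(df$sentiment/nTokens, main="Sentiment per token")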
plot(sentiment ~ date, data=df)
df$days <- as.numeric(df$date) # loess needs the numeric version, as before
fit <- loess(sentiment ~ days, data=df, span=0.2)
o <- order(fit$x)
lines(fit$x[o], fit$fitted[o], col='red', lwd=2)
plot(rating ~ sentiment, data=df)
boxplot(sentiment ~ rating, data=df)
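# a quick closing check: how strongly does lexicon sentiment track the star rating?
cor(df$sentiment, df$rating, use="complete.obs")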