# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#
# Amazon ratings example, Blitzer XML format
#
# Build a time series of sentiments for product ratings
# in a selected category
#
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

require(stringr)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Build first data frame using the R XML package
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

require(tm)
require(XML)

# XML needs a wrapper element: the recursive structure requires a single root node.
# (The tag names here are illustrative stand-ins; the originals were lost in transcription.)
xmlTreeParse("<prof>bob stine</prof><prof>jane doe</prof>", asText=T)                # error: two top-level elements
xmlTreeParse("<dept><prof>bob stine</prof><prof>jane doe</prof></dept>", asText=T)  # fine: one root

# --- sample file with ≈1000 lines from all.review
fileName <- "~/data/text/blitzer/sorted_data/automotive/sample.txt"

# & has a special use in XML
xmlTree <- xmlTreeParse(fileName)  # apostrophe coded in a way we don't want
xmlTree <- xmlParse(fileName)      # better
root <- xmlRoot(xmlTree)           # extract the root node

xmlName(root)
xmlSize(root)       # number of children (reviews)
root[[1]]           # first review
xmlName(root[[1]])  # review
xmlSize(root[[1]])  # 11 elements
sapply(xmlChildren(root[[1]]), xmlName)

# recursive data structure
df <- xmlToDataFrame(xmlTree, stringsAsFactors=F)   # also xmlToList
df <- xmlToDataFrame(fileName, stringsAsFactors=F)  # bypass the parsed object
dim(df)
names(df)
head(df)    # padded by \n
df$asin
df$rating   # will need to convert to numbers
df$date

# --- Move to the complete file, direct to a data frame
# clean.txt required some manual editing to remove odd characters,
# such as "snake eyes", that R and the XML package would not read correctly.
# The XML format does not allow certain characters in the text. See the Makefile.
cleanFileName <- "~/data/text/blitzer/sorted_data/automotive/clean.txt"
df <- xmlToDataFrame(cleanFileName, stringsAsFactors=F)  # direct to data frame
dim(df)
names(df)

df$rating <- as.numeric(df$rating)  # happy
table(df$rating)                    # polarized!

d <- as.Date(df$date)               # not so happy
d <- df$date
d <- str_replace(d, '\n', '')       # replaces only the first newline
d <- str_replace_all(d, '\n', '')   # replaces them all
d <- as.Date(d)                     # still not happy without a format
df$date <- as.Date(d, format="%B %d, %Y")
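# --- Aside: the XML package also supports XPath queries, which can pull a single
#     field from the parsed tree without building a whole data frame. A small
#     sketch against the parsed sample file (xmlTree) above, using the element
#     names seen in sapply(..., xmlName):
xpathSApply(xmlTree, "//review/rating", xmlValue)              # character, padded by \n
as.numeric(xpathSApply(xmlTree, "//review/rating", xmlValue))  # as.numeric trims the whitespace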
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# An alternative is to "roll your own" parsing routines.
# That might be easier than learning a special package such as
# XML, and it is also more flexible.
# Will you need it again? What else does the package do?
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

rawFileName <- "~/data/text/blitzer/sorted_data/automotive/all.review"
readLines(rawFileName, encoding="UTF-8", n=100)

# --- use a file connection to read (not needed here, but useful with a really big file)
fl <- file(rawFileName, open='r', encoding="UTF-8")
showConnections()
fl <- file(rawFileName, open='r', encoding="UTF-8")
scan(fl, what='character', nlines=2)             # items from the first two lines
scan(fl, what='character', nlines=10)            # next 10 lines, items split on whitespace
scan(fl, what='character', nlines=10, sep='\n')  # next 10 lines, one item per line
close(fl)

txtData <- readLines(rawFileName)
length(txtData)  # 26602

# string comparison is exact and vectorized
"abc" == "abc"
"abc" == "adc"

# lines that consist of the opening tag; the value follows on the next line
length(iRating <- which(txtData == "<rating>"))
rating <- txtData[iRating+1]
length(rating)  # sanity check
rating <- as.numeric(rating)
mean(rating)
table(rating)

extract_text <- function(tag) {  # leave the big text array global to avoid copying
	fileTag <- paste0("<", tag, ">")
	i <- which(txtData == fileTag)
	cat(length(i), "lines match", fileTag, "\n")
	return(txtData[i+1])
}

names <- c("date", "rating", "review_text")
data <- lapply(names, extract_text)
Data <- data.frame(data)
colnames(Data) <- names

# --- be very careful converting from a factor to numeric
str(Data)  # check the format
Data[,2] <- as.numeric(Data[,2])
table(Data[,2])  # What happened to the 5's?

# --- keep strings as strings
Data <- data.frame(data, stringsAsFactors = FALSE)
colnames(Data) <- names
str(Data)
Data[,2] <- as.numeric(Data[,2])
table(Data[,2])  # The fives are back!
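# --- Aside: why the fives vanished above. as.numeric() applied to a factor
#     returns the internal level codes, not the printed labels; route through
#     as.character() first. A toy sketch (made-up values, not the review data):
f <- factor(c("1", "5", "5", "2"))
as.numeric(f)                # 1 3 3 2 -- level codes
as.numeric(as.character(f))  # 1 5 5 2 -- the intended values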
Data[1,1]
as.Date(Data[1,1])                      # needs a format
as.Date(Data[1,1], format="%B %d, %Y")
Data[53,1]                              # Hmmm... missing?
Data[53,1] <- "August 1, 2001"
Data[,1] <- as.Date(Data[,1], format="%B %d, %Y")
any(is.na(Data))

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Basic exploratory data analysis
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

hist(Data$date)
hist(Data$date, breaks=12)

# --- plot ratings on date
plot(rating ~ date, data=Data)
fit <- loess(rating ~ date, data=Data)  # loess wants a numeric predictor
Data$days <- as.numeric(Data$date)
fit <- loess(rating ~ days, data=Data)
lines(fit)                              # oops!
# lines() cannot draw a loess object directly; pass x and y explicitly
lines(Data$date, fitted.values(fit))
plot(rating ~ date, data=Data)
o <- order(fit$x)
lines(fit$x[o], fit$fitted[o])

# --- plot ratings on length
Data$length <- sapply(Data$review_text, str_length)
plot(rating ~ length, data=Data)

# --- plot length on date
plot(length ~ date, data=Data, log='y')
Data$logLength <- log(Data$length)
fit <- loess(logLength ~ days, data=Data, span=0.3)
o <- order(fit$x)
lines(fit$x[o], exp(fit$fitted[o]), col='red', lwd=2)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# NLP tools
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

require(tm)
require(NLP)
require(openNLP)
require(openNLPmodels.en)

Data[1, "review_text"]
ann <- annotate(Data[1, "review_text"],
                list(Maxent_Sent_Token_Annotator(),
                     Maxent_Word_Token_Annotator(),
                     Maxent_Entity_Annotator(kind='organization')))  # did not recognize the organization
ann
ann <- annotate(Data[1, "review_text"],
                list(Maxent_Sent_Token_Annotator(),
                     Maxent_Word_Token_Annotator(),
                     Maxent_POS_Tag_Annotator()))
ann
unique(ann$type)

sent <- subset(ann, type=="sentence")
sapply(sent$features, str_length)  # interpret components
sent$features[1]

# see the documentation for AnnotatedPlainTextDocument for further information
doc <- AnnotatedPlainTextDocument(Data[1, "review_text"], ann)
sents(doc)
tagged_sents(doc)
tagged_sents(doc, map = Universal_POS_tags_map)  # easier to recognize
tagged_words(doc)
parsed_sents(doc)  # error?

# distribution of POS tags for word tokens
wrds <- subset(ann, type == "word"); wrds  # compare to the text
tags <- sapply(wrds$features, function(x) return(x[["POS"]]))
table(tags)

# --- plot sentiment of the text over time, and against the rating
require(tm.lexicon.GeneralInquirer)

corpus <- VCorpus(VectorSource(df$review_text))
pos.score <- tm_term_score(TermDocumentMatrix(corpus, control = list(removePunctuation = TRUE)),
                           terms_in_General_Inquirer_categories("Positiv"))
neg.score <- tm_term_score(TermDocumentMatrix(corpus, control = list(removePunctuation = TRUE)),
                           terms_in_General_Inquirer_categories("Negativ"))
plot(pos.score, neg.score)
hist(pos.score - neg.score)

df$sentiment <- pos.score - neg.score  # normalize?
plot(sentiment ~ date, data=df)
df$days <- as.numeric(df$date)         # df needs its own numeric days column (days was built for Data above)
fit <- loess(sentiment ~ days, data=df, span=0.2)
o <- order(fit$x)
lines(fit$x[o], fit$fitted[o], col='red', lwd=2)
plot(rating ~ sentiment, data=df)
boxplot(sentiment ~ rating, data=df)
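# --- Aside: one answer to the "normalize?" question above is to scale the net
#     sentiment by review length, so long reviews don't dominate. A sketch:
#     str_count() of non-space runs is a rough word count, and sentNorm is a
#     name introduced here.
nWords <- str_count(df$review_text, "\\S+")
df$sentNorm <- df$sentiment / pmax(nWords, 1)  # guard against empty reviews
plot(sentNorm ~ date, data=df)
boxplot(sentNorm ~ rating, data=df)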