#-------------------------------------------------------------------------------------- # # wine_data: creating the data for analysis # #-------------------------------------------------------------------------------------- # --- install relevant packages install.packages('stringr', dependencies=TRUE) # uniform string manipulations # --- libraries used rm(list=ls()) # clean slate require(stringr) # --- file references relative to this path setwd("~/courses/mich/text_analytics/") source("R_scripts/text_utils.R") ls() # --- read big file 20888 = (187992)/9 = tasting notes # convert to basic text (without fancy apostrophes and such) # Setting encoding="UTF-8" did not fix the umlauts seen in wine varieties # Problem is that this is the format in the source text (grep for Rh in file) rawText <- readLines("~/data/text/wine/tasting_data.txt", encoding="UTF-8") length(rawText) length(rawText)/9 rawText[1:15] # --- convert text lines into data frame, dropping the divider line # other examples show how to read large file that won't fit memory (nrow <- length(rawText)/9) Wine <- matrix(rawText, ncol=9, byrow=T) dim(Wine) head(Wine) Wine <- Wine[,-1] # drop first column with the dashed line dim(Wine) colnames(Wine) <- c("id", "label", "description", "type", "alcohol", "location", "date", "rating") head(Wine) # --- add variety, points, price, vintage, alcohol and color to data frame # first do an example, then wrap into a function Wine[1:10,"type"] # color appears after a comma ?regexpr (regexpr(".*,", Wine[1:10,"type"])) # whatever comes before a comma (-1 implies absent) ?str_extract (str_extract(Wine[1:10,"type"], ".*,")) # whatever comes before a comma (NA implies absent) # --- variety comes before a comma variety <- str_extract(Wine[,"type"], ".*,") length(variety) head(variety) variety <- str_replace(variety,",$","") # remove trailing comma (dont need $) head(variety) sum( is.na(variety) ) # 346 missing length(tab <- table(variety)) # Lots... 289! sort(tab, decreasing=T) i <- str_locate(variety,"&") # fix those weird characters i <- which(!is.na(i[,1])) # indices of rows with weird & head(i) length(i) # Rhone, Fume, Rose (s <- variety[i[1]]) str_replace(s,"ô","o") # try first variety <- str_replace(variety, "ô","o") # some left table(variety) variety <- str_replace_all(variety, "ô","o") table(variety) i <- str_locate(variety,"&") # find rest length(i <- which(!is.na(i[,1]))) variety[i[1:42]] # ä or &e variety <- str_replace_all(variety, "è","e") variety <- str_replace_all(variety, "ä","a") variety <- str_replace_all(variety, "é","e") (tab <- table(variety)) length(tab) variety <- str_replace_all(variety, "Portugese","Portuguese") variety <- str_replace_all(variety, "Regional ","") (tab <- table(variety)) length(tab) # 275 left # color # str_sub substring # str_subset subset of inputs Wine[1,"type"] str_extract(Wine[1,"type"],"(Red|White)$") color <- str_extract(Wine[,"type"],"(^|\\s)(Red|White)($|\\s)") table(color) color <- str_trim(str_extract(Wine[,"type"],"(^|\\s)(Red|White)($|\\s)")) # \ss is space table(color) # 10587 7072 any(is.na(color)) sum(is.na(color)) # 3229 # points Wine[1:10,"rating"] points <- str_extract(Wine[,"rating"], "^[0-9]+ ") points[1:10] points <- as.numeric(points) points[1:10] hist(points, breaks=25, xlab="Wine Rating") reset reset() # alters plot margins hist(points, breaks=25, xlab="Wine Rating") # price listed on most; start with a few experiments Wine[1:10,"label"] price <- str_extract(Wine[1:10,"label"], "[$][0-9]+([.][0-9]+)*") # digit likes . price price <- str_extract(Wine[,"label"], "[$][0-9]+([.][0-9]+)*") price <- str_replace(price,"[$]","") # remove $ head(price) price <- as.numeric(price) hist(price) hist(log10(price)) range(price) sum(is.na(price)) range(price, na.rm=T) # not a parsing error (how to tell), but might be an error? i <- which.max(price) # 12875 Wine[i,] price[i] <- NA i <- which.max(price) # okay now price[i] # 571 Wine[i,] # vintage Wine[1:10,"label"] vintage <- as.numeric(str_extract(Wine[,"label"], " (1|2)[0-9]{3}")) # alternative is [:digit:] table(vintage) x <- as.numeric(vintage) i <- which.max(x) Wine[i,] vintage <- as.numeric(str_extract(Wine[,"label"], " (19|20)[0-9]{2}")) table(vintage) # alcohol Wine[1:10,"alcohol"] ?str_sub str_sub(Wine[1:100,"alcohol"],1,-2) # all but last alcohol <- as.numeric(str_sub(Wine[,"alcohol"],1,-2)) table(alcohol) Wine[alcohol>100,"alcohol"] alcohol[alcohol>100] <- NA alcohol[alcohol==0] <- NA min(alcohol, na.rm=T) i <- which(alcohol==1) # seem like real data? Wine[i,"alcohol"] Wine[i,] # description remove first sentence which tells color (remove "" below) description <- Wine[ 1:10,"description"]; description description <- Wine[ 5000+1:10,"description"]; description description <- Wine[10000+1:10,"description"]; description description <- Wine[15000+1:10,"description"]; description description <- str_trim(str_replace(Wine[,"description"], "^.+(hue|color|cast)[.]","")) description[1:5] description[15000+1:10] any(is.na(description)) i <- str_locate(description," red ") # some reds are left elsewhere length(i <- which(!is.na(i[,1]))) # 1013 # --- peek at conventional data plot(vintage,alcohol) sum(is.na(alcohol)) mean.na <- function(x) mean(x,na.rm=T) m <- tapply(alcohol, vintage, mean.na) plot(names(m), m, xlab="Vintage", ylab="Mean Alcohol") plot(tapply(color=="Red", vintage, mean.na)) r <- which(color=="Red") means <- tapply(alcohol[r], vintage[r], mean.na) year <- as.numeric(names(means)) plot(year, means) # --- length of description (rough count of space delimited) charLen <- str_length(description) # num chars charLen[1:10] hist(charLen, breaks=50) min(charLen) # 0 str_split("one two three four", " ") # nice str_split("", " ") # not so nice num_words <- function(s) length(str_split(s," ")[[1]]) # convenience description[1] len <- sapply(description[1:3], num_words) len <- sapply(description[1:3], num_words, USE.NAMES=FALSE) # easier len str(len) len <- unname(sapply(description[1:3], num_words)) # drop the names that come with it str(len) len <- unname(sapply(description, num_words)) fivenum(len) mean(len) # 32.3 words missing <- charLen==0 # boolean index sum(missing) description[missing] <- NA hist(len[!missing], breaks=50, main="Lengths of Tasting Notes", xlab="# words") mean(len[!missing]) # 32.8 boxplot(len ~ color) summary(lm(len ~ color)) # whites have shorter descriptions on average col <- ifelse(color=="Red",'red','gold') plot(len, points, col=col) i <- sample(1:nrow(Wine), 1000) plot(len[i], points[i], col=col[i]) # longer ratings come with higher points summary(lm(points ~ len * col)) # === save unified data frame (could remove some now as well) with # balanced portion reserved for validation later id <- Wine[,"id"] location <- Wine[,"location"] Wine[1:5,"date"] date <- as.Date(Wine[,"date"],"%b-%d-%Y") date[1:5] # replace the text matrix Wine <- data.frame(description, variety, vintage, color, alcohol, points, price, id, location, date) dim(Wine) # 20888 x 10 ###################################################################################### # # Collected here # ###################################################################################### require(stringr) # insert your own file path into the next command rawText <- readLines("~/data/text/wine/tasting_data.txt", encoding="UTF-8") length(rawText) # 187992 Wine <- matrix(rawText, ncol=9, byrow=T) Wine <- Wine[,-1] colnames(Wine) <- c("id", "label", "description", "type", "alcohol", "location", "date", "rating") dim(Wine) # 20888 8 variety <- str_extract(Wine[,"type"], ".*,") variety <- str_replace_all(variety, "ô","o") variety <- str_replace_all(variety, "è","e") variety <- str_replace_all(variety, "ä","a") variety <- str_replace_all(variety, "é","e") variety <- str_replace_all(variety, "Portugese","Portuguese") variety <- str_replace_all(variety, "Regional ","") color <- str_trim(str_extract(Wine[,"type"],"(^|\\s)(Red|White)($|\\s)")) # \ss is space points <- as.numeric(str_extract(Wine[,"rating"], "^[0-9]+ ")) price <- str_extract(Wine[,"label"], "[$][0-9]+([.][0-9]+)*") price <- str_replace(price,"[$]","") price <- as.numeric(price) price[ which.max(price) ] <- NA vintage <- as.numeric(str_extract(Wine[,"label"], " (19|20)[0-9]{2}")) alcohol <- as.numeric(str_sub(Wine[,"alcohol"],1,-2)) alcohol[alcohol>100] <- NA alcohol[alcohol==0] <- NA description <- str_trim(str_replace(Wine[,"description"], "^.+(hue|color|cast)[.]","")) charLen <- str_length(description) # num chars num_words <- function(s) length(str_split(s," ")[[1]]) # convenience len <- unname(sapply(description, num_words)) # len <- sapply(description, num_words, USE.NAMES=FALSE) description[charLen==0] <- NA id <- Wine[,"id"] location <- Wine[,"location"] date <- as.Date(Wine[,"date"],"%b-%d-%Y") # replace the text matrix Wine <- data.frame(description, variety, vintage, color, alcohol, points, price, id, location, date, stringsAsFactors=FALSE) # ----- sanity checks length(table(Wine[,"variety"])) table(Wine[,"color"]) hist(Wine[,"points"]) hist(Wine[,"price"], na.rm=TRUE) table(Wine[,"vintage"]) table(Wine[,"alcohol"]) Wine[1:5,"description"] Wine[1:5,"date"] # ----- final clean up # remove those with no description missingDesc <- is.na(Wine$description) sum(missingDesc) # 379 Wine <- Wine[!missingDesc,] dim(Wine) # 20509 x 10 # take care of later 'outlier' problem Wine[Wine$id==192747,] # yuck (i <- which(Wine$id==192747)) Wine <- Wine[-i,] dim(Wine) # 20508 x 10 # insert your own path for the file save(Wine,file="R_scripts/Wine.Rdata") # optionally export # write.csv(Wine, "data/Wine.csv") ###################################################################################### ###################################################################################### #--------------------------------------------------------------------------- # # Tokenization and DTM # #--------------------------------------------------------------------------- install.packages('tm', dependencies=TRUE) # text mining require(tm) # --- recover saved data file (check dim 20508 x 10) load(file="~/courses/mich/text_analytics/R_scripts/Wine.Rdata") dim(Wine) # can use writeCorpus to save to file wine.corpus <- Corpus(VectorSource(Wine$description)); wine.corpus typeof(wine.corpus) length(wine.corpus) wine.corpus[[1]] inspect(wine.corpus[1:4]) wine.corpus[[1]]$content # --- first step of tokenization wine.corpus <- tm_map(wine.corpus, content_transformer(tolower)) wine.corpus[[1]]$content # document-term matrix is held in sparse form DTM <- DocumentTermMatrix(wine.corpus) dim(DTM) DTM inspect(DTM[1:5,1:5]) # punctuation 'attached' to nearby chars unless remove all or separate earlier findFreqTerms(DTM, lowfreq=5000) # fix with a simple script (tokenizer_extra.sh) or using string in r # Do you want to remove *all* punctuation??? require(stringr) txt <- 'This is a grappy-apple (super [yum] good) wine!.' gsub ( "([][{}()*&^%$#@.,;:!-])"," \\1 ",txt) # pick your own str_replace(txt,"([][{}()*&^%$#@.,;:!-])"," \\1 ") # not compatible regex gsub ( "([[:punct:]])", " \\1 ",txt) # use punct str_replace(txt,"([[:punct:]])", " \\1 ") # not quite the same gsub ( "([[:punct:]])", " _\\1_ ",txt) # use punct f <- content_transformer(function(cont) gsub("([[:punct:]])", " _\\1_ ", cont)) # note the _ wrapper wine.corpus <- tm_map(wine.corpus, f) wine.corpus[[1]]$content wine.corpus <- tm_map(wine.corpus, stripWhitespace) wine.corpus[[1]] str(wine.corpus[[1]]) wine.corpus[[1]]$content # new doc-term matrix (documents: 20508, terms: 5471) DTM <- DocumentTermMatrix(wine.corpus); DTM findFreqTerms(DTM, lowfreq=5000) findFreqTerms(DTM, lowfreq=4000) # stemming will change... does fruit==fruity? # --- additional possible tokenization... # wine.corpus <- tm_map(wine.corpus, function(x)removeWords(x,stopwords())) # wine.corpus <- tm_map(wine.corpus, removePunctuation) # wine.corpus <- tm_map(corpus, removeNumbers)) # wine.corpus <- tm_map(docs, removeWords, c("robert", "parker")) # optional stemming done later # tokenized.corpus <- tm_map(wine.corpus, stemDocument) # DTM <- DocumentTermMatrix(tokenized.corpus); DTM # needs to be sparse # findFreqTerms(DTM, lowfreq=5000) # findFreqTerms(DTM, lowfreq=4000) # --- distribution of word types: Zipf plot matrix of counts C freq <- sort(colSums(as.matrix(DTM)), decreasing=T) types <- names(freq) freq[1:5] types[1:5] length(types) # 5471 hist(freq, breaks=100) # incredibly skewed hist(log(freq), breaks=100) hist(log(log(freq)), breaks=100) zipf_plot(freq, n.label=10, n.fit=250) # --- associations (tm function) findAssocs(DTM, 'fruit' , 0.2) findAssocs(DTM, 'oak' , 0.2) findAssocs(DTM, 'tannin', 0.2) findAssocs(DTM, 'chewi' , 0.2) # --- rare types, oov table(freq[freq<6]) # 1 2 3 4 5 # 1808 653 366 272 186 names(freq[freq==1])[c(1:20,250:310)] names(freq[freq==2])[c(1:20,250:310)] names(freq[freq==3])[c(1:20,250:310)] # stemming helps remove OOV, but at cost? freq[ which(types=="cigar") ] freq[ which(types=="cigars") ] # also plural once # not very many in big picture sum(freq[freq>3]) # 755809 tokens for word types seen more than 3 times sum(freq[freq<=3]) # 4212 sum(freq[freq<=3])/sum(freq[freq>3]) # 1/2 of 1% # --- recode rare words in corpus (2827 of them) and build corpus & DTM length(oov <- types[freq<=3]) oov[1:50] insert_oov <- function (str) { ss <- str_split(str, " ")[[1]] ss[ss %in% oov] <- "OOV" str <- paste(ss[-length(ss)],collapse=" ") # dump extraeous "" at end return(str) } wine.corpus[[1]]$content insert_oov(wine.corpus[[1]]) oovStrings <- rep(" ",length(wine.corpus)) for(i in 1:length(oovStrings)) oovStrings[i] <- insert_oov(wine.corpus[[i]]) # surely faster way! length(oovStrings) # rebuild corpus oovCorpus <- Corpus(VectorSource(oovStrings)); oovCorpus oovCorpus[[1]]$content oovDTM <- DocumentTermMatrix(oovCorpus) oovDTM # 20508 x 2645 # sanity check freq <- sort(colSums(as.matrix(oovDTM)), decreasing=T) types <- names(freq) min(freq) # 4 # --- some of these names will bother R later, so fix them here names <- colnames(oovDTM) names[1:25] names <- rename_punctuation(names) # in text_utils.R names[1:25] colnames(oovDTM) <- names # --- save DTM and matrix of frequencies for later use save(oovDTM, file="R_scripts/oovDTM.Rdata") # <<<------------------- oovFreq <- as.matrix(oovDTM) o <- order(colSums(oovFreq), decreasing=T) oovFreq <- oovFreq[,o] save(oovFreq, file="R_scripts/oovFreq.Rdata") # <<<------------------- zipf_plot(colSums(oovFreq),n.label=10, n.fit=250) # no big change here total <- colSums(oovFreq) total[1:10] j <- which(colnames(oovFreq)=="oov") total[j] # 4208