#--------------------------------------------------------------------------------------
#
#    wine_data: creating the data for analysis
#
#--------------------------------------------------------------------------------------

# --- install relevant packages
install.packages('stringr',   dependencies=TRUE)		# uniform string manipulations

# --- libraries used
rm(list=ls())		# clean slate
require(stringr)

# --- file references relative to this path
setwd("~/courses/mich/text_analytics/")
source("R_scripts/text_utils.R")
ls()

# --- read big file 20888 = (187992)/9 = tasting notes
#     convert to basic text (without fancy apostrophes and such)
#     Setting encoding="UTF-8" did not fix the umlauts seen in wine varieties
#	Problem is that this is the format in the source text (grep for Rh in file)
rawText <- readLines("~/data/text/wine/tasting_data.txt", encoding="UTF-8") 
length(rawText) 
length(rawText)/9
rawText[1:15]

# --- convert text lines into data frame, dropping the divider line
#		other examples show how to read large file that won't fit memory
(nrow <- length(rawText)/9)
Wine <- matrix(rawText, ncol=9, byrow=T)
dim(Wine)
head(Wine)
Wine <- Wine[,-1]						# drop first column with the dashed line
dim(Wine)
colnames(Wine) <- c("id", "label", "description", "type", "alcohol", "location", "date", "rating")
head(Wine)

# --- add variety, points, price, vintage, alcohol and color to data frame
#     first do an example, then wrap into a function

Wine[1:10,"type"]						# color appears after a comma

?regexpr
(regexpr(".*,", Wine[1:10,"type"]))			# whatever comes before a comma (-1 implies absent)

?str_extract
(str_extract(Wine[1:10,"type"], ".*,"))			# whatever comes before a comma (NA implies absent)

# --- variety comes before a comma
variety <- str_extract(Wine[,"type"], ".*,")
length(variety)
head(variety)			
variety <- str_replace(variety,",$","")				# remove trailing comma (dont need $)
head(variety)			
sum( is.na(variety) )							# 346 missing

length(tab <- table(variety))					# Lots... 289!
sort(tab, decreasing=T)

i <- str_locate(variety,"&")						# fix those weird characters
i <- which(!is.na(i[,1]))						# indices of rows with weird &
head(i)
length(i)									# Rhone, Fume, Rose
(s <- variety[i[1]])
str_replace(s,"&ocirc;","o")						# try first

variety <- str_replace(variety, "&ocirc;","o")		# some left
table(variety)
variety <- str_replace_all(variety, "&ocirc;","o")
table(variety)

i <- str_locate(variety,"&")						# find rest
length(i <- which(!is.na(i[,1])))
variety[i[1:42]]								# &auml; or &e
variety <- str_replace_all(variety, "&egrave;","e")
variety <- str_replace_all(variety, "&auml;","a")
variety <- str_replace_all(variety, "&eacute;","e")

(tab <- table(variety))
length(tab)
variety <- str_replace_all(variety, "Portugese","Portuguese")
variety <- str_replace_all(variety, "Regional ","")
(tab <- table(variety))
length(tab)									# 275 left


#     color
#				str_sub  		substring
#				str_subset		subset of inputs

Wine[1,"type"]
str_extract(Wine[1,"type"],"(Red|White)$")
color <- str_extract(Wine[,"type"],"(^|\\s)(Red|White)($|\\s)")
table(color)
color <- str_trim(str_extract(Wine[,"type"],"(^|\\s)(Red|White)($|\\s)"))	  # \ss is space
table(color)								# 10587   7072
any(is.na(color))
sum(is.na(color))							# 3229


#     points
Wine[1:10,"rating"]
points <- str_extract(Wine[,"rating"], "^[0-9]+ ")
points[1:10]
points <- as.numeric(points)
points[1:10]
hist(points, breaks=25, xlab="Wine Rating")
reset
reset()												# alters plot margins
hist(points, breaks=25, xlab="Wine Rating")


#     price listed on most;   start with a few experiments
Wine[1:10,"label"]
price <- str_extract(Wine[1:10,"label"], "[$][0-9]+([.][0-9]+)*")  # digit likes .
price
price <- str_extract(Wine[,"label"], "[$][0-9]+([.][0-9]+)*")
price <- str_replace(price,"[$]","")		# remove $
head(price)
price <- as.numeric(price)

hist(price)
hist(log10(price))
range(price)
sum(is.na(price))
range(price, na.rm=T)				# not a parsing error (how to tell), but might be an error?

i <- which.max(price)				# 12875
Wine[i,]
price[i] <- NA
i <- which.max(price)				# okay now
price[i]						# 571
Wine[i,]

#     vintage
Wine[1:10,"label"]
vintage <- as.numeric(str_extract(Wine[,"label"], " (1|2)[0-9]{3}")) # alternative is [:digit:]
table(vintage)
x <- as.numeric(vintage)
i <- which.max(x)
Wine[i,]
vintage <- as.numeric(str_extract(Wine[,"label"], " (19|20)[0-9]{2}"))
table(vintage)

#     alcohol
Wine[1:10,"alcohol"] 
?str_sub
str_sub(Wine[1:100,"alcohol"],1,-2)		# all but last
alcohol <- as.numeric(str_sub(Wine[,"alcohol"],1,-2))
table(alcohol)

Wine[alcohol>100,"alcohol"]

alcohol[alcohol>100] <- NA
alcohol[alcohol==0]  <- NA
min(alcohol, na.rm=T)
i <- which(alcohol==1)   # seem like real data?
Wine[i,"alcohol"]
Wine[i,]

#     description   remove first sentence which tells color  (remove "" below)
description <- Wine[      1:10,"description"]; description
description <- Wine[ 5000+1:10,"description"]; description
description <- Wine[10000+1:10,"description"]; description
description <- Wine[15000+1:10,"description"]; description

description <- str_trim(str_replace(Wine[,"description"], "^.+(hue|color|cast)[.]",""))
description[1:5]
description[15000+1:10]
any(is.na(description))

i <- str_locate(description," red ")					# some reds are left elsewhere
length(i <- which(!is.na(i[,1])))						# 1013


# --- peek at conventional data

plot(vintage,alcohol)
sum(is.na(alcohol))
mean.na <- function(x) mean(x,na.rm=T)
m <- tapply(alcohol, vintage, mean.na)
plot(names(m), m, xlab="Vintage", ylab="Mean Alcohol")

plot(tapply(color=="Red", vintage, mean.na))

r <- which(color=="Red")
means <- tapply(alcohol[r], vintage[r], mean.na)
year  <- as.numeric(names(means))
plot(year, means)


# --- length of description (rough count of space delimited)

charLen <- str_length(description)				# num chars
charLen[1:10]
hist(charLen, breaks=50)
min(charLen)								# 0

str_split("one two three four", " ")				# nice
str_split("", " ")							# not so nice

num_words <- function(s) length(str_split(s," ")[[1]]) 	# convenience
description[1]
len <- sapply(description[1:3], num_words)
len <- sapply(description[1:3], num_words, USE.NAMES=FALSE)	# easier
len
str(len)
len <- unname(sapply(description[1:3], num_words))		# drop the names that come with it
str(len)
len <- unname(sapply(description, num_words))	
fivenum(len)
mean(len)									# 32.3 words


missing <- charLen==0							# boolean index
sum(missing)
description[missing] <- NA
hist(len[!missing], breaks=50, main="Lengths of Tasting Notes", xlab="# words")
mean(len[!missing])							# 32.8

boxplot(len ~ color)
summary(lm(len ~ color))						# whites have shorter descriptions on average

col <- ifelse(color=="Red",'red','gold')
plot(len, points, col=col)

i <- sample(1:nrow(Wine), 1000)
plot(len[i], points[i], col=col[i])		# longer ratings come with higher points

summary(lm(points ~ len * col))

# === save unified data frame  (could remove some now as well) with
#     balanced portion reserved for validation later 

id   	 <- Wine[,"id"]
location <- Wine[,"location"]
Wine[1:5,"date"]
date 	 <- as.Date(Wine[,"date"],"%b-%d-%Y")
date[1:5]

# replace the text matrix
Wine <- data.frame(description, variety, vintage, color, alcohol, points, price, id, location, date)

dim(Wine)								# 20888 x 10

######################################################################################
#
#	Collected here
#
######################################################################################

require(stringr)

# insert your own file path into the next command
rawText <- readLines("~/data/text/wine/tasting_data.txt", encoding="UTF-8") 
length(rawText)   # 187992

Wine <- matrix(rawText, ncol=9, byrow=T)
Wine <- Wine[,-1]		
colnames(Wine) <- c("id", "label", "description", "type", "alcohol", "location", "date", "rating")
dim(Wine)   # 20888 8

variety <- str_extract(Wine[,"type"], ".*,")
variety <- str_replace_all(variety, "&ocirc;","o")
variety <- str_replace_all(variety, "&egrave;","e")
variety <- str_replace_all(variety, "&auml;","a")
variety <- str_replace_all(variety, "&eacute;","e")
variety <- str_replace_all(variety, "Portugese","Portuguese")
variety <- str_replace_all(variety, "Regional ","")

color <- str_trim(str_extract(Wine[,"type"],"(^|\\s)(Red|White)($|\\s)"))	  # \ss is space

points <- as.numeric(str_extract(Wine[,"rating"], "^[0-9]+ "))

price <- str_extract(Wine[,"label"], "[$][0-9]+([.][0-9]+)*")
price <- str_replace(price,"[$]","")
price <- as.numeric(price)
price[ which.max(price) ] <- NA

vintage <- as.numeric(str_extract(Wine[,"label"], " (19|20)[0-9]{2}"))

alcohol <- as.numeric(str_sub(Wine[,"alcohol"],1,-2))
alcohol[alcohol>100] <- NA
alcohol[alcohol==0]  <- NA

description <- str_trim(str_replace(Wine[,"description"], "^.+(hue|color|cast)[.]",""))
charLen <- str_length(description)				# num chars
num_words <- function(s) length(str_split(s," ")[[1]]) 	# convenience
len <- unname(sapply(description, num_words))	
# len <- sapply(description, num_words, USE.NAMES=FALSE)	
description[charLen==0] <- NA

id <- Wine[,"id"]

location <- Wine[,"location"]

date 	 <- as.Date(Wine[,"date"],"%b-%d-%Y")

# replace the text matrix
Wine <- data.frame(description, variety, vintage, color, alcohol, points, price, id, location, date,					stringsAsFactors=FALSE)

# -----  sanity checks

length(table(Wine[,"variety"]))
table(Wine[,"color"])
hist(Wine[,"points"])
hist(Wine[,"price"], na.rm=TRUE)
table(Wine[,"vintage"])
table(Wine[,"alcohol"])
Wine[1:5,"description"]
Wine[1:5,"date"]

# -----  final clean up

#		remove those with no description
missingDesc <- is.na(Wine$description)
sum(missingDesc)							# 379
Wine <- Wine[!missingDesc,]
dim(Wine)								# 20509 x 10

#		take care of later 'outlier' problem
Wine[Wine$id==192747,]	# yuck
(i <- which(Wine$id==192747))
Wine <- Wine[-i,]
dim(Wine)								# 20508 x 10

# insert your own path for the file
save(Wine,file="R_scripts/Wine.Rdata")	

# optionally export
# write.csv(Wine, "data/Wine.csv")
######################################################################################
######################################################################################

#---------------------------------------------------------------------------
#
#	Tokenization and DTM
#
#---------------------------------------------------------------------------

install.packages('tm',        dependencies=TRUE)		# text mining

require(tm)

# --- recover saved data file (check dim 20508 x 10)
load(file="~/courses/mich/text_analytics/R_scripts/Wine.Rdata")
dim(Wine)

#		can use writeCorpus to save to file
wine.corpus <- Corpus(VectorSource(Wine$description)); wine.corpus
typeof(wine.corpus)
length(wine.corpus)
wine.corpus[[1]]

inspect(wine.corpus[1:4])
wine.corpus[[1]]$content

# --- first step of tokenization
wine.corpus <- tm_map(wine.corpus, content_transformer(tolower))
wine.corpus[[1]]$content

# document-term matrix is held in sparse form
DTM <- DocumentTermMatrix(wine.corpus)
dim(DTM)
DTM   
inspect(DTM[1:5,1:5])
	
# punctuation 'attached' to nearby chars unless remove all or separate earlier
findFreqTerms(DTM, lowfreq=5000)

# fix with a simple script (tokenizer_extra.sh) or using string in r
# Do you want to remove *all* punctuation???
require(stringr)
txt <- 'This is a grappy-apple (super [yum] good) wine!.'
gsub       (    "([][{}()*&^%$#@.,;:!-])"," \\1 ",txt)	# pick your own
str_replace(txt,"([][{}()*&^%$#@.,;:!-])"," \\1 ")		# not compatible regex
gsub       (    "([[:punct:]])", " \\1 ",txt)			# use punct
str_replace(txt,"([[:punct:]])", " \\1 ")			# not quite the same

gsub       (    "([[:punct:]])", " _\\1_ ",txt)			# use punct

f <- content_transformer(function(cont) gsub("([[:punct:]])", " _\\1_ ", cont))	# note the _ wrapper
wine.corpus <- tm_map(wine.corpus, f)
wine.corpus[[1]]$content

wine.corpus <- tm_map(wine.corpus, stripWhitespace)
wine.corpus[[1]]
str(wine.corpus[[1]])
wine.corpus[[1]]$content


#		new doc-term matrix   (documents: 20508, terms: 5471)
DTM <- DocumentTermMatrix(wine.corpus); DTM
findFreqTerms(DTM, lowfreq=5000)
findFreqTerms(DTM, lowfreq=4000)				# stemming will change... does fruit==fruity?

# --- additional possible tokenization...
# wine.corpus <- tm_map(wine.corpus, function(x)removeWords(x,stopwords()))
# wine.corpus <- tm_map(wine.corpus, removePunctuation)
# wine.corpus <- tm_map(corpus, removeNumbers))
# wine.corpus <- tm_map(docs, removeWords, c("robert", "parker"))
# optional stemming done later

# tokenized.corpus <- tm_map(wine.corpus, stemDocument)

# DTM <- DocumentTermMatrix(tokenized.corpus); DTM	# needs to be sparse
# findFreqTerms(DTM, lowfreq=5000)
# findFreqTerms(DTM, lowfreq=4000)


# --- distribution of word types: Zipf plot   matrix of counts C
freq  <- sort(colSums(as.matrix(DTM)), decreasing=T)
types <- names(freq)
freq[1:5]
types[1:5]
length(types)	# 5471

hist(freq, breaks=100)		# incredibly skewed
hist(log(freq), breaks=100)
hist(log(log(freq)), breaks=100)

zipf_plot(freq, n.label=10, n.fit=250)

# --- associations  (tm function)

findAssocs(DTM, 'fruit' , 0.2)
findAssocs(DTM, 'oak'   , 0.2)
findAssocs(DTM, 'tannin', 0.2)
findAssocs(DTM, 'chewi' , 0.2)

# --- rare types, oov
table(freq[freq<6])

#    1    2    3    4    5 
# 1808  653  366  272  186 

names(freq[freq==1])[c(1:20,250:310)]
names(freq[freq==2])[c(1:20,250:310)]
names(freq[freq==3])[c(1:20,250:310)]

#		stemming helps remove OOV, but at cost?
freq[ which(types=="cigar") ] 
freq[ which(types=="cigars") ]  # also plural once

#		not very many in big picture
sum(freq[freq>3])				# 755809 tokens for word types seen more than 3 times
sum(freq[freq<=3])				#   4212
sum(freq[freq<=3])/sum(freq[freq>3])	# 1/2 of 1%

# --- recode rare words in corpus (2827 of them) and build corpus & DTM
length(oov <- types[freq<=3])

oov[1:50]

insert_oov <- function (str) {
	ss <- str_split(str, " ")[[1]]	
	ss[ss %in% oov] <- "OOV"	
	str <- paste(ss[-length(ss)],collapse=" ")  # dump extraeous "" at end
	return(str)
	}
	

wine.corpus[[1]]$content
insert_oov(wine.corpus[[1]])

oovStrings <- rep(" ",length(wine.corpus))
for(i in 1:length(oovStrings)) oovStrings[i] <- insert_oov(wine.corpus[[i]]) # surely faster way!
length(oovStrings)

# rebuild corpus
oovCorpus <- Corpus(VectorSource(oovStrings)); oovCorpus
oovCorpus[[1]]$content
oovDTM <- DocumentTermMatrix(oovCorpus)
oovDTM           								# 20508 x 2645

#		sanity check
freq  <- sort(colSums(as.matrix(oovDTM)), decreasing=T)
types <- names(freq)
min(freq)  										# 4

# --- some of these names will bother R later, so fix them here
names <- colnames(oovDTM)
names[1:25]
names <- rename_punctuation(names)				# in text_utils.R
names[1:25]
colnames(oovDTM) <- names

# --- save DTM and matrix of frequencies for later use
save(oovDTM, file="R_scripts/oovDTM.Rdata")		# <<<-------------------

oovFreq <- as.matrix(oovDTM)
o <- order(colSums(oovFreq), decreasing=T)
oovFreq <- oovFreq[,o]
save(oovFreq, file="R_scripts/oovFreq.Rdata") 	# <<<-------------------

zipf_plot(colSums(oovFreq),n.label=10, n.fit=250)	# no big change here

total <- colSums(oovFreq)
total[1:10]
j <- which(colnames(oovFreq)=="oov")
total[j]  # 4208