# Objective: can we characterise new users (Newbie) by demographics?
# Data: UVA web-user survey, tab-separated with a header row.

options(contrasts = c("contr.treatment", "contr.poly"))

# Read in the UVA data set; treat both "NA" and "na" as missing.
uva <- read.table("uva.txt", sep = "\t", header = TRUE,
                  na.strings = c("NA", "na"))

# Summarise the raw data.
summary(uva)

# How does Newbie status depend on Age?
# (Explicit uva$ references instead of attach(), which pollutes the
# search path and masks later assignments.)
table(uva$Age, uva$Newbie)

# Dimensions of the raw spreadsheet.
dim(uva)

# Filter out rows with any missing values.
uva.nomiss <- na.omit(uva)

# Check the dimensions again -- they should have shrunk.
dim(uva.nomiss)

# Reproducible 50/50 random split into training and validation rows.
# NOTE: sample from the post-na.omit row count, not a hard-coded total --
# sampling 1:15432 could index past the end of the filtered data.
set.seed(1)
n.rows <- nrow(uva.nomiss)
training <- sample(n.rows, floor(n.rows / 2))
# The validation rows are those not chosen for training.
validation <- seq_len(n.rows)[-training]

# Simple logistic regression of Newbie on Age (kept under its own name
# so the detailed model below does not silently overwrite it).
glm.simple <- glm(Newbie ~ Age, family = binomial,
                  data = uva.nomiss, subset = training)

# A more detailed model: cubic in Age, fully crossed with Gender and
# Household.Income.
glm.out <- glm(Newbie ~ poly(Age, 3) * Gender * Household.Income,
               family = binomial, data = uva.nomiss, subset = training)

# Out-of-sample prediction: turn fitted probabilities into a simple
# 0/1 (NO/YES) prediction at the 0.5 threshold.
outsample.pred <- ifelse(
  predict(glm.out, uva.nomiss[validation, ], type = "response") > 0.5, 1, 0)

# Compare the predictions with the true values on the validation set.
# Reference the response by name rather than assuming it is column 1
# (the original also coerced the mixed-type data frame through as.matrix).
table(uva.nomiss$Newbie[validation], outsample.pred)

#######
# For fun, see how the in-sample predictions do.
#######
insample.pred <- ifelse(
  predict(glm.out, uva.nomiss[training, ], type = "response") > 0.5, 1, 0)

# Compare the predictions with the true values on the training set.
table(uva.nomiss$Newbie[training], insample.pred)