# Set the session-wide contrast coding used when factors enter a model:
# the first entry applies to unordered factors, the second to ordered ones.
options(
  contrasts = c("contr.treatment", "contr.poly")
)


# Objective: can we characterise new users by demographics?
#
# Read the tab-delimited file "uva.txt" into the data frame `uva`. The first
# line of the file supplies the column names, and the strings "NA" and "na"
# are both read as missing values.
uva <- read.table(
  "uva.txt",
  sep = "\t",
  # Spelled out as TRUE: `T` is an ordinary variable that can be reassigned.
  header = TRUE,
  na.strings = c("NA", "na")
)

### Flag the rows whose continuous Age value is missing and keep only the
### rows where Age was observed.
miss.age <- is.na(uva[["Age"]])
uva <- uva[!miss.age, ]

# Put the columns of the filtered data frame on the search path so they can
# be referenced by bare name.
# NOTE(review): attach() exposes a copy taken at this point; later changes to
# `uva` are not reflected in the attached columns -- confirm this is intended.
attach(uva)

# Report the dimensions of the filtered data and a per-column summary.
dim(uva)
summary(uva)

# Recode every remaining missing value as the explicit category "MISSING".
# `uva.miss` is the logical matrix marking which cells were NA beforehand.
uva.miss <- is.na(uva)
for (col.name in names(uva)[colSums(uva.miss) > 0]) {
  # A factor only accepts values that are already among its levels; without
  # this step the assignment below would warn and leave the cells as NA.
  if (is.factor(uva[[col.name]])) {
    levels(uva[[col.name]]) <- union(levels(uva[[col.name]]), "MISSING")
  }
  # For non-factor columns this coerces the column to character, exactly as
  # the matrix-indexed assignment `uva[uva.miss] <- "MISSING"` would.
  uva[[col.name]][uva.miss[, col.name]] <- "MISSING"
}

# Randomly choose half of the rows (rounded down) for the training set.
# The row count is taken from the data rather than hard-coded, so the split
# stays valid if the number of rows kept after filtering changes.
# (Call set.seed() before this point if a reproducible split is needed.)
n.obs <- nrow(uva)
training <- sample(n.obs, n.obs %/% 2)

# The validation set is every row index that was not drawn for training.
# setdiff() is used instead of negative indexing because `x[-integer(0)]`
# would return nothing when `training` is empty.
validation <- setdiff(seq_len(n.obs), training)

# Fit a logistic regression (binomial family) of Newbie on Age, Age squared,
# Household.Income and Major.Occupation, using only the training rows.
#
# The quadratic term must be wrapped in I(): inside a formula `^` means
# factor crossing, so a bare `Age^2` collapses to `Age` and the squared term
# would be silently dropped from the model.
glm.out <- glm(
  Newbie ~ Age + I(Age^2) + Household.Income + Major.Occupation,
  family = binomial,
  data = uva,
  subset = training
)

# Score the validation rows with the fitted model. type = "response" asks
# for predicted probabilities rather than values on the link scale.
#
# Each probability is then turned into a simple YES/NO prediction:
# 1 when it is strictly greater than 0.5, otherwise 0.
outsample.pred <- local({
  p.hat <- predict(glm.out, newdata = uva[validation, ], type = "response")
  ifelse(p.hat > 0.5, 1, 0)
})