# Logistic-regression analysis of the UVA survey data:
# read the file, drop incomplete rows, split into training/validation
# halves, fit a logistic model for Newbie, and cross-tabulate
# out-of-sample predictions against the truth.

# Configure categorical variable encodings: sum contrasts for unordered
# factors, polynomial contrasts for ordered factors.
options(contrasts = c("contr.sum", "contr.poly"))

# Read in the UVA data set. Adjust the first argument to point at
# wherever the file is stored, e.g. "C:/public/uva.txt".
# (No attach() here -- every modeling call below names its data
# explicitly via data=/subset=, which is safer.)
uva <- read.table(
  "uva.txt",
  sep = "\t",
  header = TRUE,
  na.strings = c("NA", "na")
)

# Quick look at the data and its dimensions.
summary(uva)
dim(uva)

# Filter out rows with any missing values.
uva.nomiss <- na.omit(uva)

# Check the dimensions again -- they should have shrunk.
dim(uva.nomiss)

# Split the complete-case rows into training and validation halves.
# NOTE: the split must be sized from nrow(uva.nomiss), not from the raw
# file's row count -- na.omit() removed rows, so hard-coded indices
# (e.g. sample(15432, 7716)) would point past the end of the data and
# inject NA rows into the validation set.
n <- nrow(uva.nomiss)
training <- sample(n, floor(n / 2))
validation <- setdiff(seq_len(n), training)

# Hold out the validation rows. Keep them as a data frame:
# as.matrix() would coerce every column to character as soon as any
# factor/character column is present.
validate.data <- uva.nomiss[validation, ]

# Logistic regression of Newbie on demographics, fit on the training
# rows only.
glm.out <- glm(
  Newbie ~ Education.Attainment + Age * Gender +
    Household.Income * Sexual.Preference,
  family = binomial,
  data = uva.nomiss,
  subset = training
)

# Out-of-sample predicted probabilities, thresholded at 0.5 into a
# simple 0/1 prediction. Use the generic predict() rather than calling
# the S3 method predict.glm() directly.
outsample.pred <- ifelse(
  predict(glm.out, newdata = validate.data, type = "response") > 0.5,
  1, 0
)

# Confusion table of true vs. predicted Newbie status. Reference the
# response by name -- validate.data[, 1] silently depends on column
# order.
table(validate.data$Newbie, outsample.pred)