# Contrast coding used when factors enter a model: treatment contrasts for
# unordered factors, polynomial contrasts for ordered factors.
options(contrasts = c("contr.treatment", "contr.poly"))

# Objective: can we characterise new users by demographics?

# Read the tab-delimited uva dataset. Both "NA" and "na" are treated as
# missing values.
uva <- read.table(
  "uva.txt",
  sep = "\t",
  header = TRUE,
  na.strings = c("NA", "na")
)

### Identify missing values on the continuous age variable and ditch them
miss.age <- is.na(uva$Age)
uva <- uva[!miss.age, ]

# NOTE(review): attach() is discouraged and is kept only in case later code
# refers to the columns by bare name. The attached copy is a snapshot taken
# here, i.e. before the "MISSING" recode below, so it still contains NAs.
attach(uva)
dim(uva)
summary(uva)

# Turn the remaining missing values into an explicit "MISSING" category so the
# affected rows are not dropped by the model fit. Age has no NAs left at this
# point, so it stays numeric. Factor columns need the new level added first;
# otherwise the assignment only warns ("invalid factor level") and leaves NA.
# Any other column that still contains NAs is coerced to character by the
# assignment, exactly as with a plain `uva[uva.miss] <- "MISSING"`.
uva.miss <- is.na(uva)
uva[] <- lapply(uva, function(column) {
  na_rows <- is.na(column)
  if (any(na_rows)) {
    if (is.factor(column)) {
      levels(column) <- union(levels(column), "MISSING")
    }
    column[na_rows] <- "MISSING"
  }
  column
})

# Randomly choose half of the rows for the training set. The size is derived
# from the data rather than hard-coded, so the split stays valid if the number
# of rows changes (for 15432 rows this is still sample(15432, 7716)).
# No seed is set in this section, so the split differs from run to run.
n.obs <- nrow(uva)
training <- sample(n.obs, n.obs %/% 2)

# The validation rows are those that were not drawn for training.
validation <- setdiff(seq_len(n.obs), training)

# Logistic regression of Newbie on age (linear and quadratic), household
# income and major occupation, fitted on the training rows only.
# The quadratic term must be wrapped in I(): inside a formula `Age^2` is
# formula crossing, which reduces to `Age`, so the squared term would be
# silently dropped.
glm.out <- glm(
  Newbie ~ Age + I(Age^2) + Household.Income + Major.Occupation,
  family = binomial,
  data = uva,
  subset = training
)

# Here is how you predict on the validation data set:
# predict(glm.out, uva[validation, ], type = "response")

# Turn the predicted probabilities into a simple YES/NO (1/0) prediction,
# using 0.5 as the cut-off.
outsample.pred <- ifelse(
  predict(glm.out, uva[validation, ], type = "response") > 0.5,
  1,
  0
)