# Objective: can we characterise new users (Newbie) by demographics?
# Data: UVA web-user survey, tab-separated with a header row.

options(contrasts = c("contr.treatment", "contr.poly"))

# Read in the UVA data set; treat both "NA" and "na" as missing.
uva <- read.table("uva.txt", sep = "\t", header = TRUE,
                  na.strings = c("NA", "na"))

# Summarise the raw data.
summary(uva)

# How does Newbie status depend on Age?
# (Explicit uva$ references instead of attach(), which pollutes the
# search path and masks later assignments.)
table(uva$Age, uva$Newbie)

# Dimensions of the raw spreadsheet.
dim(uva)

# Filter out rows with any missing values.
uva.nomiss <- na.omit(uva)

# Check the dimensions again -- they should have shrunk.
dim(uva.nomiss)

# Reproducible 50/50 random split into training and validation rows.
# NOTE: sample from the post-na.omit row count, not a hard-coded total --
# sampling 1:15432 could index past the end of the filtered data.
set.seed(1)
n.rows <- nrow(uva.nomiss)
training <- sample(n.rows, floor(n.rows / 2))
# The validation rows are those not chosen for training.
validation <- seq_len(n.rows)[-training]

# Simple logistic regression of Newbie on Age (kept under its own name
# so the detailed model below does not silently overwrite it).
glm.simple <- glm(Newbie ~ Age, family = binomial,
                  data = uva.nomiss, subset = training)

# A more detailed model: cubic in Age, fully crossed with Gender and
# Household.Income.
glm.out <- glm(Newbie ~ poly(Age, 3) * Gender * Household.Income,
               family = binomial, data = uva.nomiss, subset = training)

# Out-of-sample prediction: turn fitted probabilities into a simple
# 0/1 (NO/YES) prediction at the 0.5 threshold.
outsample.pred <- ifelse(
  predict(glm.out, uva.nomiss[validation, ], type = "response") > 0.5, 1, 0)

# Compare the predictions with the true values on the validation set.
# Reference the response by name rather than assuming it is column 1
# (the original also coerced the mixed-type data frame through as.matrix).
table(uva.nomiss$Newbie[validation], outsample.pred)

#######
# For fun, see how the in-sample predictions do.
#######
insample.pred <- ifelse(
  predict(glm.out, uva.nomiss[training, ], type = "response") > 0.5, 1, 0)

# Compare the predictions with the true values on the training set.
table(uva.nomiss$Newbie[training], insample.pred)