# Logistic-regression analysis of the UVA survey data:
# read the file, drop incomplete rows, split into training/validation
# halves, fit a logistic model for Newbie, and cross-tabulate
# out-of-sample predictions against the truth.

# Configure categorical variable encodings: sum contrasts for unordered
# factors, polynomial contrasts for ordered factors.
options(contrasts = c("contr.sum", "contr.poly"))

# Read in the UVA data set. Adjust the first argument to point at
# wherever the file is stored, e.g. "C:/public/uva.txt".
# (No attach() here -- every modeling call below names its data
# explicitly via data=/subset=, which is safer.)
uva <- read.table(
  "uva.txt",
  sep = "\t",
  header = TRUE,
  na.strings = c("NA", "na")
)

# Quick look at the data and its dimensions.
summary(uva)
dim(uva)

# Filter out rows with any missing values.
uva.nomiss <- na.omit(uva)

# Check the dimensions again -- they should have shrunk.
dim(uva.nomiss)

# Split the complete-case rows into training and validation halves.
# NOTE: the split must be sized from nrow(uva.nomiss), not from the raw
# file's row count -- na.omit() removed rows, so hard-coded indices
# (e.g. sample(15432, 7716)) would point past the end of the data and
# inject NA rows into the validation set.
n <- nrow(uva.nomiss)
training <- sample(n, floor(n / 2))
validation <- setdiff(seq_len(n), training)

# Hold out the validation rows. Keep them as a data frame:
# as.matrix() would coerce every column to character as soon as any
# factor/character column is present.
validate.data <- uva.nomiss[validation, ]

# Logistic regression of Newbie on demographics, fit on the training
# rows only.
glm.out <- glm(
  Newbie ~ Education.Attainment + Age * Gender +
    Household.Income * Sexual.Preference,
  family = binomial,
  data = uva.nomiss,
  subset = training
)

# Out-of-sample predicted probabilities, thresholded at 0.5 into a
# simple 0/1 prediction. Use the generic predict() rather than calling
# the S3 method predict.glm() directly.
outsample.pred <- ifelse(
  predict(glm.out, newdata = validate.data, type = "response") > 0.5,
  1, 0
)

# Confusion table of true vs. predicted Newbie status. Reference the
# response by name -- validate.data[, 1] silently depends on column
# order.
table(validate.data$Newbie, outsample.pred)