### Lasso --------------------------------------------------------------------
#
# Lasso regression with cross-validation on the osteo data.
#
# The input CSV is pre-cleaned: categorical variables were already converted
# to numeric dummy variables (otherwise use model.matrix, as illustrated in
# the second example), and missingness indicators (Miss.*) were added.
#
# -----------------------------------------------------------------------------

# Reset plotting parameters to tighter margins.
# mfrow: panel layout passed straight through to par().
reset <- function(mfrow = c(1, 1)) {
  # mar order is bottom, left, top, right
  par(mfrow = mfrow, mgp = c(2, 1, 0), mar = c(3, 3, 3, 1))
}
reset()

library(glmnet)

# Load data -------------------------------------------------------------------
Osteo <- read.csv("/Users/bob/courses/mich/data_mining/data/osteo_big.csv")
colnames(Osteo)          # ZHIP, AGE, WEIGHT, YR_POST, ..., Miss.WEIGHT
dim(Osteo)               # 1232 x 209

# Drop any rows that still contain missing values (listwise deletion).
Osteo <- na.omit(Osteo)

# Response is ZHIP; predictor matrix is everything else.
# model.matrix() expands factors to dummies; [, -1] drops the intercept column.
x <- model.matrix(ZHIP ~ ., Osteo)[, -1]
y <- Osteo$ZHIP

# Train/test split ------------------------------------------------------------
set.seed(1)
train  <- sample(seq_len(nrow(x)), floor(0.75 * nrow(x)))
test   <- -train          # negative indices exclude the training rows
y.test <- y[test]

# Lasso -----------------------------------------------------------------------
# Grid of lambda values spanning 1e10 down to 1e-2 (log scale).
grid <- 10^seq(10, -2, length.out = 100)

lasso.mod <- glmnet(x[train, ], y[train], alpha = 1, lambda = grid)
plot(lasso.mod)           # coefficient paths as lambda shrinks

# Choose lambda by cross-validation on the training set.
set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 1)
plot(cv.out)
bestlam <- cv.out$lambda.min

# Test-set mean squared error at the CV-chosen lambda.
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[test, ])
mean((lasso.pred - y.test)^2)

# Refit on the full data and inspect the (sparse) lasso coefficients.
out <- glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out, type = "coefficients", s = bestlam)[1:20, ]
lasso.coef
sum(lasso.coef == 0)      # how many of the first 20 were zeroed out
lasso.coef[lasso.coef != 0]