#####################################################################
### BMTRY 790: MACHINE LEARNING AND DATA MINING, Spring 2023      ###
###                                                               ###
### Lecture 3: Penalized Regression, Part I                       ###
###                                                               ###
### Body Fat Data Analysis                                        ###
###                                                               ###
### Looking at linear regression, ridge regression, and lasso     ###
### regression using the body fat data                            ###
#####################################################################

library(MASS)
library(leaps)
library(lars)

bodyfat <- read.csv("H:/public_html/BMTRY790_Spring2023/Datasets/Body_fat.csv")

### Name of the outcome column; every model below regresses it on the
### remaining columns.
outcome <- "PBF"
stopifnot(outcome %in% names(bodyfat))

### Drop any column that precedes the outcome (e.g. a row-index column added
### when the CSV was written) so that the outcome is always column 1. This
### replaces the earlier mix of positional indices (2:14 vs. 3:15, [,-1]) that
### could not all be correct for the same column layout.
### NOTE(review): assumes every column after PBF is a predictor -- confirm
### against the CSV.
pbf_col <- match(outcome, names(bodyfat))
bodyfat <- bodyfat[, pbf_col:ncol(bodyfat), drop = FALSE]
predictors <- setdiff(names(bodyfat), outcome)
n_pred <- length(predictors)
stopifnot(n_pred >= 1)

pairs(bodyfat, col = "purple")

### Full linear regression model on the raw data
mod13 <- lm(PBF ~ ., data = bodyfat)
summary(mod13)

### Best subset, forward, and backward selection approaches
subs_subset <- regsubsets(PBF ~ ., data = bodyfat, nvmax = n_pred,
                          method = "exhaustive")
subs_forward <- regsubsets(PBF ~ ., data = bodyfat, nvmax = n_pred,
                           method = "forward")
subs_backward <- regsubsets(PBF ~ ., data = bodyfat, nvmax = n_pred,
                            method = "backward")

####################################
### Scaling and Centering Data   ###
####################################

### For penalized regression, we want to center and scale the variables before
### fitting the model. We can do this using the scale function in R (this
### subtracts the mean and divides by the SD for every column of a matrix).
### NOTE: this version scales the outcome as well.
bodyfat2 <- as.data.frame(scale(bodyfat))
### If we only wanted to scale the covariates:
# bodyfat2 <- cbind(bodyfat[outcome], scale(bodyfat[predictors]))

#############################################################
### Full Regression Model on scaled data (for comparison) ###
#############################################################

mod13 <- lm(PBF ~ ., data = bodyfat2)
summary(mod13)

########################################################
### Fitting model using Forward Stagewise approach   ###
########################################################

### lars() documents x as a matrix, so convert the data frame explicitly.
### type = "for" would also work as an abbreviation, but the full name avoids
### relying on partial matching.
mod_fsw <- lars(x = as.matrix(bodyfat2[, predictors, drop = FALSE]),
                y = bodyfat2[[outcome]],
                type = "forward.stagewise")
summary(mod_fsw)

par(mfrow = c(1, 2))
plot(mod_fsw, breaks = FALSE)
plot(mod_fsw, breaks = FALSE, plottype = "Cp")

### Coefficients stored in row 10 of the forward stagewise coefficient path
round(mod_fsw$beta[10, ], 3)

#################################
### Fitting Ridge Model       ###
#################################

### Sequence of lambdas for consideration when fitting the ridge model
### (fine steps near 0, coarser steps further out; the last segment starts at
### 100 so the grid has no gap between 99.9 and 101).
lam <- c(seq(0, 9.99, by = 0.01),
         seq(10, 99.9, by = 0.1),
         seq(100, 10000, by = 1))
ridgemod <- lm.ridge(PBF ~ ., data = bodyfat2, lambda = lam)

### Provides the value of lambda that generated the smallest GCV.
### Namespaced so it cannot be masked by another package's select().
MASS::select(ridgemod)

### Same GCV-optimal lambda, kept as a value for the plots and the final fit
### (the lecture notes hard-coded 1.18 here).
lam_gcv <- ridgemod$lambda[which.min(ridgemod$GCV)]

### Trace plot for the ridge models
plot(ridgemod)

### Draw a ridge trace plot with more control over the graphical parameters.
###   fit           : object returned by lm.ridge() for a vector of lambdas
###   xlim          : range of lambda shown on the x axis
###   lwd           : line width of the coefficient traces
###   mark          : lambda at which to draw a dotted vertical reference line
###   legend_labels : optional labels (one per coefficient); when supplied, a
###                   legend is drawn at (6000, 1.2)
### The first 8 coefficients are drawn solid in colours 1-8; later ones reuse
### those colours with the next line type.
draw_ridge_trace <- function(fit, xlim, lwd = 1, mark = NULL,
                             legend_labels = NULL) {
  idx <- seq_len(nrow(fit$coef))
  line_col <- (idx - 1) %% 8 + 1
  line_lty <- (idx - 1) %/% 8 + 1
  plot(x = 0, y = 0, xlim = xlim, ylim = c(-0.3, 1.2), type = "n",
       ylab = "Coefficients", xlab = "Lambda")
  ### One trace per coefficient: columns of t(coef) are the coefficients
  matlines(fit$lambda, t(fit$coef), col = line_col, lty = line_lty, lwd = lwd)
  if (!is.null(mark)) {
    abline(v = mark, lty = 3, col = "salmon", lwd = 3)
  }
  if (!is.null(legend_labels)) {
    legend(6000, 1.2, legend_labels, col = line_col, lty = line_lty,
           lwd = 2, cex = 0.9, bty = "n")
  }
  invisible(NULL)
}

par(mfrow = c(1, 2))
### Whole lambda grid, with legend
draw_ridge_trace(ridgemod, xlim = c(0, 10000), mark = lam_gcv,
                 legend_labels = predictors)
### Zoomed in on small lambdas
draw_ridge_trace(ridgemod, xlim = c(0, 25), lwd = 2, mark = lam_gcv)

### Final ridge model at the GCV-optimal lambda, fit on the same data as the
### grid search above
ridgemod <- lm.ridge(PBF ~ ., data = bodyfat2, lambda = lam_gcv)
ridgemod