#################################################################################
###                                                                           ###
###   BMTRY 790: Machine Learning                                             ###
###   Spring 2023                                                             ###
###                                                                           ###
###   LECTURES 16/17: Fitting boosting models using AdaBoost, bagging models, ###
###   and random forest models. The 2-class problem uses treatment response   ###
###   in lupus nephritis; the multiclass problem uses the breast tissue data. ###
###                                                                           ###
#################################################################################

### ADABOOST ###

library(adabag)
library(ada)
library(caret)

######################################################
###   Example of Boosted Ensemble Using AdaBoost   ###
###                                                ###
###   2-CLASS CLASSIFICATION EXAMPLE               ###
###   TREATMENT RESPONSE IN LUPUS NEPHRITIS        ###
######################################################

LN<-read.csv("H:/public_html/BMTRY790_Spring2023/Datasets/LupusNephritis.csv")

ids1<-which(LN$CR90==1)
ids0<-which(LN$CR90==0)
LN$CR90<-as.factor(ifelse(LN$CR90==1, "Class1", "Class0"))

set.seed(1234)
trn1<-sample(ids1, .67*length(ids1), replace=F)
trn0<-sample(ids0, .67*length(ids0), replace=F)
sub<-sort(c(trn1, trn0))

### First looking at the adabag package for boosting
### Choose model fitting parameters using caret (this is computationally slow...)
grid<-expand.grid(mfinal=c(100, 250, 500), maxdepth=c(2:6), coeflearn="Freund")
trBoost1<-train(CR90~., data=LN, method="AdaBoost.M1", tuneGrid=grid,
                trControl=trainControl(method="cv", number=10, classProbs=TRUE))

### Fitting model using selected parameters
BoostMod1<-boosting(CR90~., data=LN[sub,], boos=F, coeflearn="Freund", mfinal=250,
                    control=rpart.control(maxdepth=5))

### Estimating the margins for the boosting model
margins(BoostMod1, LN[sub,])
trn.margins<-margins(BoostMod1, LN[sub,])
tst.margins<-margins(predict(BoostMod1, newdata=LN[-sub,]), LN[-sub,])

### Looking at test error with increasing numbers of trees
trnerr.change<-errorevol(BoostMod1, LN[sub,])
tsterr.change<-errorevol(BoostMod1, LN[-sub,])

plot(tsterr.change$error, type="l", ylim=c(0, 0.6),
     main="Boosting Error vs. Number of Trees",
     xlab="Iterations", ylab="Error", col=2, lwd=2)
lines(trnerr.change$error, col=4, lty=2, lwd=2)
legend("topleft", c("test", "train"), col=c(2,4), lty=1:2, lwd=2, bty="n")

### Using the CV function to estimate test data error rate using FULL data
BoostMod1cv<-boosting.cv(CR90~., data=LN, v=10, boos=F, coeflearn="Freund",
                         mfinal=250, control=rpart.control(maxdepth=5))
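
### A quick sketch of inspecting the cross-validated fit: boosting.cv should
### return the CV-predicted classes along with a confusion matrix and the
### average error rate
BoostMod1cv$confusion
BoostMod1cv$error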

######################################################
###   Example of Boosted Ensemble Using ada package ###
######################################################

### Now using the ada package for boosting
### Choose model fitting parameters using caret (this is computationally slow...)
grid<-expand.grid(iter=c(100, 250, 500), maxdepth=c(2:6), nu=c(0.05, 0.2, 0.6, 1))
trBoost2<-train(CR90~., data=LN, method="ada", tuneGrid=grid,
                trControl=trainControl(method="cv", number=5, classProbs=TRUE))

### Fitting model using selected parameters
BoostMod2<-ada(CR90~., data=LN[sub,], iter=250, nu=0.05, bag.frac=1,
               test.x=LN[-sub,-11], test.y=LN[-sub,11],
               control=rpart.control(maxdepth=5))

table(predict(BoostMod2, newdata=LN[-sub,], type="vector"), LN$CR90[-sub])
plot(BoostMod2, TRUE, TRUE)
varplot(BoostMod2)

######################################################
###   Example of Boosted Ensemble Using SAMME      ###
###                                                ###
###   MULTI-CLASS CLASSIFICATION EXAMPLE           ###
###   DIFFERENTIATING BREAST TISSUE TYPES          ###
######################################################

btis<-read.csv("H:\\public_html\\BMTRY790_Spring2023\\Datasets\\BreastTissue.csv")
btis<-btis[,-1]
### Make sure the class label is a factor (read.csv no longer does this by default)
btis$Class<-as.factor(btis$Class)

set.seed(1234)
sub<-sort(sample(1:nrow(btis), 0.67*nrow(btis), replace=F))

### Choose model fitting parameters using caret
grid<-expand.grid(mfinal=c(100, 250, 500), maxdepth=c(2:6), coeflearn="Zhu")
trSamme1<-train(Class~., data=btis, method="AdaBoost.M1", tuneGrid=grid,
                trControl=trainControl(method="cv", number=5, classProbs=TRUE))

SammeMod1<-boosting(Class~., data=btis[sub,], boos=F, coeflearn="Zhu", mfinal=250,
                    control=rpart.control(maxdepth=5))

importanceplot(SammeMod1)

### Margins on the training and test data
margins(SammeMod1, btis[sub,])
margins(predict(SammeMod1, newdata=btis[-sub,]), btis[-sub,])

### Looking at test error with increasing numbers of trees
prtrn<-predict(SammeMod1, newdata=btis[sub,]);  prtrn$confusion
prtst<-predict(SammeMod1, newdata=btis[-sub,]); prtst$confusion

trnerr.change<-errorevol(SammeMod1, btis[sub,])
tsterr.change<-errorevol(SammeMod1, btis[-sub,])

plot(tsterr.change$error, type="l", ylim=c(0, 0.6),
     main="Boosting Error vs. Number of Trees",
     xlab="Iterations", ylab="Error", col=2, lwd=2)
lines(trnerr.change$error, col=4, lty=2, lwd=2)
legend("topleft", c("test", "train"), col=c(2,4), lty=1:2, lwd=2, bty="n")

######################################################
###   Example of random forest using randomForest  ###
###                                                ###
###   2-CLASS CLASSIFICATION EXAMPLE               ###
###   TREATMENT RESPONSE IN LUPUS NEPHRITIS        ###
######################################################

library(randomForest)

### Start by tuning model for number of predictors at each split
tuneRF(x=LN[,-11], y=LN[,11], mtryStart=2, ntreeTry=500, stepFactor=1.5,
       improve=0.0005, trace=TRUE, plot=TRUE, doBest=FALSE)

### OOB error as a function of the number of trees, from a single large forest
RFmod<-randomForest(CR90 ~ ., data=LN, subset=sub, ntree=1000, mtry=3, importance=T)
plot(1:1000, RFmod$err.rate[,1], xlab="ntrees", ylab="OOB error",
     main="Estimated from Model", pch=16, cex=0.5, col=2)

### OOB error from separate forests of increasing size
sz<-c(1:200)*5
ooberbysz<-c()
for (i in 1:200) {
  RFmod<-randomForest(CR90 ~ ., data=LN, subset=sub, ntree=sz[i], mtry=3,
                      importance=T, keep.inbag=T)
  ooberbysz<-append(ooberbysz, RFmod$err.rate[sz[i],1])
}
plot(sz, ooberbysz, xlab="ntrees", ylab="OOB error",
     main="Estimated from Separate Models", pch=16, cex=0.5, col=4)

### Now that we've selected ntree and mtry, let's fit a model
RFmod<-randomForest(CR90 ~ ., data=LN, subset=sub, ntree=250, mtry=3, importance=T)
RFmod$importance
varImpPlot(RFmod)

### Prediction error on test data
table(predict(RFmod, newdata=LN[-sub,]), LN$CR90[-sub])

### Determining how many predictors are desirable
set.seed(1234)
RFcv1<-rfcv(trainx=LN[sub,-11], trainy=LN$CR90[sub], scale="log", cv.fold=10, step=0.9)
names(RFcv1)
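
### Sketch of examining the rfcv output: n.var holds the number of predictors
### used at each step and error.cv the corresponding cross-validated error, so
### plotting one against the other shows how many predictors are worth keeping
with(RFcv1, plot(n.var, error.cv, log="x", type="o", lwd=2,
                 xlab="Number of predictors", ylab="CV error"))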

######################################################
###   Example of random forest using randomForest  ###
###                                                ###
###   MULTI-CLASS CLASSIFICATION EXAMPLE           ###
###   DIFFERENTIATING BREAST TISSUE TYPES          ###
######################################################

### Start by tuning model for number of predictors at each split
tuneRF(x=btis[,-1], y=btis[,1], mtryStart=2, ntreeTry=500, stepFactor=1.5,
       improve=0.0005, trace=TRUE, plot=TRUE, doBest=FALSE)

### OOB error as a function of the number of trees, from a single large forest
RFmod<-randomForest(Class~., data=btis, subset=sub, ntree=1000, mtry=3, importance=T)
plot(1:1000, RFmod$err.rate[,1], xlab="ntrees", ylab="OOB error",
     main="Estimated from Model", pch=16, cex=0.5, col=2)

### OOB error from separate forests of increasing size
sz<-c(1:200)*5
ooberbysz<-c()
for (i in 1:200) {
  RFmod<-randomForest(Class~., data=btis, subset=sub, ntree=sz[i], mtry=3,
                      importance=T, keep.inbag=T)
  ooberbysz<-append(ooberbysz, RFmod$err.rate[sz[i],1])
}
plot(sz, ooberbysz, xlab="ntrees", ylab="OOB error",
     main="Estimated from Separate Models", pch=16, cex=0.5, col=4)

### Now that we've selected ntree and mtry, let's fit a model
RFmod<-randomForest(Class ~ ., data=btis[sub,], ntree=500, mtry=3, importance=T)
names(RFmod)
round(RFmod$importance, digits=4)
varImpPlot(RFmod)
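
### Prediction error on the held-out test data, mirroring the 2-class example above
table(predict(RFmod, newdata=btis[-sub,]), btis$Class[-sub])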