#################################################################################
###                                                                           ###
###   BMTRY 790: Machine Learning                                             ###
###   Spring 2023                                                             ###
###                                                                           ###
###   LECTURES 16/17: Fitting boosting models using AdaBoost, bagging models, ###
###   and random forest models. The 2-class problem uses treatment response   ###
###   in lupus nephritis; the multiclass problem uses the breast tissue data. ###
###                                                                           ###
#################################################################################

### ADABOOST ###

library(adabag)
library(ada)
library(caret)

######################################################
###   Example of Boosted Ensemble Using AdaBoost   ###
###                                                ###
###   2-CLASS CLASSIFICATION EXAMPLE               ###
###   TREATMENT RESPONSE IN LUPUS NEPHRITIS        ###
######################################################

LN<-read.csv("H:/public_html/BMTRY790_Spring2023/Datasets/LupusNephritis.csv")

ids1<-which(LN$CR90==1)
ids0<-which(LN$CR90==0)
LN$CR90<-as.factor(ifelse(LN$CR90==1, "Class1", "Class0"))

set.seed(1234)
trn1<-sample(ids1, .67*length(ids1), replace=F)
trn0<-sample(ids0, .67*length(ids0), replace=F)
sub<-sort(c(trn1, trn0))

### First looking at the adabag package for boosting
### Choose model fitting parameters using caret (this is computationally slow...)
grid<-expand.grid(mfinal=c(100, 250, 500), maxdepth=c(2:6), coeflearn="Freund")
trBoost1<-train(CR90~., data=LN, method="AdaBoost.M1", tuneGrid=grid,
                trControl=trainControl(method="cv", number=10, classProbs=TRUE))

### Fitting model using selected parameters
BoostMod1<-boosting(CR90~., data=LN[sub,], boos=F, coeflearn="Freund", mfinal=250,
                    control=rpart.control(maxdepth=5))

### Estimating the margins for the boosting model
margins(BoostMod1, LN[sub,])
trn.margins<-margins(BoostMod1, LN[sub,])
tst.margins<-margins(predict(BoostMod1, newdata=LN[-sub,]), LN[-sub,])

### Looking at test error with increasing numbers of trees
trnerr.change<-errorevol(BoostMod1, LN[sub,])
tsterr.change<-errorevol(BoostMod1, LN[-sub,])

plot(tsterr.change$error, type="l", ylim=c(0, 0.6),
     main="Boosting Error vs. Number of Trees",
     xlab="Iterations", ylab="Error", col=2, lwd=2)
lines(trnerr.change$error, col=4, lty=2, lwd=2)
legend("topleft", c("test", "train"), col=c(2,4), lty=1:2, lwd=2, bty="n")

### Using the CV function to estimate test data error rate using FULL data
BoostMod1cv<-boosting.cv(CR90~., data=LN, v=10, boos=F, coeflearn="Freund",
                         mfinal=250, control=rpart.control(maxdepth=5))
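
### A quick sketch of inspecting the cross-validated fit: boosting.cv should
### return the CV-predicted classes along with a confusion matrix and the
### average error rate
BoostMod1cv$confusion
BoostMod1cv$error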

######################################################
###   Example of Boosted Ensemble Using ada package ###
######################################################

### Now using the ada package for boosting
### Choose model fitting parameters using caret (this is computationally slow...)
grid<-expand.grid(iter=c(100, 250, 500), maxdepth=c(2:6), nu=c(0.05, 0.2, 0.6, 1))
trBoost2<-train(CR90~., data=LN, method="ada", tuneGrid=grid,
                trControl=trainControl(method="cv", number=5, classProbs=TRUE))

### Fitting model using selected parameters
BoostMod2<-ada(CR90~., data=LN[sub,], iter=250, nu=0.05, bag.frac=1,
               test.x=LN[-sub,-11], test.y=LN[-sub,11],
               control=rpart.control(maxdepth=5))

table(predict(BoostMod2, newdata=LN[-sub,], type="vector"), LN$CR90[-sub])
plot(BoostMod2, TRUE, TRUE)
varplot(BoostMod2)

######################################################
###   Example of Boosted Ensemble Using SAMME      ###
###                                                ###
###   MULTI-CLASS CLASSIFICATION EXAMPLE           ###
###   DIFFERENTIATING BREAST TISSUE TYPES          ###
######################################################

btis<-read.csv("H:\\public_html\\BMTRY790_Spring2023\\Datasets\\BreastTissue.csv")
btis<-btis[,-1]
### Make sure the class label is a factor (read.csv no longer does this by default)
btis$Class<-as.factor(btis$Class)

set.seed(1234)
sub<-sort(sample(1:nrow(btis), 0.67*nrow(btis), replace=F))

### Choose model fitting parameters using caret
grid<-expand.grid(mfinal=c(100, 250, 500), maxdepth=c(2:6), coeflearn="Zhu")
trSamme1<-train(Class~., data=btis, method="AdaBoost.M1", tuneGrid=grid,
                trControl=trainControl(method="cv", number=5, classProbs=TRUE))

SammeMod1<-boosting(Class~., data=btis[sub,], boos=F, coeflearn="Zhu", mfinal=250,
                    control=rpart.control(maxdepth=5))

importanceplot(SammeMod1)

### Margins on the training and test data
margins(SammeMod1, btis[sub,])
margins(predict(SammeMod1, newdata=btis[-sub,]), btis[-sub,])

### Looking at test error with increasing numbers of trees
prtrn<-predict(SammeMod1, newdata=btis[sub,]);  prtrn$confusion
prtst<-predict(SammeMod1, newdata=btis[-sub,]); prtst$confusion

trnerr.change<-errorevol(SammeMod1, btis[sub,])
tsterr.change<-errorevol(SammeMod1, btis[-sub,])

plot(tsterr.change$error, type="l", ylim=c(0, 0.6),
     main="Boosting Error vs. Number of Trees",
     xlab="Iterations", ylab="Error", col=2, lwd=2)
lines(trnerr.change$error, col=4, lty=2, lwd=2)
legend("topleft", c("test", "train"), col=c(2,4), lty=1:2, lwd=2, bty="n")

######################################################
###   Example of random forest using randomForest  ###
###                                                ###
###   2-CLASS CLASSIFICATION EXAMPLE               ###
###   TREATMENT RESPONSE IN LUPUS NEPHRITIS        ###
######################################################

library(randomForest)

### Start by tuning model for number of predictors at each split
tuneRF(x=LN[,-11], y=LN[,11], mtryStart=2, ntreeTry=500, stepFactor=1.5,
       improve=0.0005, trace=TRUE, plot=TRUE, doBest=FALSE)

### OOB error as a function of the number of trees, from a single large forest
RFmod<-randomForest(CR90 ~ ., data=LN, subset=sub, ntree=1000, mtry=3, importance=T)
plot(1:1000, RFmod$err.rate[,1], xlab="ntrees", ylab="OOB error",
     main="Estimated from Model", pch=16, cex=0.5, col=2)

### OOB error from separate forests of increasing size
sz<-c(1:200)*5
ooberbysz<-c()
for (i in 1:200) {
  RFmod<-randomForest(CR90 ~ ., data=LN, subset=sub, ntree=sz[i], mtry=3,
                      importance=T, keep.inbag=T)
  ooberbysz<-append(ooberbysz, RFmod$err.rate[sz[i],1])
}
plot(sz, ooberbysz, xlab="ntrees", ylab="OOB error",
     main="Estimated from Separate Models", pch=16, cex=0.5, col=4)

### Now that we've selected ntree and mtry, let's fit a model
RFmod<-randomForest(CR90 ~ ., data=LN, subset=sub, ntree=250, mtry=3, importance=T)
RFmod$importance
varImpPlot(RFmod)

### Prediction error on test data
table(predict(RFmod, newdata=LN[-sub,]), LN$CR90[-sub])

### Determining how many predictors are desirable
set.seed(1234)
RFcv1<-rfcv(trainx=LN[sub,-11], trainy=LN$CR90[sub], scale="log", cv.fold=10, step=0.9)
names(RFcv1)
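
### Sketch of examining the rfcv output: n.var holds the number of predictors
### used at each step and error.cv the corresponding cross-validated error, so
### plotting one against the other shows how many predictors are worth keeping
with(RFcv1, plot(n.var, error.cv, log="x", type="o", lwd=2,
                 xlab="Number of predictors", ylab="CV error"))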

######################################################
###   Example of random forest using randomForest  ###
###                                                ###
###   MULTI-CLASS CLASSIFICATION EXAMPLE           ###
###   DIFFERENTIATING BREAST TISSUE TYPES          ###
######################################################

### Start by tuning model for number of predictors at each split
tuneRF(x=btis[,-1], y=btis[,1], mtryStart=2, ntreeTry=500, stepFactor=1.5,
       improve=0.0005, trace=TRUE, plot=TRUE, doBest=FALSE)

### OOB error as a function of the number of trees, from a single large forest
RFmod<-randomForest(Class~., data=btis, subset=sub, ntree=1000, mtry=3, importance=T)
plot(1:1000, RFmod$err.rate[,1], xlab="ntrees", ylab="OOB error",
     main="Estimated from Model", pch=16, cex=0.5, col=2)

### OOB error from separate forests of increasing size
sz<-c(1:200)*5
ooberbysz<-c()
for (i in 1:200) {
  RFmod<-randomForest(Class~., data=btis, subset=sub, ntree=sz[i], mtry=3,
                      importance=T, keep.inbag=T)
  ooberbysz<-append(ooberbysz, RFmod$err.rate[sz[i],1])
}
plot(sz, ooberbysz, xlab="ntrees", ylab="OOB error",
     main="Estimated from Separate Models", pch=16, cex=0.5, col=4)

### Now that we've selected ntree and mtry, let's fit a model
RFmod<-randomForest(Class ~ ., data=btis[sub,], ntree=500, mtry=3, importance=T)
names(RFmod)
round(RFmod$importance, digits=4)
varImpPlot(RFmod)
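
### Prediction error on the held-out test data, mirroring the 2-class example above
table(predict(RFmod, newdata=btis[-sub,]), btis$Class[-sub])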