Hi,
I am trying to workout Bagging and Boosting. When I predict the model, the confusion matrix is incomplete.
R Code below:
my_data = ISLR::Caravan
# New Dataset - post multi-collinearity check
my_data_new = my_data[,c(2,3,4,6,8,9,11,12,13,14,16,17,20,21,23,24,26,27,29,33,40,41,42,43,45,47,49,55,63,66,68,70,76,80,82,85,86)]
# Normal split
set.seed(44)
split = sample.split(my_data_new$Purchase, SplitRatio = 0.7)
train.data = subset(my_data_new, split == T)
test.data = subset(my_data_new, split == F)
# Balanced Split - SMOTE method
set.seed(44)
table(train.data$Purchase)
train.data_bal = SMOTE(Purchase~. ,
train.data, perc.over = 100, k = 5, perc.under = 1570)
test.data_bal = SMOTE(Purchase~. ,
test.data, perc.over = 100, k = 5, perc.under = 1579)
# XGBOOST
# Convert dependent variable to numeric
train.data$Purchase = as.numeric(train.data$Purchase)
train.data_bal$Purchase = as.numeric(train.data_bal$Purchase)
# with Base data
XGBO_base_model = xgboost(data = as.matrix(train.data[,-37]),
label = as.matrix(train.data$Purchase),
nrounds = 10)
XGBO_base_Pred = predict(XGBO_base_model, newdata = as.matrix(test.data[,-37]))
XGBO_base_Pred = XGBO_base_Pred >= 0.5
CM_XGBO_base = table(test.data[,37], XGBO_base_Pred)
CM_XGBO_base
# with Balanced (SMOTE) data
XGBO_bal_model = xgboost(data = as.matrix(train.data_bal[,-37]),
label = train.data_bal$Purchase,
nrounds = 10)
XGBO_bal_Pred = predict(XGBO_bal_model, newdata = as.matrix(test.data_bal[,-37]))
XGBO_bal_Pred = XGBO_bal_Pred >= 0.5
CM_XGBO_bal = table(test.data_bal[,37], XGBO_bal_Pred)
CM_XGBO_bal
# Bagging
# with Base data
Bag_base_model = bagging(Purchase~. ,
data = train.data,
control=rpart.control(maxdepth = 5, minsplit = 15))
Bag_base_Pred = predict(Bag_base_model, test.data)
Bag_base_Pred = Bag_base_Pred >=0.5
CM_Bag_base = table(test.data$Purchase, Bag_base_Pred)
CM_Bag_base
# with Balanced (SMOTE) data
Bag_bal_model = bagging(Purchase~. ,
data = train.data_bal,
control=rpart.control(maxdepth = 5, minsplit = 15))
Bag_bal_Pred = predict(Bag_bal_model, test.data_bal)
Bag_bal_Pred = Bag_bal_Pred >= 0.5
CM_Bag_bal = table(test.data_bal$Purchase, Bag_bal_Pred)
CM_Bag_bal
O/P for above Bagging and Boosting
> # with Base data
> XGBO_base_model = xgboost(data = as.matrix(train.data[,-37]),
+ label = as.matrix(train.data$Purchase),
+ nrounds = 10)
[1] train-rmse:0.454954
[2] train-rmse:0.354860
[3] train-rmse:0.291465
[4] train-rmse:0.252883
[5] train-rmse:0.231523
[6] train-rmse:0.219472
[7] train-rmse:0.212959
[8] train-rmse:0.208529
[9] train-rmse:0.205309
[10] train-rmse:0.203850
> XGBO_base_Pred = predict(XGBO_base_model, newdata = as.matrix(test.data[,-37]))
> XGBO_base_Pred = XGBO_base_Pred >= 0.5
> CM_XGBO_base = table(test.data[,37], XGBO_base_Pred)
> CM_XGBO_base
XGBO_base_Pred
TRUE
No 1642
Yes 104
> # with Balanced (SMOTE) data
> XGBO_bal_model = xgboost(data = as.matrix(train.data_bal[,-37]),
+ label = train.data_bal$Purchase,
+ nrounds = 10)
[1] train-rmse:0.519760
[2] train-rmse:0.406803
[3] train-rmse:0.332881
[4] train-rmse:0.288540
[5] train-rmse:0.260969
[6] train-rmse:0.242171
[7] train-rmse:0.230656
[8] train-rmse:0.223236
[9] train-rmse:0.215971
[10] train-rmse:0.212421
> XGBO_bal_Pred = predict(XGBO_bal_model, newdata = as.matrix(test.data_bal[,-37]))
> XGBO_bal_Pred = XGBO_bal_Pred >= 0.5
> CM_XGBO_bal = table(test.data_bal[,37], XGBO_bal_Pred)
> CM_XGBO_bal
XGBO_bal_Pred
TRUE
No 1642
Yes 208
> # with Base data
> Bag_base_model = bagging(Purchase~. ,
+ data = train.data,
+ control=rpart.control(maxdepth = 5, minsplit = 15))
> Bag_base_Pred = predict(Bag_base_model, test.data)
> Bag_base_Pred = Bag_base_Pred >=0.5
> CM_Bag_base = table(test.data$Purchase, Bag_base_Pred)
> CM_Bag_base
Bag_base_Pred
TRUE
No 1642
Yes 104
> # with Balanced (SMOTE) data
> Bag_bal_model = bagging(Purchase~. ,
+ data = train.data_bal,
+ control=rpart.control(maxdepth = 5, minsplit = 15))
> Bag_bal_Pred = predict(Bag_bal_model, test.data_bal)
> Bag_bal_Pred = Bag_bal_Pred >= 0.5
> CM_Bag_bal = table(test.data_bal$Purchase, Bag_bal_Pred)
> CM_Bag_bal
Bag_bal_Pred
TRUE
No 1642
Yes 208
If I use above dataset for model preparation, confusion matrix is proper. KNN output is below.
> # KNN - Base
> knn_Model_base = knn(train = train.data[,-37],
+ test = test.data[,-37],
+ cl = train.data[,37],k=1)
> CM_KNN_base = table(test.data$Purchase, knn_Model_base)
> CM_KNN_base
knn_Model_base
No Yes
No 1557 85
Yes 95 9
> # KNN - with Balanced (SMOTE) data
> knn_Model_bal = knn(train.data_bal[,-37],
+ test.data_bal[,-37],
+ cl = train.data_bal[,37],k=1)
> CM_KNN_bal = table(test.data_bal$Purchase, knn_Model_bal)
> CM_KNN_bal
knn_Model_bal
No Yes
No 1480 162
Yes 171 37
Since I am new to concept of bagging and boosting, not sure where I have made the mistake. I am only getting No/Yes for True condition where as in KNN I get a complete matrix.
Any help is appreciated. Thanks