Bagging and Boosting - Incomplete confusion matrix

Hi,

I am trying to work out bagging and boosting. When I predict with the models, the confusion matrix is incomplete.

R code below:

# Packages providing the functions used below
library(caTools)  # sample.split()
library(DMwR)     # SMOTE()
library(xgboost)  # xgboost()
library(ipred)    # bagging()
library(rpart)    # rpart.control()
library(class)    # knn()

my_data = ISLR::Caravan

# New Dataset - post multi-collinearity check
my_data_new = my_data[,c(2,3,4,6,8,9,11,12,13,14,16,17,20,21,23,24,26,27,29,33,40,41,42,43,45,47,49,55,63,66,68,70,76,80,82,85,86)]

# Normal split
set.seed(44)
split = sample.split(my_data_new$Purchase, SplitRatio = 0.7)
train.data = subset(my_data_new, split == T)
test.data = subset(my_data_new, split == F)

# Balanced Split - SMOTE method
set.seed(44)
table(train.data$Purchase)
train.data_bal = SMOTE(Purchase~. ,
                         train.data, perc.over = 100, k = 5, perc.under = 1570)

test.data_bal = SMOTE(Purchase~. ,
                       test.data, perc.over = 100, k = 5, perc.under = 1579)

# XGBOOST

# Convert dependent variable to numeric
train.data$Purchase = as.numeric(train.data$Purchase)
train.data_bal$Purchase = as.numeric(train.data_bal$Purchase)

# with Base data
XGBO_base_model = xgboost(data = as.matrix(train.data[,-37]), 
                           label = as.matrix(train.data$Purchase), 
                           nrounds = 10)

XGBO_base_Pred = predict(XGBO_base_model, newdata = as.matrix(test.data[,-37]))
XGBO_base_Pred = XGBO_base_Pred >= 0.5
CM_XGBO_base = table(test.data[,37], XGBO_base_Pred)
CM_XGBO_base

# with Balanced (SMOTE) data
XGBO_bal_model = xgboost(data = as.matrix(train.data_bal[,-37]), 
                     label = train.data_bal$Purchase, 
                     nrounds = 10)

XGBO_bal_Pred = predict(XGBO_bal_model, newdata = as.matrix(test.data_bal[,-37]))
XGBO_bal_Pred = XGBO_bal_Pred >= 0.5
CM_XGBO_bal = table(test.data_bal[,37], XGBO_bal_Pred)
CM_XGBO_bal

# Bagging

# with Base data
Bag_base_model = bagging(Purchase~. ,
                          data = train.data, 
                          control=rpart.control(maxdepth = 5, minsplit = 15))
Bag_base_Pred = predict(Bag_base_model, test.data)
Bag_base_Pred = Bag_base_Pred >=0.5
CM_Bag_base = table(test.data$Purchase, Bag_base_Pred)
CM_Bag_base

# with Balanced (SMOTE) data
Bag_bal_model = bagging(Purchase~. ,
                          data = train.data_bal, 
                          control=rpart.control(maxdepth = 5, minsplit = 15))
Bag_bal_Pred = predict(Bag_bal_model, test.data_bal)
Bag_bal_Pred = Bag_bal_Pred >= 0.5
CM_Bag_bal = table(test.data_bal$Purchase, Bag_bal_Pred)
CM_Bag_bal

Output for the bagging and boosting code above:

> # with Base data
> XGBO_base_model = xgboost(data = as.matrix(train.data[,-37]), 
+                            label = as.matrix(train.data$Purchase), 
+                            nrounds = 10)
[1]	train-rmse:0.454954 
[2]	train-rmse:0.354860 
[3]	train-rmse:0.291465 
[4]	train-rmse:0.252883 
[5]	train-rmse:0.231523 
[6]	train-rmse:0.219472 
[7]	train-rmse:0.212959 
[8]	train-rmse:0.208529 
[9]	train-rmse:0.205309 
[10]	train-rmse:0.203850 
> XGBO_base_Pred = predict(XGBO_base_model, newdata = as.matrix(test.data[,-37]))
> XGBO_base_Pred = XGBO_base_Pred >= 0.5
> CM_XGBO_base = table(test.data[,37], XGBO_base_Pred)
> CM_XGBO_base
     XGBO_base_Pred
      TRUE
  No  1642
  Yes  104
> # with Balanced (SMOTE) data
> XGBO_bal_model = xgboost(data = as.matrix(train.data_bal[,-37]), 
+                      label = train.data_bal$Purchase, 
+                      nrounds = 10)
[1]	train-rmse:0.519760 
[2]	train-rmse:0.406803 
[3]	train-rmse:0.332881 
[4]	train-rmse:0.288540 
[5]	train-rmse:0.260969 
[6]	train-rmse:0.242171 
[7]	train-rmse:0.230656 
[8]	train-rmse:0.223236 
[9]	train-rmse:0.215971 
[10]	train-rmse:0.212421 
> XGBO_bal_Pred = predict(XGBO_bal_model, newdata = as.matrix(test.data_bal[,-37]))
> XGBO_bal_Pred = XGBO_bal_Pred >= 0.5
> CM_XGBO_bal = table(test.data_bal[,37], XGBO_bal_Pred)
> CM_XGBO_bal
     XGBO_bal_Pred
      TRUE
  No  1642
  Yes  208
> # with Base data
> Bag_base_model = bagging(Purchase~. ,
+                           data = train.data, 
+                           control=rpart.control(maxdepth = 5, minsplit = 15))
> Bag_base_Pred = predict(Bag_base_model, test.data)
> Bag_base_Pred = Bag_base_Pred >=0.5
> CM_Bag_base = table(test.data$Purchase, Bag_base_Pred)
> CM_Bag_base
     Bag_base_Pred
      TRUE
  No  1642
  Yes  104
> # with Balanced (SMOTE) data
> Bag_bal_model = bagging(Purchase~. ,
+                           data = train.data_bal, 
+                           control=rpart.control(maxdepth = 5, minsplit = 15))
> Bag_bal_Pred = predict(Bag_bal_model, test.data_bal)
> Bag_bal_Pred = Bag_bal_Pred >= 0.5
> CM_Bag_bal = table(test.data_bal$Purchase, Bag_bal_Pred)
> CM_Bag_bal
     Bag_bal_Pred
      TRUE
  No  1642
  Yes  208

If I use the same dataset to build a KNN model, the confusion matrix is complete. The KNN output is below:

> # KNN - Base
> knn_Model_base = knn(train = train.data[,-37],
+                      test = test.data[,-37],
+                      cl = train.data[,37],k=1)
> CM_KNN_base = table(test.data$Purchase, knn_Model_base)
> CM_KNN_base
     knn_Model_base
        No  Yes
  No  1557   85
  Yes   95    9
> # KNN - with Balanced (SMOTE) data
> knn_Model_bal = knn(train.data_bal[,-37],
+                       test.data_bal[,-37],
+                       cl = train.data_bal[,37],k=1)
> CM_KNN_bal = table(test.data_bal$Purchase, knn_Model_bal)
> CM_KNN_bal
     knn_Model_bal
        No  Yes
  No  1480  162
  Yes  171   37

Since I am new to the concept of bagging and boosting, I am not sure where I have made a mistake. I only get the No/Yes counts under a single TRUE column, whereas with KNN I get a complete matrix.

Any help is appreciated. Thanks

Generally you would want your model to output predictions in the same format as the original data, which in R is typically a factor. xgboost doesn't use factors, so you have to convert the labels to numeric and then convert the predictions back. The catch is that as.numeric() on a factor returns the level codes 1 and 2, not 0 and 1, so every prediction from your regression-style fit is at or above 0.5 and the table collapses to a single TRUE column. You can use ifelse() to do the conversion in both directions.
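
A minimal sketch of that fix (assuming Purchase is still the original No/Yes factor, and reusing the column indexing from your script):

# Encode the factor as 0/1 for xgboost; as.numeric() would give 1/2
label_num = ifelse(train.data$Purchase == "Yes", 1, 0)

fit = xgboost(data = as.matrix(train.data[,-37]),
              label = label_num,
              nrounds = 10,
              objective = "binary:logistic")  # returns probabilities, not raw scores

prob = predict(fit, newdata = as.matrix(test.data[,-37]))

# Convert the thresholded probabilities back to the original factor levels
pred = factor(ifelse(prob >= 0.5, "Yes", "No"), levels = c("No", "Yes"))
table(test.data$Purchase, pred)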

TBH, this is the reason that caret and tidymodels exist; some packages don't obey R conventions. These systems give you the same user interface and outputs across R models. Additionally, the recipes and themis packages can help with preprocessing (which includes SMOTE and other subsampling methods).

Hi, thanks for the response. Since I am new to this topic, I am unable to follow.

Can you give me some links to go through, or a code snippet, so I can understand this better and see how to use the other packages?

Thanks

Sure. There are examples and lots of tutorials at tidymodels.org. I recommend looking at the tab labeled Get Started. We have a book that will (hopefully) be released next week; I'll update this thread when that happens.

Below is an example script. One change I made was to not use SMOTE on the test set; you should only apply those types of operations to the data used for modeling.

library(tidymodels)
#> ── Attaching packages ──────────────────────────────────────────────────────────────── tidymodels 0.1.1 ──
#> ✓ broom     0.7.0      ✓ recipes   0.1.13
#> ✓ dials     0.0.8      ✓ rsample   0.0.7 
#> ✓ dplyr     1.0.2      ✓ tibble    3.0.3 
#> ✓ ggplot2   3.3.2      ✓ tidyr     1.1.2 
#> ✓ infer     0.5.2      ✓ tune      0.1.1 
#> ✓ modeldata 0.0.2      ✓ workflows 0.1.3 
#> ✓ parsnip   0.1.3      ✓ yardstick 0.0.7 
#> ✓ purrr     0.3.4
#> ── Conflicts ─────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter()  masks stats::filter()
#> x dplyr::lag()     masks stats::lag()
#> x recipes::step()  masks stats::step()
library(themis)
#> Attaching package: 'themis'
#> The following objects are masked from 'package:recipes':
#> 
#>     step_downsample, step_upsample, tunable.step_downsample,
#>     tunable.step_upsample
library(ISLR)
# split data

set.seed(112)
split <- initial_split(Caravan, strata = "Purchase")
caravan_train <- training(split)
caravan_test  <- testing(split)
# Without model tuning

# create a recipe to downsample the data
caravan_rec <- 
  recipe(Purchase ~ ., data = caravan_train) %>% 
  step_smote(Purchase) # <- Only on the data used for modeling **not the test data**

boost_spec <- 
  boost_tree(trees = 10) %>% 
  set_engine("xgboost") %>% 
  set_mode("classification")

# Create a workflow to contain preprocessing and model

boost_wflow <- 
  workflow() %>% 
  add_model(boost_spec) %>% 
  add_recipe(caravan_rec)

boost_fit <- fit(boost_wflow, caravan_train)
boost_fit
#> ══ Workflow [trained] ════════════════════════════════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: boost_tree()
#> 
#> ── Preprocessor ──────────────────────────────────────────────────────────────────────────────────────────
#> 1 Recipe Step
#> 
#> ● step_smote()
#> 
#> ── Model ─────────────────────────────────────────────────────────────────────────────────────────────────
#> ##### xgb.Booster
#> raw: 28.3 Kb 
#> call:
#>   xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, 
#>     colsample_bytree = 1, min_child_weight = 1, subsample = 1), 
#>     data = x, nrounds = 10, watchlist = wlist, verbose = 0, objective = "binary:logistic", 
#>     nthread = 1)
#> params (as set within xgb.train):
#>   eta = "0.3", max_depth = "6", gamma = "0", colsample_bytree = "1", min_child_weight = "1", subsample = "1", objective = "binary:logistic", nthread = "1", validate_parameters = "TRUE"
#> xgb.attributes:
#>   niter
#> callbacks:
#>   cb.evaluation.log()
#> # of features: 85 
#> niter: 10
#> nfeatures : 85 
#> evaluation_log:
#>     iter training_error
#>        1       0.139253
#>        2       0.087458
#> ---                    
#>        9       0.035541
#>       10       0.033600
# Before using the test set, maybe resample the model instead. 

# Setup 10-fold cross-validation:
set.seed(99)
folds <- vfold_cv(caravan_train, v = 10, strata = "Purchase")

# some metrics to evaluate the model: 
my_metrics <- metric_set(roc_auc, pr_auc, accuracy, kap)

set.seed(12)
resampled <- 
  boost_wflow %>% 
  fit_resamples(folds, metrics = my_metrics)

# results: 
collect_metrics(resampled)
#> # A tibble: 4 x 5
#>   .metric  .estimator  mean     n std_err
#>   <chr>    <chr>      <dbl> <int>   <dbl>
#> 1 accuracy binary     0.914    10 0.00273
#> 2 kap      binary     0.124    10 0.0161 
#> 3 pr_auc   binary     0.976    10 0.00161
#> 4 roc_auc  binary     0.736    10 0.0104

# You can also tune the model using `tune_grid()` or `tune_bayes()`;
# see https://www.tidymodels.org/start/tuning/ (a short sketch follows after this script).
# When you move on to the test set
test_pred <- 
  caravan_test %>% 
  bind_cols(
    predict(boost_fit, new_data = caravan_test),
    predict(boost_fit, new_data = caravan_test, type = "prob")
  )

test_pred %>% conf_mat(truth = Purchase, estimate = .pred_class)
#>           Truth
#> Prediction   No  Yes
#>        No  1314   94
#>        Yes   38    9

test_pred %>% my_metrics(truth = Purchase, estimate = .pred_class, .pred_No)
#> # A tibble: 4 x 3
#>   .metric  .estimator .estimate
#>   <chr>    <chr>          <dbl>
#> 1 accuracy binary        0.909 
#> 2 kap      binary        0.0791
#> 3 roc_auc  binary        0.727 
#> 4 pr_auc   binary        0.971

Created on 2020-09-02 by the reprex package (v0.3.0)
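
Regarding the tuning comment in the script: here is a hypothetical sketch of tuning the number of trees with tune_grid(), reusing the caravan_rec, folds, and my_metrics objects defined above (the grid values are just illustrative):

boost_spec_tune <- 
  boost_tree(trees = tune()) %>% 
  set_engine("xgboost") %>% 
  set_mode("classification")

boost_wflow_tune <- 
  workflow() %>% 
  add_model(boost_spec_tune) %>% 
  add_recipe(caravan_rec)

set.seed(12)
tuned <- tune_grid(
  boost_wflow_tune,
  resamples = folds,
  grid = tibble(trees = c(10, 50, 100, 250)),
  metrics = my_metrics
)

# Rank the candidate values by area under the ROC curve
show_best(tuned, metric = "roc_auc")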


Thank you @Max, appreciate it. This helps.
