I added a recipe with a step_dummy(all_nominal_predictors()) step, then retuned and refitted the model. The model predicts fine, but the error still occurs when calling augment() on the trained workflow. The only factor left after the dummy step is "class", which is the outcome being predicted.
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#>   method                   from
#>   required_pkgs.model_spec parsnip
library(tidyverse)
library(reprex)
data("hpc_data")
xgb_spec <- boost_tree(
  trees = 1000,
  tree_depth = tune(),
  min_n = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("classification")
spl <- initial_split(hpc_data)
training <- training(spl)
testing <- testing(spl)
hpc_folds <- vfold_cv(training, v = 5, strata = class)
xgb_grid <- grid_latin_hypercube(
  tree_depth(),
  min_n(),
  size = 5
)
xgb_recipe <- recipe(class ~ ., data = training) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE)
training %>% glimpse()
#> Rows: 3,248
#> Columns: 8
#> $ protocol <fct> M, O, J, O, H, M, O, I, M, D, C, H, G, N, I, J, C, O, J, …
#> $ compounds <dbl> 96, 223, 535, 488, 223, 452, 363, 508, 211, 79, 3694, 70,…
#> $ input_fields <dbl> 334, 185, 69, 1417, 658, 704, 467, 3651, 996, 81, 13957, …
#> $ iterations <dbl> 20, 20, 20, 20, 20, 20, 50, 20, 20, 20, 20, 20, 20, 20, 2…
#> $ num_pending <dbl> 0, 0, 0, 5, 0, 0, 41, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,…
#> $ hour <dbl> 9.983333, 11.500000, 13.083333, 14.516667, 9.783333, 16.8…
#> $ day <fct> Fri, Tue, Sun, Mon, Mon, Tue, Mon, Tue, Mon, Tue, Mon, We…
#> $ class <fct> F, F, F, F, F, L, F, VF, VF, VF, L, M, M, VF, VF, M, M, F…
xgb_recipe %>% prep() %>% juice() %>% glimpse()
#> Rows: 3,248
#> Columns: 27
#> $ compounds <dbl> 96, 223, 535, 488, 223, 452, 363, 508, 211, 79, 3694, 70,…
#> $ input_fields <dbl> 334, 185, 69, 1417, 658, 704, 467, 3651, 996, 81, 13957, …
#> $ iterations <dbl> 20, 20, 20, 20, 20, 20, 50, 20, 20, 20, 20, 20, 20, 20, 2…
#> $ num_pending <dbl> 0, 0, 0, 5, 0, 0, 41, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,…
#> $ hour <dbl> 9.983333, 11.500000, 13.083333, 14.516667, 9.783333, 16.8…
#> $ class <fct> F, F, F, F, F, L, F, VF, VF, VF, L, M, M, VF, VF, M, M, F…
#> $ protocol_A <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ protocol_C <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
#> $ protocol_D <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ protocol_E <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ protocol_F <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ protocol_G <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
#> $ protocol_H <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
#> $ protocol_I <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
#> $ protocol_J <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
#> $ protocol_K <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ protocol_L <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ protocol_M <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ protocol_N <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
#> $ protocol_O <dbl> 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
#> $ day_Mon <dbl> 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, …
#> $ day_Tue <dbl> 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, …
#> $ day_Wed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
#> $ day_Thu <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
#> $ day_Fri <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
#> $ day_Sat <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ day_Sun <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
xgb_wf <- workflow() %>%
  # add_formula(class ~ .) %>%  # earlier approach, replaced by the recipe
  add_recipe(xgb_recipe) %>%
  add_model(xgb_spec)
xgb_wf
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: boost_tree()
#>
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 1 Recipe Step
#>
#> • step_dummy()
#>
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Boosted Tree Model Specification (classification)
#>
#> Main Arguments:
#> trees = 1000
#> min_n = tune()
#> tree_depth = tune()
#>
#> Computational engine: xgboost
# Tune workflow ----
doParallel::registerDoParallel()
set.seed(123)
xgb_res <- tune_grid(
  xgb_wf,
  resamples = hpc_folds,
  grid = xgb_grid
)
xgb_res
#> # Tuning results
#> # 5-fold cross-validation using stratification
#> # A tibble: 5 x 4
#> splits id .metrics .notes
#> <list> <chr> <list> <list>
#> 1 <split [2597/651]> Fold1 <tibble [10 × 6]> <tibble [0 × 1]>
#> 2 <split [2598/650]> Fold2 <tibble [10 × 6]> <tibble [0 × 1]>
#> 3 <split [2598/650]> Fold3 <tibble [10 × 6]> <tibble [0 × 1]>
#> 4 <split [2599/649]> Fold4 <tibble [10 × 6]> <tibble [0 × 1]>
#> 5 <split [2600/648]> Fold5 <tibble [10 × 6]> <tibble [0 × 1]>
xgb_res %>% autoplot()
#> (plot: resampled performance metrics across the tuned tree_depth and min_n values)
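For reference, the numbers behind that plot can also be inspected directly; a minimal sketch using tune's collect_metrics() and show_best() (output omitted, as it wasn't captured in the reprex):
# Summarize resampled performance for every candidate in xgb_grid.
xgb_res %>% collect_metrics()
# List the top candidates by the metric used in select_best() below.
xgb_res %>% show_best("roc_auc")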
# Fit the best workflow to the training data ----
trained_wf <- xgb_wf %>%
  finalize_workflow(
    select_best(xgb_res, "roc_auc")
  ) %>%
  fit(training)
#> [05:34:10] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
trained_wf
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: boost_tree()
#>
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 1 Recipe Step
#>
#> • step_dummy()
#>
#> ── Model ───────────────────────────────────────────────────────────────────────
#> ##### xgb.Booster
#> raw: 5.9 Mb
#> call:
#> xgboost::xgb.train(params = list(eta = 0.3, max_depth = 13L,
#> gamma = 0, colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 5L,
#> subsample = 1, objective = "multi:softprob"), data = x$data,
#> nrounds = 1000, watchlist = x$watchlist, verbose = 0, num_class = 4L,
#> nthread = 1)
#> params (as set within xgb.train):
#> eta = "0.3", max_depth = "13", gamma = "0", colsample_bytree = "1", colsample_bynode = "1", min_child_weight = "5", subsample = "1", objective = "multi:softprob", num_class = "4", nthread = "1", validate_parameters = "TRUE"
#> xgb.attributes:
#> niter
#> callbacks:
#> cb.evaluation.log()
#> # of features: 26
#> niter: 1000
#> nfeatures : 26
#> evaluation_log:
#> iter training_mlogloss
#> 1 1.043522
#> 2 0.836352
#> ---
#> 999 0.010883
#> 1000 0.010877
# Predict on brand new data ----
brand_new_data <- hpc_data[5, -8]  # one row, with the class outcome (column 8) dropped
brand_new_data
#> # A tibble: 1 x 7
#> protocol compounds input_fields iterations num_pending hour day
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 E 100 82 20 0 10.4 Fri
predict(trained_wf, new_data = brand_new_data)
#> # A tibble: 1 x 1
#> .pred_class
#> <fct>
#> 1 VF
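Class probabilities appear to go through the same predict() path; a quick check, assumed to work since the class prediction above does (output omitted):
# Probability predictions from the same trained workflow (not captured in the reprex).
predict(trained_wf, new_data = brand_new_data, type = "prob")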
# Augmenting the testing data causes an error ----
trained_wf %>%
  augment(testing)
#> Error in xgboost::xgb.DMatrix(data = newdata, missing = NA): 'data' has class 'character' and length 29241.
#> 'data' accepts either a numeric matrix or a single filename.
Created on 2021-07-27 by the reprex package (v2.0.0)
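In the meantime, building the augmented tibble by hand avoids the failing augment() path, since predict() on the workflow works; a sketch of the workaround (assuming type = "prob" behaves like the class prediction above):
# Workaround sketch: reproduce augment()'s output with predict() + bind_cols().
testing %>%
  bind_cols(
    predict(trained_wf, new_data = testing),
    predict(trained_wf, new_data = testing, type = "prob")
  )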