I created the following recipe to predict my random forest:
set.seed(123456)
cv_folds <- Data_train %>% vfold_cv(v = 4, strata = Lead_week)
# Create a recipe
rf_mod_recipe <- recipe(Lead_week ~ Jaar + Aantal + Verzekering + Leeftijd + Retentie +
Aantal_proeven + Geslacht + FLG_ADVERTISING + FLG_MAIL +
FLG_PHONE + FLG_EMAIL + Proef1 + Proef2 + Regio +
Month + AC,
data = Data_train) %>%
step_normalize(Leeftijd)
# Specify the recipe
rf_mod <- rand_forest(mtry = tune(), min_n = tune(), trees = 200) %>%
set_mode("regression") %>%
set_engine("ranger", importance = "permutation")
# Create a workflow
rf_mod_workflow <- workflow() %>%
add_model(rf_mod) %>%
add_recipe(rf_mod_recipe)
rf_mod_workflow
# State our error metrics
class_metrics <- metric_set(rmse, mae)
registerDoParallel()
rf_grid <- grid_regular(
mtry(range = c(5, 15)),
min_n(range = c(10, 200)),
levels = 5
)
rf_grid
# Train the model
set.seed(654321)
rf_tune_res <- tune_grid(
rf_mod_workflow,
resamples = cv_folds,
grid = rf_grid,
metrics = class_metrics
)
# Collect the optimal hyperparameters
rf_tune_res %>%
collect_metrics()
rf_tune_res %>%
collect_metrics() %>%
filter(.metric %in% c("rmse", "mae")) %>%
ggplot(aes(x = mtry, y = mean, ymin = mean - std_err, ymax = mean + std_err,
colour = .metric)) +
geom_errorbar() +
geom_line() +
geom_point() +
facet_grid(.metric ~., scales = "free_y")
# Plot of the train results
rf_tune_res %>%
collect_metrics() %>%
filter(.metric %in% c("rmse")) %>%
mutate(min_n = factor(min_n)) %>%
ggplot(aes(x = mtry, y = mean, colour = min_n)) +
geom_line() +
geom_point() +
labs(y = "error")
# Select the best number of mtry
best_rmse <- select_best(rf_tune_res, "rmse")
rf_final_wf <- finalize_workflow(rf_mod_workflow, best_rmse)
rf_final_wf
# Create a workflow
rf_mod_workflow <- workflow() %>%
add_model(rf_mod) %>%
add_recipe(rf_mod_recipe)
rf_mod_workflow
predict(rf_final_wf, grid) %>%
bind_cols(rf_mod_recipe %>% select(AC)) %>%
ggplot(aes(y = .pred, x = AC)) +
geom_path()
Test set performance
# Finalise the workflow
set.seed(56789)
rf_final_fit <- rf_final_wf %>%
last_fit(splits, metrics = class_metrics)
# Collect predictions
summary_rf <- rf_final_fit %>%
collect_predictions()
summary(summary_rf$.pred)
# Collect metrics
rf_final_fit %>%
collect_metrics()
So I used cross-validation to finetune and eventually test on holdout data. However, how do I get partial dependence plots to 'open the black box'?