Hello,
I'm presently trying to fit a random forest model with hyperparameter tuning using the tidymodels framework on a dataframe with 101,064 rows and 64 columns. I have a mix of categorical and continuous predictors and my outcome variable is a categorical variable with 3 categories so I have a multiclass classification problem.
The problem I'm having is that this process, even with parallel processing, is taking roughly 6 to 8 hours to complete on my machine. Since 101,064 isn't a huge amount of data, I suspect I'm not doing something correctly or efficiently (or both!). Unfornately I can't share the exact dataset due to confidentiality but the code I've shared below offers a very close replica of the original dataset from the number of levels in each categorical variable to the number of NA's present in each column.
I have some remarks about the code below that may give an insight into why I did what I did. Firstly, I split training and test sets based on a group id and not on rows. The dataset is nested where there are multiple rows that correspond to the same group id. Ideally, I would like a model that can learn patterns across group ids. Hence there ought to be no common group ids between the training and testing folds and no common group ids between the analysis and assessment folds in the cross validation folds.
Secondly, I've included step_unknown
because Random Forest does not like NA values. I've included step_novel
as a safeguard in case future data has categorical levels the current data has not seen. I'm not sure of when to use step_unknown
vs step_novel
and I'm not sure if it is wise to use them together so any clarification would be much appreciated. I've used step_other
and step_dummy
to One Hot Encode the categorical predictors. step_impute_median
has been included to not have NAs in the data to prevent Random Forest from complaining. step_downsample
has been used to deal with class imbalance in the outcome variable, I've used downsampling in an effort to have fewer observations in the model building step but it doesn't seem to have reduced training time.
My questions are:
-
Is there a reason the model tuning takes approximately 6 hours and is this something I can optimise further? I'm open to using dimensionality reduction and would appreciate some tutorials for doing this as part of a supervised ML pipeline using the tidymodels framework.
-
Have I specified and used the recipes correctly? It's something I'm not too sure about. I've mentioned above what I think I'm doing but is this actually what I'm doing and is it the best way to go about it? I'm open to reformulating the recipes step.
Any help on this would be much appreciated. I'm new to tidymodels so I apologise for any silly errors detected. If I can get this working then it may help me switch our modelling from sklearn to tidymodels. So if the training times are faster then this may be a winner :).
I'm running this code on my local machine which is a MacBook Pro with a 2.4 GHz, 8-Core processor and with 32GB memory.
library(tidyverse)
library(tidymodels)
library(themis)
library(finetune)
library(doParallel)
library(parallel)
library(ranger)
library(future)
library(doFuture)
# Create Synthetic data that closely mimics actual dataset ----
## Categorical predictors
categorical_predictor1 <- rep(c("cat1", "cat2", "cat3", "cat4", "cat5"), times = c(43281, 29088, 9881, 8874, 9940))
categorical_predictor2 <- rep(c("cat1", "cat2", "cat3", "cat4", "cat5"), times = c(2522, 21302, 20955, 36859, 19426))
categorical_predictor3 <- rep(c("cat1", "cat2"), times = c(15950, 85114))
categorical_predictor4 <- rep(c("cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7"), times = c(52023, 16666, 13662, 7045, 2644, 1798, 7226))
categorical_predictor5 <- rep(c("cat1", "cat2", "cat3"), times = c(52613, 14903, 33548))
categorical_predictor6 <- rep(c("cat1", "cat2", "cat3", "cat4"), times = c(13662, 16666, 18713, 52023))
categorical_predictor7 <- rep(c("cat1", "cat2", "cat3", "cat4", "cat5", "cat6", NA), times = c(44210, 11062, 8846, 4638, 1778, 4595, 25935))
categorical_predictor8 <- rep(c("cat1", "cat2", "cat3", "cat4", NA), times = c(11062, 8846, 11011, 44210, 25935))
categorical_predictor9 <- rep(c("cat1", "cat2", "cat3", "cat4", "cat5", "cat6", NA), times = c(11649, 10215, 9783, 7580, 5649, 30253, 25935))
categorical_predictor10 <- rep(c("cat1", "cat2", "cat3", "cat4", "cat5", "cat6", NA), times = c(12563, 11649, 10215, 9783, 7580, 23339, 25935))
categorical_predictor11 <- rep(c("cat1", "cat2", NA), times = c(14037, 61092, 25935))
categorical_predictor12 <- rep(c("cat1", "cat2", "cat3", NA), times = c(15042, 35676, 23861, 26485))
# Outcome variable
outcome_variable <- rep(c("cat1", "cat2", "cat3"), times = c(21375, 49824, 29865))
## Continuous Predictors: Values are not normalized
continuous_predictor1 <- runif(n = 101064, min = 0, max = 90)
continuous_predictor2 <- runif(n = 101064, min = 0, max = 95.4)
continuous_predictor3 <- runif(n = 101064, min = 0, max = 14.1515)
continuous_predictor4 <- runif(n = 101064, min = 0, max = 85)
continuous_predictor5 <- runif(n = 101064, min = 0, max = 71)
continuous_predictor6 <- runif(n = 101064, min = -236, max = 97)
continuous_predictor7 <- runif(n = 101064, min = -40, max = 84)
continuous_predictor8 <- runif(n = 101064, min = 2015, max = 2019)
continuous_predictor9 <- runif(n = 101064, min = 0, max = 6)
continuous_predictor10 <- runif(n = 101064, min = 2, max = 26)
continuous_predictor11 <- runif(n = 101064, min = 0, max = 26)
continuous_predictor12 <- runif(n = 101064, min = 0.1365, max = 0.4352)
continuous_predictor13 <- runif(n = 101064, min = 0.1282, max = 0.4860)
continuous_predictor14 <- runif(n = 101064, min = 0.1232, max = 0.4643)
continuous_predictor15 <- runif(n = 101064, min = 0.1365, max = 0.4885)
continuous_predictor16 <- runif(n = 101064, min = 107, max = 218.6)
continuous_predictor17 <- runif(n = 101064, min = 0.6667, max = 16.333)
continuous_predictor18 <- runif(n = 101064, min = 3.479, max = 7.177)
continuous_predictor19 <- runif(n = 101064, min = 0.8292, max = 3.3100)
continuous_predictor20 <- runif(n = 101064, min = 49.33, max = 101.70)
continuous_predictor21 <- runif(n = 101064, min = 0.07333, max = 0.42534)
continuous_predictor22 <- runif(n = 101064, min = 0.08727, max = 0.41762)
continuous_predictor23 <- runif(n = 101064, min = 0.1241, max = 0.4673)
continuous_predictor24 <- runif(n = 101064, min = 0.07483, max = 0.41192)
continuous_predictor25 <- runif(n = 101064, min = 446.1, max = 561.0)
continuous_predictor26 <- runif(n = 101064, min = 2.333, max = 24)
continuous_predictor27 <- runif(n = 101064, min = 14.52, max = 18.23)
continuous_predictor28 <- runif(n = 101064, min = 0.5463, max = 3.488)
continuous_predictor29 <- runif(n = 101064, min = 150.7, max = 251.9)
continuous_predictor30 <- runif(n = 101064, min = 0.1120, max = 0.4603)
continuous_predictor31 <- runif(n = 101064, min = 0.1231, max = 0.4766)
continuous_predictor32 <- runif(n = 101064, min = 0.1271, max = 0.4857)
continuous_predictor33 <- runif(n = 101064, min = 0.1152, max = 0.4613)
continuous_predictor34 <- runif(n = 101064, min = 238.6, max = 329.4)
continuous_predictor35 <- runif(n = 101064, min = 5.333, max = 19.667)
continuous_predictor36 <- runif(n = 101064, min = 7.815, max = 10.929)
continuous_predictor37 <- runif(n = 101064, min = 0.8323, max = 2.8035)
continuous_predictor38 <- runif(n = 101064, min = 140.9, max = 195.5)
continuous_predictor39 <- runif(n = 101064, min = 0.1098, max = 0.4581)
continuous_predictor40 <- runif(n = 101064, min = 0.08825, max = 0.41360)
continuous_predictor41 <- runif(n = 101064, min = 0.1209, max = 0.4510)
continuous_predictor42 <- runif(n = 101064, min = 0.1048, max = 0.4498)
continuous_predictor43 <- runif(n = 101064, min = 312.2, max = 382.2)
continuous_predictor44 <- runif(n = 101064, min = 2.667, max = 18)
continuous_predictor45 <- runif(n = 101064, min = 10.22, max = 12.49)
continuous_predictor46 <- runif(n = 101064, min = 1.077, max = 2.968)
continuous_predictor47 <- runif(n = 101064, min = 72.18, max = 155.71)
## Continuous Predictors: Values have NAs
continuous_predictor_withNA1 <- c(runif(n = 101064 - 26485, min = 1, max = 3), rep(NA, times = 26485))
continuous_predictor_withNA2 <- c(runif(n = 101064 - 26485, min = 1, max = 3), rep(NA, times = 26485))
## Group ID
set.seed(123)
group_id <- sample(c(1,2,3,4,5,6,7,9,10,11,13,14,16,17,18,19,20,21,22,24,25,26,27,28,29,30,31,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,107,109,110,111,112,125,126,161,162,163,164,165,178,179,180,184,185,186,187,188,189,197,198,199,209,210,211,212,213,214,231,232,233,234,239,240,250,251,252,255,256,257,258,259,260,261,508,509,510,602,721,730),
size = 101064,
replace = TRUE,
prob = c(0.010300404,0.003661047,0.005758727,0.002849679,0.005976411,0.006738304,0.004957255,0.008727143,0.007757461,0.00530357,0.00867767,0.003839151,0.007836618,0.004531782,0.007678303,0.013150083,0.003364205,0.005194728,0.002750732,0.005778517,0.009825457,0.010488403,0.009399984,0.006105042,0.011101876,0.006490936,0.008459986,0.003918309,0.009083353,0.001583155,0.005382728,0.013832819,0.004828623,0.004670308,0.007213251,0.006570094,0.006035779,0.007322093,0.006570094,0.002077891,0.000979577,0.006926304,0.007124199,0.005521254,0.007618935,0.00335431,0.002968416,0.005442096,0.016069026,0.005174939,0.001820629,0.008578722,0.00213726,0.00142484,0.014644186,0.006688831,0.003799573,0.008430302,0.004581255,0.002552838,0.012833452,0.00620399,0.003799573,0.004729676,0.005639991,0.010824824,0.010735771,0.004343782,0.008934932,0.005679569,0.004096414,0.011141455,0.011853875,0.00354231,0.006312832,0.001553471,0.009162511,0.006550305,0.007688198,0.002354943,0.002730943,0.005085886,0.004808834,0.013634924,0.006233674,0.007124199,0.007915776,0.006431568,0.003957888,0.005422307,0.002394522,0.00865788,0.008093881,0.002592417,0.001157682,0.005758727,0.004897887,0.002364838,0.004749466,0.005194728,0.009795773,0.007054936,0.003601678,0.006362305,0.00848967,0.011448191,0.003364205,0.006431568,0.005224412,0.007282514,0.007242935,0.008074092,0.009686931,0.00670862,0.003571994,0.008717249,0.007806934,0.004135993,0.006253463,0.006302937,0.007846513,0.003680836,0.006095148,0.00264189,0.004581255,0.004838518,0.001454524,0.004571361,0.005926937,0.002236207,0.007361672,0.006332621,0.011952822,0.013852608,0.009775984,0.007124199,0.013733872,0.007143988,0.006827357,0.00425473,0.007094514,0.005085886,0.013308399,0.007480409,0.007737671,0.004551571,0.00744083,0.012576189,0.008796406,0.010884192,0.0063722,0.01006293))
## Join to make a dataframe
df <- tibble(group_id,
categorical_predictor1,
categorical_predictor2,
categorical_predictor3,
categorical_predictor4,
categorical_predictor5,
categorical_predictor6,
categorical_predictor7,
categorical_predictor8,
categorical_predictor9,
categorical_predictor10,
categorical_predictor11,
categorical_predictor12,
continuous_predictor1,
continuous_predictor2,
continuous_predictor3,
continuous_predictor4,
continuous_predictor5,
continuous_predictor6,
continuous_predictor7,
continuous_predictor8,
continuous_predictor9,
continuous_predictor10,
continuous_predictor11,
continuous_predictor12,
continuous_predictor13,
continuous_predictor14,
continuous_predictor15,
continuous_predictor16,
continuous_predictor17,
continuous_predictor18,
continuous_predictor19,
continuous_predictor20,
continuous_predictor21,
continuous_predictor22,
continuous_predictor23,
continuous_predictor24,
continuous_predictor25,
continuous_predictor26,
continuous_predictor27,
continuous_predictor28,
continuous_predictor29,
continuous_predictor30,
continuous_predictor31,
continuous_predictor32,
continuous_predictor33,
continuous_predictor34,
continuous_predictor35,
continuous_predictor36,
continuous_predictor37,
continuous_predictor38,
continuous_predictor39,
continuous_predictor40,
continuous_predictor41,
continuous_predictor42,
continuous_predictor43,
continuous_predictor44,
continuous_predictor45,
continuous_predictor46,
continuous_predictor47,
continuous_predictor_withNA1,
continuous_predictor_withNA2,
outcome_variable)
df <- df %>%
mutate_if(is.character, as.factor) %>%
mutate(.row = row_number())
# Split Data ----
## Split the data while keeping group ids separate, groups will not be split up across training and testing sets
set.seed(123)
holdout_group_id <- sample(unique(df$group_id), size = 5)
indices <- list(
analysis = df %>% filter(!(group_id %in% holdout_group_id)) %>% pull(.row),
assessment = df %>% filter(group_id %in% holdout_group_id) %>% pull(.row)
)
## Remove row column - no longer required
df <- df %>%
select(-.row)
split <- make_splits(indices, df)
df_train <- training(split)
df_test <- testing(split)
## Create Cross Validation Folds
set.seed(123)
folds <- group_vfold_cv(df_train, group = "group_id", v = 5)
# Create Recipe ----
## Define a recipe to be applied to the data
df_recipe <- recipe(outcome_variable ~ ., data = df_train) %>%
update_role(group_id, new_role = "ID") %>%
step_unknown(all_nominal_predictors()) %>%
step_novel(all_nominal_predictors()) %>%
step_other(all_nominal_predictors(), threshold = 0.1, other = "other_category") %>%
step_dummy(all_nominal_predictors()) %>%
step_impute_median(continuous_predictor_withNA1, continuous_predictor_withNA2) %>%
themis::step_downsample(all_outcomes(), skip = TRUE)
# Define Model ----
## Initialise model with tuneable hyperparameters
rf_spec <- rand_forest(trees = tune(), mtry = tune() ) %>%
set_engine("ranger", importance = "permutation") %>%
set_mode("classification")
# Define Workflow to connect Recipe and Model ----
rf_workflow <- workflow() %>%
add_recipe(df_recipe) %>%
add_model(rf_spec)
# Train and Tune Model ----
## Define a random grid for hyperparameters to vary over
set.seed(123)
rf_grid <- grid_latin_hypercube(
trees(),
mtry() %>% finalize(df_train %>% dplyr::select(-group_id, -outcome_variable)),
size = 20)
## Tune Model using Parallel Processing
all_cores <- parallel::detectCores(logical=FALSE) - 1
registerDoFuture() # Register backend
cl <- makeCluster(all_cores, setup_strategy = "sequential")
set.seed(123)
rf_tuned <-rf_workflow %>%
tune_race_win_loss(resamples = folds,
grid = rf_grid,
control = control_race(save_pred = TRUE),
metrics = metric_set(roc_auc, accuracy))