Tidymodels using tune_race_anova with a workflow

Hi,

I have been looking at the finetune package to try out the tune_race_anova based on the presentation given at rstudio::global 2021.

I am trying to get it to work with a workflow, but I am confusing myself.
Below is some pseudocode.
I'm getting stuck on the portion around tune_res_rf and on where to actually integrate the grid.
Does anyone have any ideas?

# Fix the RNG seed so the split and the resamples below are reproducible.
set.seed(4595)
# Split explore_data with prop = 0.75 (75% training), stratified on the outcome tgt.
data_split <- initial_split(explore_data, strata = "tgt", prop = 0.75)

# Extract the two partitions from the split object.
train_explore <- training(data_split)
test_explore  <- testing(data_split)
  
# Generate resamples: 10-fold cross-validation on the training set
# (a single repeat), stratified on tgt.
report_resamples <- vfold_cv(train_explore, v = 10, repeats = 1, strata = tgt)

# Set up the preprocessing recipe: tgt is the outcome and every other column
# is a predictor; the only step is a down-sampling step on tgt.
preprocess <- train_explore %>%
  recipe(tgt ~ .) %>%
  themis::step_downsample(tgt) 

# BUILD A RANDOM FOREST MODEL ---------------------------------------------
# Classification random forest on the ranger engine; mtry, trees and min_n
# are all marked for tuning with tune().
rf_mod <- rand_forest(
  mtry = tune(),
  trees = tune(),
  min_n = tune()) %>%
  set_mode("classification") %>%
  set_engine("ranger")

# Parameter objects for the three tuned arguments. mtry() is finalized against
# the predictor columns of explore_data (everything except tgt).
rf_grid <- dials::parameters(
  finalize(mtry(), select(explore_data, -tgt)),
  trees(),
  min_n())

# Bundle the recipe and the model specification into a single workflow.
tune_wf <- workflow() %>%
  add_recipe(preprocess) %>%
  add_model(rf_mod)

# Tune the models
library(doParallel) 
library(finetune)
# Register a parallel backend that uses all detected cores but one.
no_cores <- detectCores() - 1  
registerDoParallel(cores=no_cores)  

# This is the bit where I get stuck
# Re-seed before tuning, then call tune_race_anova() on the workflow with the
# resamples, rf_grid as `grid`, and a metric set passed through `perf`.
set.seed(345)
tune_res_rf <- tune_race_anova(tune_wf,
                         resamples = report_resamples,
                         grid = rf_grid,
                         perf = metric_set(roc_auc, sens, spec, kap, accuracy)
)

# Shut down the implicit cluster created by registerDoParallel().
doParallel::stopImplicitCluster()

Thank you for your time

I don't see any issues with that code (although I can't reproduce the results). Does an error occur?

That's very odd. We'll need a small, reproducible example to test with. Can you substitute another data set to get the error?

Sure, please see below. There is a factor warning, but it does not occur with my original model.
It takes under 2 minutes to run.

library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 3.6.3
#> Warning: package 'ggplot2' was built under R version 3.6.3
#> Warning: package 'tibble' was built under R version 3.6.3
#> Warning: package 'tidyr' was built under R version 3.6.3
#> Warning: package 'readr' was built under R version 3.6.2
#> Warning: package 'purrr' was built under R version 3.6.3
#> Warning: package 'dplyr' was built under R version 3.6.3
#> Warning: package 'stringr' was built under R version 3.6.2
#> Warning: package 'forcats' was built under R version 3.6.2
library(tidymodels)
#> Warning: package 'tidymodels' was built under R version 3.6.3
#> -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
#> v broom     0.7.2      v recipes   0.1.15
#> v dials     0.0.9      v rsample   0.0.8 
#> v infer     0.5.3      v tune      0.1.2 
#> v modeldata 0.1.0      v workflows 0.2.1 
#> v parsnip   0.1.4      v yardstick 0.0.7
#> Warning: package 'broom' was built under R version 3.6.3
#> Warning: package 'dials' was built under R version 3.6.3
#> Warning: package 'scales' was built under R version 3.6.2
#> Warning: package 'infer' was built under R version 3.6.3
#> Warning: package 'modeldata' was built under R version 3.6.3
#> Warning: package 'parsnip' was built under R version 3.6.3
#> Warning: package 'recipes' was built under R version 3.6.3
#> Warning: package 'rsample' was built under R version 3.6.3
#> Warning: package 'tune' was built under R version 3.6.3
#> Warning: package 'workflows' was built under R version 3.6.3
#> Warning: package 'yardstick' was built under R version 3.6.3
#> -- Conflicts ----------------------------------------- tidymodels_conflicts() --
#> x scales::discard() masks purrr::discard()
#> x dplyr::filter()   masks stats::filter()
#> x recipes::fixed()  masks stringr::fixed()
#> x dplyr::lag()      masks stats::lag()
#> x yardstick::spec() masks readr::spec()
#> x recipes::step()   masks stats::step()
library(finetune)
#> Warning: package 'finetune' was built under R version 3.6.3
library(doParallel) 
#> Warning: package 'doParallel' was built under R version 3.6.3
#> Loading required package: foreach
#> Warning: package 'foreach' was built under R version 3.6.2
#> 
#> Attaching package: 'foreach'
#> The following objects are masked from 'package:purrr':
#> 
#>     accumulate, when
#> Loading required package: iterators
#> Warning: package 'iterators' was built under R version 3.6.2
#> Loading required package: parallel
library(modeldata)

# Fix the RNG seed so the split and the resamples below are reproducible.
set.seed(4595)
# Load the credit_data example data set.
data("credit_data")
# Split with prop = 0.75 (75% training), stratified on the outcome Status.
data_split <- initial_split(credit_data, strata = "Status", prop = 0.75)

# Extract the two partitions from the split object.
train_explore <- training(data_split)
test_explore  <- testing(data_split)

# Generate resamples: 10-fold cross-validation on the training set
# (a single repeat), stratified on Status.
report_resamples <- vfold_cv(train_explore, v = 10, repeats = 1, strata = Status)

# Set up the preprocessing recipe: Status is the outcome and every other
# column is a predictor; down-sample on Status, then dummy-encode whatever
# all_nominal() selects.
preprocess <- train_explore %>%
  recipe(Status ~ .) %>%
  themis::step_downsample(Status) %>% 
  step_dummy(all_nominal())
#> Registered S3 methods overwritten by 'themis':
#>   method                  from   
#>   bake.step_downsample    recipes
#>   bake.step_upsample      recipes
#>   prep.step_downsample    recipes
#>   prep.step_upsample      recipes
#>   tidy.step_downsample    recipes
#>   tidy.step_upsample      recipes
#>   tunable.step_downsample recipes
#>   tunable.step_upsample   recipes

# BUILD A RANDOM FOREST MODEL ---------------------------------------------
# Classification random forest on the ranger engine; mtry, trees and min_n
# are all marked for tuning with tune().
rf_mod <- rand_forest(
  mtry = tune(),
  trees = tune(),
  min_n = tune()) %>%
  set_mode("classification") %>%
  set_engine("ranger")

# Parameter objects for the three tuned arguments. mtry() is finalized against
# the predictor columns of credit_data (everything except Status).
rf_grid <- dials::parameters(
  finalize(mtry(), select(credit_data, -Status)),
  trees(),
  min_n())

# Bundle the recipe and the model specification into a single workflow.
tune_wf <- workflow() %>%
  add_recipe(preprocess) %>%
  add_model(rf_mod)

# Register a parallel backend that uses all detected cores but one.
no_cores <- detectCores() - 1  
registerDoParallel(cores=no_cores)  

# Re-seed before tuning, then call tune_race_anova() on the workflow with the
# resamples, rf_grid as `grid`, and a metric set passed through `perf`.
# The warnings and the error recorded below are the output of this call.
set.seed(345)
tune_res_rf <- tune_race_anova(tune_wf,
                               resamples = report_resamples,
                               grid = rf_grid,
                               perf = metric_set(roc_auc, sens, kap, accuracy)
)
#> Warning: The `...` are not used in this function but one or more objects were
#> passed: 'perf'
#> i Creating pre-processing data to finalize unknown parameter: mtry
#> Warning: There are new levels in a factor: NA
#> Warning: There are new levels in a factor: NA
#> Error: The provided `grid` has the following parameter columns that have not been marked for tuning by `tune()`: 'name', 'id', 'source', 'component', 'component_id', 'object'.

# Shut down the implicit cluster created by registerDoParallel().
doParallel::stopImplicitCluster()

Created on 2021-02-03 by the reprex package (v0.3.0)

There were three problems:

  • There isn't a perf argument; I think you meant metrics. I didn't see that either when I looked at your code.
  • In the recipe, step_dummy(all_nominal()) was capturing the outcome. This happens a lot and the devel version of recipes has all_nominal_predictors(). Until then, use step_dummy(all_nominal(), -Status). However, the ranger package does not require dummy variables for predictors, so you can skip that if you want.
  • The grid code returns a parameter set (a description of the tuning parameters), not a grid of candidate values. You could pass this to the param_info argument, or make the grid with one of the grid functions, such as:
# Build the parameter set for the three tuned arguments (mtry is finalized
# against the predictor columns), then draw 5 random candidate combinations.
# `size` is named explicitly: the second positional argument of grid_random()
# is `...`, so a bare `5` would not be matched to `size`.
rf_grid <- dials::parameters(
  finalize(mtry(), select(credit_data, -Status)),
  trees(),
  min_n()
) %>%
  grid_random(size = 5)

One other thing... this data set has some missing values so you might want to add one of the imputation steps to the recipe (otherwise ranger will error).

Here's my script:

library(tidyverse)
library(tidymodels)
library(finetune)

# Seed the RNG so the split, the resamples and the random grid below are
# reproducible.
set.seed(4595)
data("credit_data")

# Drop rows with missing values up front (an imputation step in the recipe
# would be the alternative).
credit_data <- credit_data %>%
  na.omit()

# Split with prop = 0.75 (75% training), stratified on the outcome Status.
data_split <- initial_split(credit_data, strata = Status, prop = 0.75)

train_explore <- training(data_split)
test_explore <- testing(data_split)

# Generate resamples: 10-fold cross-validation on the training set
# (a single repeat), stratified on Status.
report_resamples <- vfold_cv(
  train_explore,
  v = 10,
  repeats = 1,
  strata = Status
)

# Set up the preprocessing recipe: Status is the outcome and every other
# column is a predictor; the only step is a down-sampling step on Status.
# No dummy-variable step is included.
preprocess <- train_explore %>%
  recipe(Status ~ .) %>%
  themis::step_downsample(Status)

# BUILD A RANDOM FOREST MODEL ---------------------------------------------
# Classification random forest on the ranger engine; mtry, trees and min_n
# are all marked for tuning with tune().
rf_mod <- rand_forest(
  mtry = tune(),
  trees = tune(),
  min_n = tune()
) %>%
  set_mode("classification") %>%
  set_engine("ranger")

# Build the parameter set (mtry is finalized against the predictor columns),
# then turn it into an actual grid of 5 random candidate combinations.
# `size` is named explicitly: the second positional argument of grid_random()
# is `...`, so a bare `5` would not be matched to `size`.
rf_grid <- dials::parameters(
  finalize(mtry(), select(credit_data, -Status)),
  trees(),
  min_n()
) %>%
  grid_random(size = 5)

# Bundle the recipe and the model specification into a single workflow.
tune_wf <- workflow() %>%
  add_recipe(preprocess) %>%
  add_model(rf_mod)

# Re-seed before tuning, then run the racing procedure over the resamples.
# The metric set is passed through `metrics`.
set.seed(345)
tune_res_rf <- tune_race_anova(
  tune_wf,
  resamples = report_resamples,
  grid = rf_grid,
  metrics = metric_set(roc_auc, sens, kap, accuracy)
)
1 Like

Thanks very much, @Max.
That did the trick.

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.

Hi @Max

When I run this portion:

# The tuning call quoted from the original post: tune_wf over
# report_resamples, with rf_grid passed as `grid` and a metric set passed
# through `perf`.
tune_res_rf <- tune_race_anova(tune_wf,
                         resamples = report_resamples,
                         grid = rf_grid,
                         perf = metric_set(roc_auc, sens, spec, kap, accuracy)
)

I get the error:

The provided `grid` has the following parameter columns that have not been marked for tuning by `tune()`: 'name', 'id', 'source', 'component', 'component_id', 'object'.

One thing I can see is that I have not set the grid size anywhere, but I am not sure where to set it either 🙂