Tidymodels: Tuning hyperparameters in a bagged tree using the tune_grid() function

Overview

I have produced four models using the tidymodels package with the data frame FID (see below):

  1. General Linear Model
  2. Bagged Tree
  3. Random Forest
  4. Boosted Trees

The data frame FID contains three predictors:

  1. Year (numeric)
  2. Month (Factor)
  3. Days (numeric)

The dependent variable is Frequency (numeric)

I am attempting to tune my bagged tree model, which I produced with the function bag_tree() from the baguette package, and I am experiencing issues when using the tune_grid() function.

If anyone can help, I would be deeply appreciative.

Many thanks.

Error Message

   #Error messages
            ! Fold02: internal: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide b...
            ! Fold07: internal: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide b...
            ! Fold08: internal: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide b...
            ! Fold10: internal: A correlation computation is required, but `estimate` is constant and has 0 sta...

            Warning message:
            This tuning result has notes. Example notes on model fitting include:
            internal: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be returned.
            internal: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be returned.
            internal: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be returned. 

R-Code

library(baguette) 
library(tidymodels)

 # Fix the random-number state so the split and the folds are reproducible.
 # (Base R's function is set.seed(); there is no seed().)
 set.seed(45L)

 # Split this single dataset into two: a training set and a testing set
 data_split <- initial_split(FID)

 # Create data frames for the two sets
 train_data <- training(data_split)
 test_data <- testing(data_split)

 # Resample the training data with 10-fold cross-validation
 cv <- vfold_cv(train_data, v = 10)

 ###########################################################
  ##Produce the recipe

 # Preprocessing recipe for Frequency against all other columns.
 # It is specified from the training rows only, so the held-out test set
 # plays no part in defining the preprocessing.
 rec <- recipe(Frequency ~ ., data = train_data) %>%
   # remove variables with zero variance
   step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>%
   # prepare the data to handle previously unseen factor levels
   step_novel(all_nominal()) %>%
   # replace missing numeric predictor values with the median
   # (step_impute_median() is the current name of step_medianimpute())
   step_impute_median(all_numeric(), -all_outcomes(), -has_role("id vars")) %>%
   # dummy-code categorical variables
   step_dummy(all_nominal(), -has_role("id vars"))

   #########################################################
   #########################################################
    ##Bagged Tree Model 
   #########################################################
   #########################################################

  ## Produce the model: a bagged tree in regression mode on the rpart
  ## engine, with times = 10 (10 bootstrap resamples)
  mod_bag <- bag_tree()
  mod_bag <- set_mode(mod_bag, "regression")
  mod_bag <- set_engine(mod_bag, "rpart", times = 10)

  ## Create the workflow: the recipe first, then the model specification
  wflow_bag <- workflow()
  wflow_bag <- add_recipe(wflow_bag, rec)
  wflow_bag <- add_model(wflow_bag, mod_bag)

    ## Fit the model on every resample.
    ## plan() and multisession belong to the future package, which is not
    ## attached by the library() calls in this script, so both names are
    ## namespace-qualified here.
    future::plan(future::multisession)

    fit_bag <- fit_resamples(
      wflow_bag,
      resamples = cv,
      metrics = metric_set(rmse, rsq),
      control = control_resamples(
        save_pred = TRUE,
        # extract_model() is deprecated; extract_fit_engine() pulls the
        # engine-level fit out of the fitted workflow instead.
        extract = function(x) extract_fit_engine(x)
      )
    )

    ## Collect the metrics for the bagged trees
    fit_bag %>% collect_metrics()

    ## Collect model predictions for each fold for the number of
    ## blue whale sightings
    bag_predictions <- fit_bag %>% collect_predictions()

     #######Tuning hyperparameters
      
      ## Find the best tree depth by training one model per candidate value
      ## on the resampled data sets and comparing how well each performs.
      tune_spec_bag <-
        bag_tree(tree_depth = tune()) %>%
        set_mode("regression") %>%
        set_engine("rpart", times = 10)

      # Create a regular grid of 10 candidate values for tree_depth
      bag_grid <- grid_regular(
        tree_depth(),
        levels = 10
      )

      # Create the workflow for the tuned bagged model: keep the recipe from
      # wflow_bag and swap in the tunable specification. This is the workflow
      # that is actually tuned below (previously bag_wf was built from a bare
      # formula and then never used).
      bag_wf <- wflow_bag %>%
        update_model(tune_spec_bag)

      # Tune the bagged tree model.
      # The arguments are comma-separated (the comma after `metrics` was
      # missing, which is a parse error) and tune_grid() takes its control
      # object from control_grid(), not control_resamples().
      bag_res <- tune_grid(
        bag_wf,
        resamples = cv,
        grid = bag_grid,
        metrics = metric_set(rmse, rsq),
        control = control_grid(save_pred = TRUE)
      )

      # NOTE(review): the "`estimate` is constant" notes are presumably raised
      # by rsq, which needs a correlation between truth and prediction. FID has
      # only 36 rows, so each of the 10 assessment folds holds very few rows
      # and a shallow tree can predict the same value for all of them; rsq is
      # then NA for that fold while rmse is unaffected. Options to confirm:
      # fewer folds (e.g. v = 5), bootstrap resamples, or also tuning min_n.

         #Error messages
            ! Fold02: internal: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide b...
            ! Fold07: internal: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide b...
            ! Fold08: internal: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide b...
            ! Fold10: internal: A correlation computation is required, but `estimate` is constant and has 0 sta...

            Warning message:
            This tuning result has notes. Example notes on model fitting include:
            internal: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be returned.
            internal: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be returned.
            internal: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be returned. 

Data Frame - FID

# dput() output of the data frame, bound to the name FID that the script
# above reads. Run this assignment before the modelling code.
FID <- structure(list(Year = c(2015, 2015, 2015, 2015, 2015, 2015, 2015, 
2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016, 2016, 2016, 
2016, 2016, 2016, 2016, 2016, 2016, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017), Month = structure(c(1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 9L, 10L, 11L, 12L), .Label = c("January", "February", "March", 
"April", "May", "June", "July", "August", "September", "October", 
"November", "December"), class = "factor"), Frequency = c(36, 
28, 39, 46, 5, 0, 0, 22, 10, 15, 8, 33, 33, 29, 31, 23, 8, 9, 
7, 40, 41, 41, 30, 30, 44, 37, 41, 42, 20, 0, 7, 27, 35, 27, 
43, 38), Days = c(31, 28, 31, 30, 6, 0, 0, 29, 15, 
29, 29, 31, 31, 29, 30, 30, 7, 0, 7, 30, 30, 31, 30, 27, 31, 
28, 30, 30, 21, 0, 7, 26, 29, 27, 29, 29)), row.names = c(NA, 
-36L), class = "data.frame")

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.