Tidymodels missing data error when tuning

noveld · June 22, 2021, 3:21pm

I am using tidymodels to fit a random forest model to the famous Titanic dataset from Kaggle.

The tune results show that there is missing data in the Parch column. I think it is because not all factor levels are present in the tuning data after the vfold_cv split, but I am not sure how to solve it.

I followed the below steps:

After loading the data and processing it using a recipe there is no missing data:

library(tidyverse) 
library(tidymodels)

# Load the Titanic training file and preview its columns.
data <- read.csv(file = "train.csv")
glimpse(data)
---------------------------------------------------------------------------------------------------
Rows: 891
Columns: 12
$ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 3~
$ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0~
$ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3, 2, 2, 3, 1, 3, 3, 3, 1, 3, 3, 1, 1, 3, 2, 1, 1, 3, 3, 3, 3, 3, 2, 3, 2, 3, 3~
$ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Florence Briggs Thayer)", "Heikkinen, Miss. Laina", "Futrelle, Mrs. Jacques Heat~
$ Sex         <chr> "male", "female", "female", "female", "male", "male", "male", "male", "female", "female", "female", "female", "male", "male", "female", ~
$ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, 55, 2, NA, 31, NA, 35, 34, 15, 28, 8, 38, NA, 19, NA, NA, 40, NA, NA, 66, 28, ~
$ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0, 0, 0, 0, 0, 3, 1, 0, 3, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0~
$ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 5, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0~
$ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "373450", "330877", "17463", "349909", "347742", "237736", "PP 9549", "113783", "~
$ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625, 21.0750, 11.1333, 30.0708, 16.7000, 26.5500, 8.0500, 31.2750, 7.8542, 16.0000~
$ Cabin       <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C103", "", "", "", "", "", "", "", "", "", "D56", "", "A6", "", "", "", "C23 C2~
$ Embarked    <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S", "S", "S", "S", "S", "Q", "S", "S", "C", "S", "S", "Q", "S", "S", "S", "C", "~
-------------------------------------------------------------------------------------

# Add missing values for Embarked
# Rows on ticket "113572" get Embarked = "S"; every other row keeps its
# current value. Ticket is a character column, so it is compared against a
# string rather than relying on implicit number-to-string coercion.
data <- data %>%
  mutate(
    Embarked = case_when(
      Ticket == "113572" ~ "S",
      TRUE ~ Embarked
    )
  )

# Split data
# Fix the RNG seed so the train/test split and the cross-validation folds
# (and therefore every result further down) can be reproduced.
set.seed(123)
data_split <- initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
# Cross-validation folds on the training set, stratified by the outcome.
data_folds <- vfold_cv(data_train, strata = Survived)

# Recipe
# Predict Survived from class, sex, age, family counts, fare and port.
data_recipe <-
  recipe(
    Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
    data = data_train
  ) %>%
  # Turn the integer/character coded columns into factors.
  step_mutate(
    Pclass = factor(Pclass),
    SibSp = factor(SibSp),
    Parch = factor(Parch),
    Embarked = factor(Embarked),
    Survived = factor(Survived)
  ) %>%
  # factor() is re-evaluated on whatever data is being baked, so a resample's
  # assessment set can contain rare SibSp/Parch values that its analysis set
  # never saw. Those unseen levels turn into missing values at prediction
  # time ("Missing data in columns: Parch"). step_novel() maps them to a
  # dedicated "new" level instead.
  step_novel(SibSp, Parch) %>%
  # Impute Age with bagged trees built from all predictors.
  step_impute_bag(
    Age,
    impute_with = imp_vars(all_predictors()),
    options = list(nbagg = 5, keepX = FALSE)
  )

# Prep the recipe (verbose, so each step reports as it is estimated) and
# print the prepped object.
data_recipe_prepped <- prep(data_recipe, verbose = TRUE)
data_recipe_prepped

# Check prep data: first rows of the processed data.
head(juice(data_recipe_prepped))
---------------------------------------------------------------------------
# A tibble: 6 x 8
  Pclass Sex      Age SibSp Parch  Fare Embarked Survived
  <fct>  <fct>  <dbl> <fct> <fct> <dbl> <fct>    <fct>   
1 1      male   56    0     0      35.5 C        1       
2 1      female 25    1     2     152.  S        0       
3 3      male   11    5     2      46.9 S        0       
4 1      female 24    0     0      83.2 C        1       
5 2      female  7    0     2      26.2 S        1       
6 3      male    8.17 3     1      25.5 S        0       

# Count the missing values in every column of the processed data.
juice(data_recipe_prepped) %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  head()
# A tibble: 1 x 8
  Pclass   Sex   Age SibSp Parch  Fare Embarked Survived
   <int> <int> <int> <int> <int> <int>    <int>    <int>
1      0     0     0     0     0     0        0        0
----------------------------------------------------------------------------

I set up the model, workflow and tune() the parameters:

# Model
# NOTE(review): `rf_model` is used by the workflow but was never defined in
# the post. This spec is reconstructed from the tuning grid (mtry, min_n),
# the classification metrics (accuracy, roc_auc) and the ranger-style error
# text quoted in the tune notes -- confirm against the author's real spec.
rf_model <-
  rand_forest(mtry = tune(), min_n = tune()) %>%
  set_engine("ranger") %>%
  set_mode("classification")

# Workflow
# Bundle the unprepped recipe with the model so preprocessing is re-estimated
# inside every resample.
rf_workflow <-
  workflow() %>%
  add_recipe(data_recipe) %>%
  add_model(rf_model)

# Tune parameters
# Regular grid with two levels each for mtry (range 1-2) and min_n
# (range 40-50).
rf_grid <- grid_regular(
  mtry(range = c(1, 2)),
  min_n(range = c(40, 50)),
  levels = 2
)
# rf_grid

# Test the grid values on the cross-validation folds, scoring accuracy and
# ROC AUC.
rf_tune_results <- tune_grid(
  rf_workflow,
  resamples = data_folds,
  grid = rf_grid,
  metrics = metric_set(accuracy, roc_auc)
)

# Show tune notes
rf_tune_results[[".notes"]]
---------------------------------------------------------------------------------------
# A tibble: 4 x 1
  .notes                                                                           
  <chr>                                                                            
1 preprocessor 1/1, model 1/4 (predictions): Error: Missing data in columns: Parch.
2 preprocessor 1/1, model 2/4 (predictions): Error: Missing data in columns: Parch.
3 preprocessor 1/1, model 3/4 (predictions): Error: Missing data in columns: Parch.
4 preprocessor 1/1, model 4/4 (predictions): Error: Missing data in columns: Parch.
-----------------------------------------------------------------------------------------

system · July 13, 2021, 3:21pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.