I am using tidymodels
to create a Random Forrest prediction on the famous Titanic dataset from Kaggle:
The tune results show that there is missing data in the Parch
columns. I think it is because not all factors are present in the tuning data after the vfold_cv
split but I am not sure how to solve it.
I followed the below steps:
After loading the data and processing it using a recipe
there is no missing data:
library(tidyverse)
library(tidymodels)
data <- read.csv('train.csv')
glimpse(data)
---------------------------------------------------------------------------------------------------
Rows: 891
Columns: 12
$ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 3~
$ Survived <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0~
$ Pclass <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3, 2, 2, 3, 1, 3, 3, 3, 1, 3, 3, 1, 1, 3, 2, 1, 1, 3, 3, 3, 3, 3, 2, 3, 2, 3, 3~
$ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Florence Briggs Thayer)", "Heikkinen, Miss. Laina", "Futrelle, Mrs. Jacques Heat~
$ Sex <chr> "male", "female", "female", "female", "male", "male", "male", "male", "female", "female", "female", "female", "male", "male", "female", ~
$ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, 55, 2, NA, 31, NA, 35, 34, 15, 28, 8, 38, NA, 19, NA, NA, 40, NA, NA, 66, 28, ~
$ SibSp <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0, 0, 0, 0, 0, 3, 1, 0, 3, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0~
$ Parch <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 5, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0~
$ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "373450", "330877", "17463", "349909", "347742", "237736", "PP 9549", "113783", "~
$ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625, 21.0750, 11.1333, 30.0708, 16.7000, 26.5500, 8.0500, 31.2750, 7.8542, 16.0000~
$ Cabin <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C103", "", "", "", "", "", "", "", "", "", "D56", "", "A6", "", "", "", "C23 C2~
$ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S", "S", "S", "S", "S", "Q", "S", "S", "C", "S", "S", "Q", "S", "S", "S", "C", "~
-------------------------------------------------------------------------------------
#Add missing values for Embarked
data <- data %>% mutate(
Embarked = case_when(
Ticket == 113572 ~"S",
TRUE ~Embarked
)
)
#Split data
data_split <- initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
data_folds <- vfold_cv(data_train, strata = Survived)
#Recipe
data_recipe <-
recipe(Survived ~Pclass + Sex + Age + SibSp + Parch + Fare + Embarked, data=data_train) %>%
step_mutate(Pclass = factor(Pclass),
SibSp = factor(SibSp),
Parch = factor(Parch),
Embarked = factor(Embarked),
Survived = factor(Survived)) %>%
step_impute_bag(Age,
impute_with = imp_vars(all_predictors()),
options = list(nbagg = 5, keepX = FALSE))
#Prep Recipe
data_recipe_prepped <-
data_recipe %>%
prep(verbose=TRUE)
data_recipe_prepped
#Check prep data
juice(data_recipe_prepped) %>%
head()
---------------------------------------------------------------------------
# A tibble: 6 x 8
Pclass Sex Age SibSp Parch Fare Embarked Survived
<fct> <fct> <dbl> <fct> <fct> <dbl> <fct> <fct>
1 1 male 56 0 0 35.5 C 1
2 1 female 25 1 2 152. S 0
3 3 male 11 5 2 46.9 S 0
4 1 female 24 0 0 83.2 C 1
5 2 female 7 0 2 26.2 S 1
6 3 male 8.17 3 1 25.5 S 0
juice(data_recipe_prepped) %>%
summarise_all(~sum(is.na(.x))) %>% head()
# A tibble: 1 x 8
Pclass Sex Age SibSp Parch Fare Embarked Survived
<int> <int> <int> <int> <int> <int> <int> <int>
1 0 0 0 0 0 0 0 0
----------------------------------------------------------------------------
I set up the model, workflow and tune()
the parameters:
#Workflow
rf_workflow <-
workflow() %>%
add_recipe(data_recipe) %>%
add_model(rf_model)
#Tune parameters
#Make a grid of tuning values
rf_grid <- grid_regular(
mtry(range = c(1,2)),
min_n(range = c(40, 50)),
levels = 2
)
#rf_grid
#Test the grid values
rf_tune_results <-
rf_workflow %>%
tune_grid(
resamples = data_folds,
grid = rf_grid,
metrics = metric_set(accuracy, roc_auc)
)
#Show tune notes
rf_tune_results$.notes
---------------------------------------------------------------------------------------
# A tibble: 4 x 1
.notes
<chr>
1 preprocessor 1/1, model 1/4 (predictions): Error: Missing data in columns: Parch.
2 preprocessor 1/1, model 2/4 (predictions): Error: Missing data in columns: Parch.
3 preprocessor 1/1, model 3/4 (predictions): Error: Missing data in columns: Parch.
4 preprocessor 1/1, model 4/4 (predictions): Error: Missing data in columns: Parch.
-----------------------------------------------------------------------------------------