I am new to machine learning, modelling, etc. and this is my first post. Apologies in advance for any newbie errors.
I was playing around with the Kaggle Titanic dataset to practice fitting models using tidymodels. The code is attached below. I would attach the CSV data files, but I can't figure out whether (or how) to do that with this interface.
The problem: the test file (the one that would be used to submit the results to Kaggle that I called titanic_predict in the script) contains 418 records. When I run the script from start to end, the Xgboost workflow produces the expected 418 predictions. KNN, on the other hand, returns 417.
I tried everything that I can think of (including a PC restart) to figure out what is happening and why, and I am at a complete loss. Does anyone have any thoughts or ideas? I am going a bit crazy trying to figure this out.
Thank you in advance.
library(tidyverse)
library(tidymodels)
titanic <- read_csv("data/Titanic/train.csv")
titanic_predict <- read_csv("data/Titanic/test.csv")
#Split Data
set.seed(123)
titanic_split <- initial_split(titanic, strata = Survived)
titanic_train <- training(titanic_split)
titanic_test <- testing(titanic_split)
set.seed(234)
titanic_folds <- vfold_cv(titanic_train, strata = Survived)
#Pre Process
titanic_rec <-
recipe(Survived ~ ., data = titanic_train) %>%
step_rm(PassengerId) %>%
step_knnimpute(Embarked,
impute_with = imp_vars(all_numeric())) %>%
step_mutate(Survived = factor(Survived), skip = TRUE) %>%
step_knnimpute(Age,
neighbors = tune(id = "recipe_neighbors"),
impute_with = imp_vars(all_numeric())) %>%
step_mutate(
Cabin = case_when(
is.na(Cabin) ~ "No Cabin",
TRUE ~ "Cabin"
)
) %>%
step_mutate(Cabin = factor(Cabin)) %>%
step_rm(Name) %>%
step_mutate(Ticket = str_length(Ticket)) %>%
step_mutate(Ticket = factor(Ticket)) %>%
step_other(Ticket, threshold = tune()) %>%
step_mutate(Child = if_else(Age < 13, "Child", "Not Child")) %>%
step_mutate(Child = factor(Child)) %>%
step_rm(SibSp, Parch) %>%
step_mutate(Pclass = factor(Pclass)) %>%
step_log(all_numeric(), offset = 1) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_predictors(), -Age, -Fare)
#KNN Workflow
knn_model <-
nearest_neighbor() %>%
set_engine("kknn") %>%
set_mode("classification") %>%
set_args(neighbors = tune(),
weight_func = tune(),
dist_power = tune())
knn_wf <-
workflow() %>%
add_model(knn_model) %>%
add_recipe(titanic_rec)
knn_tune_params <-
parameters(knn_wf)
knn_grid <-
knn_tune_params %>%
grid_max_entropy(size = 15)
knn_res <-
knn_wf %>%
tune_grid(resamples = titanic_folds,
grid = knn_grid,
control = control_grid(
save_pred = TRUE,
verbose = TRUE
))
knn_final_params <-
knn_res %>%
select_best("accuracy")
knn_final_wf <-
knn_wf %>%
finalize_workflow(knn_final_params)
knn_final_model <-
knn_final_wf %>%
last_fit(titanic_split)
knn_final_fit <-
knn_final_wf %>%
fit(titanic)
knn_predict <-
knn_final_fit %>%
predict(titanic_predict)
#Xgboost Workflow
xgb_model <-
boost_tree() %>%
set_engine("xgboost") %>%
set_mode("classification") %>%
set_args(tree_depth = tune(),
trees = 500,
mtry = tune(),
min_n = tune(),
loss_reduction = tune(),
sample_size = tune())
xgb_wf <-
workflow() %>%
add_model(xgb_model) %>%
add_recipe(titanic_rec)
xgb_tune_params <-
parameters(xgb_wf) %>%
update(mtry = finalize(mtry(), titanic_train))
xgb_grid <-
xgb_tune_params %>%
grid_max_entropy(size = 15)
xgb_res <-
xgb_wf %>%
tune_grid(resamples = titanic_folds,
grid = xgb_grid,
control = control_grid(
save_pred = TRUE,
verbose = TRUE
))
xgb_final_params <-
xgb_res %>%
select_best("accuracy")
xgb_final_wf <-
xgb_wf %>%
finalize_workflow(xgb_final_params)
xgb_final_model <-
xgb_final_wf %>%
last_fit(titanic_split)
xgb_final_fit <-
xgb_final_wf %>%
fit(titanic)
xgb_predict <-
xgb_final_fit %>%
predict(titanic_predict)