I am new to machine learning, modelling, etc. and this is my first post. Apologies in advance for any newbie errors.
I was playing around with the Kaggle Titanic dataset to practice fitting models using tidymodels. The code is attached below. I would add the CSV data files, but I can't figure out whether, or how, I can do that with this interface.
The problem: the test file (the one that would be used to submit the results to Kaggle, which I called titanic_predict in the script) contains 418 records. When I run the script from start to end, the XGBoost workflow produces the expected 418 predictions. KNN, on the other hand, returns 417.
I tried everything that I can think of (including a PC restart) to figure out what is happening and why, and I am at a complete loss. Does anyone have any thoughts or ideas? I am going a bit crazy trying to figure this out.
Thank you in advance.
# Titanic survival models with tidymodels.
#
# Reads the Kaggle train/test CSVs, builds one shared preprocessing recipe,
# tunes a KNN workflow and an XGBoost workflow on cross-validation folds,
# refits each finalized workflow on the full training file, and predicts the
# Kaggle test file (`titanic_predict`).
library(tidyverse)
library(tidymodels)

titanic <- read_csv("data/Titanic/train.csv")
titanic_predict <- read_csv("data/Titanic/test.csv")

# Split Data ----
set.seed(123)
titanic_split <- initial_split(titanic, strata = Survived)
titanic_train <- training(titanic_split)
titanic_test <- testing(titanic_split)

set.seed(234)
titanic_folds <- vfold_cv(titanic_train, strata = Survived)

# Pre Process ----
# NOTE(review): `Survived` is still numeric when the Embarked imputation is
# defined, so `all_numeric()` may select the outcome as an imputation
# variable. Converting `Survived` to a factor before splitting would avoid
# that and remove the need for the `skip = TRUE` step -- confirm and consider.
titanic_rec <- recipe(Survived ~ ., data = titanic_train) %>%
  step_rm(PassengerId) %>%
  step_knnimpute(Embarked, impute_with = imp_vars(all_numeric())) %>%
  # skip = TRUE: the outcome is only converted while the recipe is being
  # trained; new data (which has no `Survived` column) is left untouched.
  step_mutate(Survived = factor(Survived), skip = TRUE) %>%
  step_knnimpute(
    Age,
    neighbors = tune(id = "recipe_neighbors"),
    impute_with = imp_vars(all_numeric())
  ) %>%
  # Fix for the 417-vs-418 prediction mismatch: Fare was never imputed, so a
  # missing Fare in the data being predicted reached the model as NA. The
  # kknn engine drops incomplete rows, whereas xgboost accepts NA, which is
  # why only the KNN workflow returned one prediction too few. The median is
  # learned from the training data. (Named `step_impute_median()` in newer
  # recipes releases; the old name is used to match `step_knnimpute()`.)
  step_medianimpute(Fare) %>%
  # Cabin is reduced to a has-cabin / no-cabin indicator.
  step_mutate(
    Cabin = case_when(
      is.na(Cabin) ~ "No Cabin",
      TRUE ~ "Cabin"
    )
  ) %>%
  step_mutate(Cabin = factor(Cabin)) %>%
  step_rm(Name) %>%
  # Ticket is replaced by the length of the ticket string, treated as a
  # category; infrequent lengths are pooled by step_other().
  step_mutate(Ticket = str_length(Ticket)) %>%
  step_mutate(Ticket = factor(Ticket)) %>%
  step_other(Ticket, threshold = tune()) %>%
  # Uses Age after imputation, so Child is never NA because of a missing Age.
  step_mutate(Child = if_else(Age < 13, "Child", "Not Child")) %>%
  step_mutate(Child = factor(Child)) %>%
  step_rm(SibSp, Parch) %>%
  step_mutate(Pclass = factor(Pclass)) %>%
  step_log(all_numeric(), offset = 1) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_predictors(), -Age, -Fare)

# KNN Workflow ----
knn_model <- nearest_neighbor() %>%
  set_engine("kknn") %>%
  set_mode("classification") %>%
  set_args(neighbors = tune(), weight_func = tune(), dist_power = tune())

knn_wf <- workflow() %>%
  add_model(knn_model) %>%
  add_recipe(titanic_rec)

knn_tune_params <- parameters(knn_wf)

knn_grid <- knn_tune_params %>%
  grid_max_entropy(size = 15)

knn_res <- knn_wf %>%
  tune_grid(
    resamples = titanic_folds,
    grid = knn_grid,
    control = control_grid(
      save_pred = TRUE,
      verbose = TRUE
    )
  )

knn_final_params <- knn_res %>%
  select_best("accuracy")

knn_final_wf <- knn_wf %>%
  finalize_workflow(knn_final_params)

# Fit on the training split and evaluate on the held-out split.
knn_final_model <- knn_final_wf %>%
  last_fit(titanic_split)

# Refit on the complete training file before predicting the Kaggle test file.
knn_final_fit <- knn_final_wf %>%
  fit(titanic)

knn_predict <- knn_final_fit %>%
  predict(titanic_predict)

# Xgboost Workflow ----
xgb_model <- boost_tree() %>%
  set_engine("xgboost") %>%
  set_mode("classification") %>%
  set_args(
    tree_depth = tune(),
    trees = 500,
    mtry = tune(),
    min_n = tune(),
    loss_reduction = tune(),
    sample_size = tune()
  )

xgb_wf <- workflow() %>%
  add_model(xgb_model) %>%
  add_recipe(titanic_rec)

# The upper bound of mtry depends on the data, so it is finalized against the
# training set before a grid can be generated.
xgb_tune_params <- parameters(xgb_wf) %>%
  update(mtry = finalize(mtry(), titanic_train))

xgb_grid <- xgb_tune_params %>%
  grid_max_entropy(size = 15)

xgb_res <- xgb_wf %>%
  tune_grid(
    resamples = titanic_folds,
    grid = xgb_grid,
    control = control_grid(
      save_pred = TRUE,
      verbose = TRUE
    )
  )

xgb_final_params <- xgb_res %>%
  select_best("accuracy")

xgb_final_wf <- xgb_wf %>%
  finalize_workflow(xgb_final_params)

xgb_final_model <- xgb_final_wf %>%
  last_fit(titanic_split)

xgb_final_fit <- xgb_final_wf %>%
  fit(titanic)

xgb_predict <- xgb_final_fit %>%
  predict(titanic_predict)

# Guard against silently dropped rows: a submission needs exactly one
# prediction per record in the test file.
stopifnot(
  nrow(knn_predict) == nrow(titanic_predict),
  nrow(xgb_predict) == nrow(titanic_predict)
)