I am having trouble fitting models with missing data, even when applying the step_naomit()
function from {recipes}.
library(reprex)
#> Warning: package 'reprex' was built under R version 3.6.3
library(tidyverse)
library(tidymodels)
#> -- Attaching packages --------------------------------------------------------------- tidymodels 0.1.0 --
#> v broom 0.5.4 v rsample 0.0.5
#> v dials 0.0.4 v tune 0.0.1.9000
#> v infer 0.5.1 v workflows 0.1.0
#> v parsnip 0.0.5 v yardstick 0.0.5
#> v recipes 0.1.9
#> -- Conflicts ------------------------------------------------------------------ tidymodels_conflicts() --
#> x scales::discard() masks purrr::discard()
#> x dplyr::filter() masks stats::filter()
#> x recipes::fixed() masks stringr::fixed()
#> x dplyr::lag() masks stats::lag()
#> x dials::margin() masks ggplot2::margin()
#> x yardstick::spec() masks readr::spec()
#> x recipes::step() masks stats::step()
#> x recipes::yj_trans() masks scales::yj_trans()
set.seed(1234)
mtcars_tb <- mtcars %>%
as_tibble() %>%
mutate(vs = c(sample(vs, 22), rep(NA_integer_, 10)))
set.seed(1234)
cv_fold_mtc <- vfold_cv(mtcars_tb)
lasso_mod <-
linear_reg() %>%
set_engine("glmnet") %>%
set_args(penalty = tune(),
mixture = 1)
rec <- recipe(
mpg ~ disp + vs,
data = mtcars_tb
) %>%
step_naomit(everything()) %>%
step_dummy(all_nominal()) %>%
step_normalize(all_numeric())
tune_grid(
lasso_mod,
rec,
resamples = cv_fold_mtc,
control = tune::control_resamples(verbose = TRUE,
save_pred = TRUE)
)
#> i Fold01: recipe
#> v Fold01: recipe
#> i Fold01: model 1/1
#> v Fold01: model 1/1
#> i Fold01: model 1/1 (predictions)
#> x Fold01: model 1/1 (predictions): Error: Column `.row` must be length 2 (the numb...
#> i Fold02: recipe
#> v Fold02: recipe
#> i Fold02: model 1/1
#> v Fold02: model 1/1
#> i Fold02: model 1/1 (predictions)
#> i Fold03: recipe
#> v Fold03: recipe
#> i Fold03: model 1/1
#> v Fold03: model 1/1
#> i Fold03: model 1/1 (predictions)
#> x Fold03: model 1/1 (predictions): Error: Column `.row` must be length 1 (the numb...
#> i Fold04: recipe
#> v Fold04: recipe
#> i Fold04: model 1/1
#> v Fold04: model 1/1
#> i Fold04: model 1/1 (predictions)
#> x Fold04: model 1/1 (predictions): Error: Column `.row` must be length 2 (the numb...
#> i Fold05: recipe
#> v Fold05: recipe
#> i Fold05: model 1/1
#> v Fold05: model 1/1
#> i Fold05: model 1/1 (predictions)
#> x Fold05: model 1/1 (predictions): Error: Column `.row` must be length 2 (the numb...
#> i Fold06: recipe
#> v Fold06: recipe
#> i Fold06: model 1/1
#> v Fold06: model 1/1
#> i Fold06: model 1/1 (predictions)
#> i Fold07: recipe
#> v Fold07: recipe
#> i Fold07: model 1/1
#> v Fold07: model 1/1
#> i Fold07: model 1/1 (predictions)
#> i Fold08: recipe
#> v Fold08: recipe
#> i Fold08: model 1/1
#> v Fold08: model 1/1
#> i Fold08: model 1/1 (predictions)
#> x Fold08: model 1/1 (predictions): Error: Column `.row` must be length 2 (the numb...
#> i Fold09: recipe
#> v Fold09: recipe
#> i Fold09: model 1/1
#> v Fold09: model 1/1
#> i Fold09: model 1/1 (predictions)
#> x Fold09: model 1/1 (predictions): Error: Column `.row` must be length 2 (the numb...
#> i Fold10: recipe
#> v Fold10: recipe
#> i Fold10: model 1/1
#> v Fold10: model 1/1
#> i Fold10: model 1/1 (predictions)
#> x Fold10: model 1/1 (predictions): Error: Column `.row` must be length 1 (the numb...
#> # 10-fold cross-validation
#> # A tibble: 10 x 5
#> splits id .metrics .notes .predictions
#> * <list> <chr> <list> <list> <list>
#> 1 <split [28/4]> Fold01 <tibble [0 x 4]> <tibble [1 x 1]> <tibble [0 x 4]>
#> 2 <split [28/4]> Fold02 <tibble [20 x 4]> <tibble [0 x 1]> <tibble [40 x 4]>
#> 3 <split [29/3]> Fold03 <tibble [0 x 4]> <tibble [1 x 1]> <tibble [0 x 4]>
#> 4 <split [29/3]> Fold04 <tibble [0 x 4]> <tibble [1 x 1]> <tibble [0 x 4]>
#> 5 <split [29/3]> Fold05 <tibble [0 x 4]> <tibble [1 x 1]> <tibble [0 x 4]>
#> 6 <split [29/3]> Fold06 <tibble [20 x 4]> <tibble [0 x 1]> <tibble [30 x 4]>
#> 7 <split [29/3]> Fold07 <tibble [20 x 4]> <tibble [0 x 1]> <tibble [30 x 4]>
#> 8 <split [29/3]> Fold08 <tibble [0 x 4]> <tibble [1 x 1]> <tibble [0 x 4]>
#> 9 <split [29/3]> Fold09 <tibble [0 x 4]> <tibble [1 x 1]> <tibble [0 x 4]>
#> 10 <split [29/3]> Fold10 <tibble [0 x 4]> <tibble [1 x 1]> <tibble [0 x 4]>
When I check the juiced recipe, it looks like all missing values are gone, so I am not sure the issue.
recipe(
mpg ~ disp + vs,
data = mtcars_tb
) %>%
step_naomit(everything()) %>%
step_dummy(all_nominal()) %>%
step_normalize(all_numeric()) %>%
prep() %>%
juice() %>%
print(n = Inf)
#> # A tibble: 22 x 3
#> disp vs mpg
#> <dbl> <dbl> <dbl>
#> 1 -0.574 1.29 0.163
#> 2 -0.574 -0.739 0.163
#> 3 -0.986 1.29 0.452
#> 4 0.201 -0.739 0.228
#> 5 1.01 -0.739 -0.206
#> 6 -0.0600 -0.739 -0.302
#> 7 1.01 -0.739 -0.912
#> 8 -0.679 1.29 0.709
#> 9 -0.726 1.29 0.452
#> 10 -0.514 1.29 -0.125
#> 11 -0.514 -0.739 -0.350
#> 12 0.342 1.29 -0.575
#> 13 0.342 -0.739 -0.430
#> 14 0.342 -0.739 -0.767
#> 15 1.89 -0.739 -1.54
#> 16 1.80 -0.739 -1.54
#> 17 1.64 -0.739 -0.848
#> 18 -1.22 1.29 1.99
#> 19 -1.24 -0.739 1.67
#> 20 -1.28 1.29 2.23
#> 21 -0.890 -0.739 0.244
#> 22 0.676 -0.739 -0.719
Created on 2020-03-05 by the reprex package (v0.3.0)