I had to tweak the recipe a bit:
library(caret)
#> Loading required package: lattice
#> Loading required package: ggplot2
library(recipes)
#> Loading required package: dplyr
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
#>
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stats':
#>
#> step
# Seed the RNG so the data partition below is reproducible
set.seed(2624)
percent <- 0.80
# Convert the outcome `am` to a factor so this is treated as classification
mtcars <-
mtcars %>%
mutate(am = as.factor(am))
# 80/20 train/test split, stratified on the outcome `am`
in_train <- createDataPartition(mtcars$am, p = percent, list = FALSE)
train_data <- mtcars[in_train,]
test_data <- mtcars[-in_train,]
# Preprocessing recipe: center/scale all numerics, then PCA on the predictors.
# Steps that would select nothing on this data are left commented out.
log_recipe <-
recipe(formula = am ~ ., data = train_data) %>%
# Nothing selected here and fails
# step_other(all_nominal(), -all_outcomes(), threshold = 0.02, other = "other_assigned ") %>%
step_center(all_numeric()) %>%
step_scale(all_numeric()) %>%
# num_comp is set to the number of training rows, i.e. as many components
# as the row count allows
step_pca(all_numeric(), -all_outcomes(), num_comp = nrow(train_data)) %>%
# No variables to make dummies
# step_dummy(all_nominal(), -all_outcomes()) %>%
# step_nzv(all_predictors()) %>%
# step_pca() ensures that step_corr() won't select anything
# step_corr(all_numeric()) %>%
# step_lincomb(all_numeric()) %>%
step_naomit(all_predictors()) %>%
# Do the conversion here (skip means it won't fail when predicting)
step_mutate(am = as.factor(am), skip = TRUE)
# Estimate the recipe on the training data and extract the processed
# training set (preprocessing happens once, outside of any resampling)
train_prepped <-
log_recipe %>%
prep(train_data) %>%
juice()
# Re-seed so the resampling indices inside rfe() are reproducible
set.seed(2624)
# Start from caret's built-in logistic-regression RFE helpers and swap the
# summary so ROC/Sens/Spec are reported (see the 'Accuracy' warning below)
log.glmRFE <- lrFuncs
log.glmRFE$summary <- twoClassSummary
# Outer resampling for RFE: 10-fold CV repeated 5 times
log_ctrl <- rfeControl(functions = log.glmRFE,
method = "repeatedcv",
number = 10,
repeats = 5,
saveDetails = TRUE,
verbose= FALSE)
# x/y interface: RFE runs on the already-preprocessed (juiced) data, so the
# recipe is NOT re-estimated within each resample
log_model <-
rfe(x = train_prepped %>% dplyr::select(-am),
y = train_prepped$am,
sizes = 1:10,
rfeControl = log_ctrl)
#> Warning in rfe.default(x = train_prepped %>% dplyr::select(-am), y =
#> train_prepped$am, : Metric 'Accuracy' is not created by the summary
#> function; 'ROC' will be used instead
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# <snip>
log_model
#>
#> Recursive feature selection
#>
#> Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
#>
#> Resampling performance over subset size:
#>
#> Variables ROC Sens Spec ROCSD SensSD SpecSD Selected
#> 1 0.8150 0.74 0.50 0.33047 0.3676 0.49487
#> 2 0.9650 0.97 0.94 0.17504 0.1568 0.23990
#> 3 0.9650 0.97 0.96 0.17504 0.1568 0.19795
#> 4 0.9650 0.92 0.96 0.17504 0.2548 0.19795
#> 5 0.9650 0.90 0.97 0.15980 0.2673 0.15682
#> 6 0.9625 0.87 0.97 0.15205 0.2998 0.15682
#> 7 0.9900 0.90 0.99 0.07071 0.2673 0.07071 *
#> 8 0.9800 0.84 0.98 0.11112 0.3264 0.14142
#> 9 0.9700 0.79 0.97 0.14846 0.3655 0.15682
#> 10 0.9700 0.79 0.97 0.14846 0.3655 0.15682
#>
#> The top 5 variables (out of 7):
#> PC02, PC01, PC03, PC10, PC07
Created on 2019-02-28 by the reprex package (v0.2.1)
If you want to be adventurous, though, there is a branch of caret that has recipe integration with the feature selection routines. I'll probably release it to CRAN at the end of March.
It's a little tricky with steps that select/filter variables. The recipe would be remade within each resample (as it should be), but you might not end up with the same set of variables as in other resamples.
The new code would be:
library(caret)
#> Loading required package: lattice
#> Loading required package: ggplot2
library(recipes)
#> Loading required package: dplyr
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
#>
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stats':
#>
#> step
# Same setup as above: seed, outcome-to-factor conversion, stratified split
set.seed(2624)
percent <- 0.80
mtcars <-
mtcars %>%
mutate(am = as.factor(am))
in_train <- createDataPartition(mtcars$am, p = percent, list = FALSE)
train_data <- mtcars[in_train,]
test_data <- mtcars[-in_train,]
# Same recipe as above; note there is no prep()/juice() step this time —
# the unprepped recipe is handed straight to rfe() below
log_recipe <-
recipe(formula = am ~ ., data = train_data) %>%
# Nothing selected here and fails
# step_other(all_nominal(), -all_outcomes(), threshold = 0.02, other = "other_assigned ") %>%
step_center(all_numeric()) %>%
step_scale(all_numeric()) %>%
step_pca(all_numeric(), -all_outcomes(), num_comp = nrow(train_data)) %>%
# No variables to make dummies
# step_dummy(all_nominal(), -all_outcomes()) %>%
# step_nzv(all_predictors()) %>%
# step_pca() ensures that step_corr() won't select anything
# step_corr(all_numeric()) %>%
# step_lincomb(all_numeric()) %>%
step_naomit(all_predictors()) %>%
# Do the conversion here (skip means it won't fail when predicting)
step_mutate(am = as.factor(am), skip = TRUE)
set.seed(2624)
log.glmRFE <- lrFuncs
log.glmRFE$summary <- twoClassSummary
log_ctrl <- rfeControl(functions = log.glmRFE,
method = "repeatedcv",
number = 10,
repeats = 5,
saveDetails = TRUE,
verbose= FALSE)
# Recipe interface (rfe.recipe, per the warning below): the recipe is
# re-estimated within each resample, avoiding preprocessing leakage
log_model <-
rfe(log_recipe,
data = train_data,
sizes = 1:10,
rfeControl = log_ctrl)
#> Warning in rfe.recipe(log_recipe, data = train_data, sizes = 1:10,
#> rfeControl = log_ctrl): Metric 'Accuracy' is not created by the summary
#> function; 'ROC' will be used instead
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# <snip>
log_model
#>
#> Recursive feature selection
#>
#> Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
#>
#> Resampling performance over subset size:
#>
#> Variables ROC Sens Spec ROCSD SensSD SpecSD Num_Resamples Selected
#> 1 0.8250 0.74 0.57 0.3283 0.36756 0.4950 50
#> 2 0.9800 0.99 0.92 0.1414 0.07071 0.2740 50
#> 3 1.0000 0.97 1.00 0.0000 0.15682 0.0000 50 *
#> 4 0.9650 0.89 1.00 0.1598 0.27274 0.0000 50
#> 5 0.9575 0.93 0.95 0.1779 0.22610 0.2082 50
#> 6 0.9750 0.86 0.97 0.1263 0.30372 0.1568 50
#> 7 0.9650 0.88 0.97 0.1750 0.25873 0.1568 50
#> 8 0.9650 0.83 0.97 0.1750 0.32904 0.1568 50
#> 9 0.9500 0.79 0.97 0.2020 0.36547 0.1568 50
#> 10 0.9700 0.79 0.97 0.1485 0.36547 0.1568 50
#>
#> The top 3 variables (out of 3):
#> PC02, PC01, PC03
Created on 2019-02-28 by the reprex package (v0.2.1)