I figured out a nicer way to do this.
Create a registry like so:
#' Create a single-entry feature registry fragment
#'
#' @param step_fn Step function; captured unevaluated with `rlang::enexpr()`.
#' @param step_name String used both as the name of the registry entry and as
#'   the name of the captured `args` expression.
#' @param deps Stored unchanged in the entry (presumably the names of the
#'   inputs the feature depends on -- confirm against callers).
#' @param args Expression defining the feature; captured unevaluated with
#'   `rlang::enexpr()`.
#' @return A named list of length one. Its element is a list with the fields
#'   `step_fn`, `deps` and `args`, where `args` is a one-element list named
#'   `step_name`.
step_feature <- function(step_fn, step_name, deps, args) {
  entry <- list(
    step_fn = rlang::enexpr(step_fn),
    deps = deps,
    # The captured expression is named after the step so it can later be
    # spliced into a call as `<step_name> = <expression>`.
    args = stats::setNames(list(rlang::enexpr(args)), step_name)
  )
  stats::setNames(list(entry), step_name)
}
# Convenience constructor: step_feature() with `step_fn` pre-filled as
# recipes::step_mutate, so callers supply only step_name, deps and args.
step_mutate_feat <- purrr::partial(step_feature, step_fn = recipes::step_mutate)
# Registry of available features. Each constructor call returns a one-element
# named list; c() concatenates them, so every entry is keyed by its step name.
feature_registry <- c(
step_mutate_feat("hour", "created_at", lubridate::hour(created_at)),
# `....` below is a placeholder for further registry entries omitted from
# this snippet; replace it with real entries before running.
....
)
Then select only the features you need and use `purrr::reduce()` to fold them into a single nested call, with one step-function call per selected feature:
#' Build a reducer that wraps a recipe expression in one feature's step call
#'
#' @param feature_registry Named list of feature entries. Each entry is read
#'   for `step_fn` (the step function expression) and `args` (a list of
#'   expressions spliced in as further arguments to the step).
#' @return A function `function(recipe, feature)` suitable for
#'   `purrr::reduce()`. It returns the unevaluated call
#'   `step_fn(recipe, <args>)` for the entry named `feature`, and signals an
#'   error if `feature` is not present in the registry.
build_recipe_call_factory <- function(feature_registry) {
  # Force the promise now so the closure does not depend on later changes to
  # whatever the caller passed in.
  force(feature_registry)
  function(recipe, feature) {
    # Look the entry up in the registry handed to the factory, not in a
    # free variable that happens to exist in an enclosing environment.
    entry <- feature_registry[[feature]]
    if (is.null(entry)) {
      stop("Unknown feature: ", feature, call. = FALSE)
    }
    rlang::call2(entry$step_fn, recipe, !!!entry$args)
  }
}
# Fold the selected feature names into one nested, unevaluated call: starting
# from `unprepped_recipe_init`, each feature wraps the accumulated expression
# in its step-function call. The reducer is created from the registry here,
# since the factory itself is not a reducer.
recipe_call <- feature_and_deps_names_used %>%
  purrr::reduce(
    build_recipe_call_factory(feature_registry),
    .init = unprepped_recipe_init
  )
# Evaluate the assembled call to obtain the recipe with every selected step.
unprepped_recipe_full <- eval(recipe_call)
Finally, add a `step_rm` to remove the features you don't need:
# Append a step_rm to the full recipe that drops every variable named in
# `unneeded_recipe_vars`.
unprepped_recipe <- recipes::step_rm(
  unprepped_recipe_full,
  dplyr::all_of(unneeded_recipe_vars)
)
This way, a single pipeline can train models with different preprocessing (in the form of {recipes} steps).
I'm not sure yet whether there is any performance hit in production from having so many separate step_mutate calls instead of a single large one.
This is far from a reprex. I can make one if anyone shows interest.