I think I got it! I think technically the prep step could be almost entirely empty, since you can't (as far as I can tell) 'train' discrete wavelet transformation and then apply that training to an unseen dataset.
Part of my eventual confusion was about getting the correct column names - I was using step_pca as a reference (since it's sort of a similar concept) - in the prep step, those are accessible through
vars <- terms_select(terms = x$terms, info = info)
but in the bake step, both step_pca and the example tutorial get them from the trained object:
vars <- names(object$ref_dist)
I'm very much out on a thin limb of my knowledge here, so I could very well be doing everything in a roundabout way.
In any case, here's the code I got working - again, hopefully this is useful to anyone else like me working their way through this.
- The class creation:
library(tidyverse); library(tidymodels); library(wavelets);
# two helper functions - the first extracts the coefficients from a vector
# Extract DWT coefficients from a single numeric vector: every wavelet
# coefficient vector (W) plus the scaling coefficients (V) at the final
# decomposition level, flattened into one named numeric vector.
extract_dwt_vector <- function(x, filter) {
  decomposition <- dwt(x, filter = filter)
  unlist(c(decomposition@W, decomposition@V[[decomposition@level]]))
}
# the second tries to efficiently repeat the process over a dataframe
# Apply extract_dwt_vector() to every row of `df` and bind the results
# into a tibble with one row of named DWT coefficients per input row.
map_dwt_over_df <- function(df, filter) {
  # seq_len() is safe for a zero-row data frame, unlike 1:nrow(df),
  # which would yield c(1, 0) and index out of bounds.
  map_dfr(seq_len(nrow(df)), function(i) {
    coefs <- extract_dwt_vector(as.numeric(df[i, ]), filter = filter)
    # drop unnamed elements; map_dfr() needs names to build columns
    coefs[names(coefs) != ""]
  })
}
## Create the classes
## User-facing constructor: capture the column selectors lazily and
## register a new (untrained) dwt step on the recipe.
step_dwt <- function(recipe,
                     ...,
                     role = NA,
                     trained = FALSE,
                     ref_dist = NULL,
                     options = list(filter = "haar"),
                     skip = FALSE,
                     id = rand_id("dwt")) {
  ## `ellipse_check()` quotes the selectors in `...` (via rlang's
  ## `quos()`) without evaluating them, and errors if none were given.
  add_step(
    recipe,
    step_dwt_new(
      terms = ellipse_check(...),
      trained = trained,
      role = role,
      ref_dist = ref_dist,
      options = options,
      skip = skip,
      id = id
    )
  )
}
## Low-level constructor: bundle the step's fields into a `step` object
## with subclass "dwt", which is what dispatches prep.step_dwt() and
## bake.step_dwt().
step_dwt_new <- function(terms, role, trained, ref_dist, options, skip, id) {
  step(
    subclass = "dwt",
    terms = terms,
    role = role,
    trained = trained,
    ref_dist = ref_dist,
    options = options,
    skip = skip,
    id = id
  )
}
prep.step_dwt <- function(x, training, info = NULL, ...) {
  ## Resolve the lazy selectors against the training-set metadata.
  col_names <- terms_select(terms = x$terms, info = info)
  ## There is nothing to "train" for a DWT, so we only store the
  ## selected training columns; bake() recovers the column names via
  ## names(object$ref_dist). drop = FALSE guards the one-column case,
  ## where `training[, col_names]` would collapse to an unnamed vector
  ## and leave bake() with NULL names.
  step_dwt_new(
    terms = x$terms,
    trained = TRUE,  # mark the step as prepped
    role = x$role,
    ref_dist = training[, col_names, drop = FALSE],
    options = x$options,
    skip = x$skip,
    id = x$id
  )
}
bake.step_dwt <- function(object, new_data, ...) {
  ## Columns the step was prepped on (stored by prep.step_dwt()).
  vars <- names(object$ref_dist)
  ## Forward the stored options with do.call() instead of building a
  ## call via the unexported recipes:::mod_call_args() + expr()/eval()
  ## — same result, no dependence on recipes internals.
  dwt_cols <- do.call(
    map_dwt_over_df,
    c(list(df = new_data[, vars, drop = FALSE]), object$options)
  )
  new_data <- bind_cols(new_data, as_tibble(dwt_cols))
  ## Drop the original columns. Note `-vars` would not work here:
  ## vars holds column names, not positions.
  new_data <- new_data[, !(colnames(new_data) %in% vars), drop = FALSE]
  ## recipes convention: always return a tibble
  tibble::as_tibble(new_data)
}
And then a (silly) example of applying it:
## Simulate `n` AR time series of length `ts_length` for testing,
## seeding per series so the data set is reproducible. Returns a wide
## tibble: one row per series, one ts_<position> column per time step.
create_data <- function(n, ts_length, arima_terms = list(ar = .2)) {
  ## seq_len() is safe for n = 0, unlike 1:n
  series <- map(seq_len(n), ~ {
    set.seed(.x)  # per-series seed keeps each series reproducible
    tibble(
      id = .x,
      idx = seq_len(ts_length),
      ts = arima.sim(model = arima_terms, n = ts_length)
    )
  })
  ## Bind and pivot so each row is a series, each column a position.
  pivot_wider(
    bind_rows(series),
    id_cols = c(id),
    names_from = c(idx),
    values_from = c(ts),
    names_prefix = "ts_"
  )
}
## Build a test set: 100 simulated series, 100 points each.
df <- create_data(n = 100, ts_length = 100)

## Testing ----
## Basic prep/juice round trip through the new step.
recipe_check <- recipe(id ~ ., data = df) %>%
  step_dwt(ends_with("1"), ends_with("2"))

training_data <- recipe_check %>%
  prep() %>%
  juice()

model_test <- rand_forest(mode = "regression") %>%
  set_engine("ranger")

## A silly model (id is just an index), but enough to exercise the step.
fit(
  model_test,
  formula = id ~ .,
  data = training_data,
  control = control_parsnip(verbosity = TRUE)
)

## Bootstrap resampling through a workflow, to confirm the step
## survives prep/bake on fresh resamples.
bs_df <- bootstraps(df, times = 5)

wf <- workflow() %>%
  add_model(model_test) %>%
  add_recipe(recipe_check)

results <- fit_resamples(
  object = wf,
  resamples = bs_df,
  control = control_resamples(verbose = TRUE)
)