Extending recipes:: to create a new step function using wavelets::dwt

Another tidymodels question that I suspect I'm again missing a point in the documentation:

I'm trying out implementing discrete wavelet transformation as a pca-like step that reduces/transforms a data frame where each row is a time series, into its DWT coefficients. Partially as an exercise in itself - I'm not even sure this is a particularly good idea! My new recipe step creates successfully, but when I try to run prep() I get the following:

Error: Not all functions are allowed in step function selectors (e.g. `UseMethod`). See ?selections.
Run `rlang::last_error()` to see where the error occurred. 

The backtrace looks like this:

<error/rlang_error>
Not all functions are allowed in step function selectors (e.g. `UseMethod`). See ?selections.
Backtrace:
     x
  1. \-recipe_check %>% prep()
  2.   +-base::withVisible(eval(quote(`_fseq`(`_lhs`)), env, env))
  3.   \-base::eval(quote(`_fseq`(`_lhs`)), env, env)
  4.     \-base::eval(quote(`_fseq`(`_lhs`)), env, env)
  5.       \-`_fseq`(`_lhs`)
  6.         \-magrittr::freduce(value, `_function_list`)
  7.           +-base::withVisible(function_list[[k]](value))
  8.           \-function_list[[k]](value)
  9.             +-recipes::prep(.)
 10.             \-recipes:::prep.recipe(.)
 11.               +-recipes::prep(x$steps[[i]], training = training, info = x$term_info)
 12.               \-global::prep.step_dwt(x$steps[[i]], training = training, info = x$term_info)
 13.                 \-recipes::terms_select(terms = x$terms, info = info)
 14.                   \-base::lapply(terms, element_check)
 15.              

I think I'm probably creating the prep object incorrectly, but not sure where. Any guidance back onto the garden path would be appreciated! Here's the full code:

library(tidyverse); library(tidymodels); library(wavelets);

# create an arbitrary set of ar = .2 time series, for testing.
n <- 10
ts_length <- 100
arima_terms <- list(ar = .2)
l <- map(1:n, ~ {
  set.seed(.x)
  return(tibble(id = .x, idx = 1:ts_length, ts = arima.sim(model = arima_terms, n = ts_length)))
})

# bind them together and pivot them so each row is a series, each column is a position
l_df <- bind_rows(l)
df <- pivot_wider(l_df, id_cols = c(id), names_from = c(idx), values_from = c(ts), names_prefix = "ts_")

# two helper functions - the first extracts the coefficients from a vector
extract_dwt_vector <- function(x, filter) {
  dwt_out <- dwt(x, filter = filter)
  dwt_vars <- unlist(c(dwt_out@W,dwt_out@V[[dwt_out@level]]))
  return(dwt_vars)
}
# the second tries to efficiently repeat the process over a dataframe
map_dwt_over_df <- function(df, filter) {
  map_dfr(1:nrow(df), ~ extract_dwt_vector(df[.x,] %>% as.numeric(), filter = filter) %>% {.[names(.) != ""]})
}

# Here we start creating the step!
step_dwt <- function(
  recipe,
  ...,
  role = NA,
  trained = FALSE,
  ref_dist = NULL,
  options = list(filter = "haar"),
  skip = FALSE,
  id = rand_id("dwt")
) {

  add_step(
    recipe,
     step_dwt_new(
       terms = terms,
       trained = trained,
       role = role,
       ref_dist = ref_dist,
       options = options,
       skip = skip,
       id = id
     )
  )
}

step_dwt_new <-
  function(terms, role, trained, ref_dist, options, skip, id) {
    step(
      subclass = "dwt",
      terms = terms,
      role = role,
      trained = trained,
      ref_dist = ref_dist,
      options = options,
      skip = skip,
      id = id
    )
  }

# I suspect here's where I'm getting unstuck - I don't know enough about
# rlang to really be sure I know what's happening here
# or that I'm implementing the function correctly

get_train_dwt <- function(x, args = NULL) {
  res <- rlang::exec("map_dwt_over_df", x = x, !!!args)
  return(res)
}

prep.step_dwt <- function(x, training, info = NULL, ...) {
  col_names <- terms_select(terms = x$terms, info = info)
  ref_dist <- purrr::map(training[, col_names],  get_train_dwt, args = x$options)
  step_percentile_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    ref_dist = ref_dist,
    options = x$options,
    skip = x$skip,
    id = x$id
  )
}
# very naive and probably bad step where we re-estimate the coefs from new data
bake.step_dwt <- function(object, new_data, ...) {
  new_data <- purrr::map(new_data[,object$terms],  get_train_dwt, args = object$options)
  as_tibble(new_data)
}
# this works fine
recipe_check <- recipe(id ~ ., data = df) %>%
  step_dwt(all_predictors())
# then this fails!
recipe_check %>%
  prep()

When terms is added to add_step(), it does not yet exist so it picks up stats::terms() from the environment (which leads to the bizarre message). Pre-define it before calling add_step():


# Here we start creating the step!
step_dwt <- function(
  recipe,
  ...,
  role = NA,
  trained = FALSE,
  ref_dist = NULL,
  options = list(filter = "haar"),
  skip = FALSE,
  id = rand_id("dwt")
) {
  
  # ADD THIS BIT
  terms <- ellipse_check(...) 

  add_step(
    recipe,
    step_dwt_new(
      terms = terms,
      trained = trained,
      role = role,
      ref_dist = ref_dist,
      options = options,
      skip = skip,
      id = id
    )
  )
}

Also, looking at the comments, this might be easier to use:

get_train_dwt <- function(x, args = NULL) {
  res <- map_dwt_over_df(x, !!!args)
  return(res)
}

However, there is still an issue since (I think) that map_dwt_over_df() expects a data frame but is being given a vector.

I think I got it! I think technically the prep step could be almost entirely empty, since you can't (as far as I can tell) 'train' discrete wavelet transformation and then apply that training to an unseen dataset.

Part of my eventual confusion was about getting the correct column names - I was using step_pca as a reference (since it's sort of a similar concept) - in the prep step, those are accessible through

vars <- terms_select(terms = x$terms, info = info)

but in the bake step, both step_pca and the example tutorial get them from the trained object:

vars <- names(object$ref_dist)

Very much out on the thin limb of my knowledge here, so, I could very well be doing everything in a round-about way.

In any case, here's the code I got to working - again, hopefully this is useful to anyone else like me working their way through this.

  1. The class creation:
library(tidyverse); library(tidymodels); library(wavelets);

# two helper functions - the first extracts the coefficients from a vector
extract_dwt_vector <- function(x, filter) {
  dwt_out <- dwt(x, filter = filter)
  dwt_vars <- unlist(c(dwt_out@W,dwt_out@V[[dwt_out@level]]))
  return(dwt_vars)
}
# the second tries to efficiently repeat the process over a dataframe
map_dwt_over_df <- function(df, filter) {
  map_dfr(1:nrow(df), ~ extract_dwt_vector(df[.x,] %>% as.numeric(), filter = filter) %>% {.[names(.) != ""]})
}


## Create the classes

step_dwt <- function(
  recipe,
  ...,
  role = NA,
  trained = FALSE,
  ref_dist = NULL,
  options = list(filter = "haar"),
  skip = FALSE,
  id = rand_id("dwt")
) {

  ## The variable selectors are not immediately evaluated by using
  ##  the `quos()` function in `rlang`. `ellipse_check()` captures
  ##  the values and also checks to make sure that they are not empty.
  terms <- ellipse_check(...)

  add_step(
    recipe,
    step_dwt_new(
      terms = terms,
      trained = trained,
      role = role,
      ref_dist = ref_dist,
      options = options,
      skip = skip,
      id = id
    )
  )
}

step_dwt_new <-
  function(terms, role, trained, ref_dist, options, skip, id) {
    step(
      subclass = "dwt",
      terms = terms,
      role = role,
      trained = trained,
      ref_dist = ref_dist,
      options = options,
      skip = skip,
      id = id
    )
  }


prep.step_dwt <- function(x, training, info = NULL, ...) {
  col_names <- terms_select(terms = x$terms, info = info)
  ## We actually only need the training set here
  ## Since there's nothing about the trained data that's useful
  ## you could probably even just return the variable names?
  ref_dist <- training[,col_names]

  ## Use the constructor function to return the updated object.
  ## Note that `trained` is now set to TRUE

  step_dwt_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    ref_dist = ref_dist,
    options = x$options,
    skip = x$skip,
    id = x$id
  )
}

bake.step_dwt <- function(object, new_data, ...) {
  ## I use expr(), mod_call_args and eval to evaluate map_dwt
  ## this probably is a little aroundabout?
  vars <- names(object$ref_dist)
  dwt_call <- expr(map_dwt_over_df(filter = NULL))
  dwt_call <- recipes:::mod_call_args(dwt_call, args = object$options)
  dwt_call$df <- expr(new_data[,vars])

  new_data_cols <- eval(dwt_call)
  new_data <- bind_cols(new_data, as_tibble(new_data_cols))
  ## get rid of the original columns
  ## -vars will not do this!
  new_data <-
    new_data[, !(colnames(new_data) %in% vars), drop = FALSE]

  ## Always convert to tibbles on the way out
  tibble::as_tibble(new_data)
}

And then a (silly) example of applying it:

create_data <- function(n, ts_length, arima_terms = list(ar = .2)) {

  # create an arbitrary set of ar = .2 time series, for testing.

  l <- map(1:n, ~ {
    set.seed(.x)
    return(tibble(id = .x, idx = 1:ts_length, ts = arima.sim(model = arima_terms, n = ts_length)))
  })

  # bind them together and pivot them so each row is a series, each column is a position
  l_df <- bind_rows(l)
  df <- pivot_wider(l_df, id_cols = c(id), names_from = c(idx), values_from = c(ts), names_prefix = "ts_")
  return(df)

}

df <- create_data(n = 100, ts_length = 100)

## Testing

# this works fine
recipe_check <- recipe(id ~ ., data = df) %>%
   step_dwt(ends_with("1"), ends_with("2"))

training_data <- recipe_check %>%
  prep() %>%
  juice()

model_test <- rand_forest(mode = "regression") %>%
  set_engine("ranger")

## a silly model, since id is an index, but this is just a test
fit(model_test, 
    formula = id ~ ., 
    data = training_data, 
    control = control_parsnip(verbosity = TRUE))

## now try bootstrapping, just to make sure
bs_df <- bootstraps(df, times = 5)

wf <- 
  workflow() %>%
  add_model(model_test) %>%
  add_recipe(recipe_check)

results <- 
  fit_resamples(object = wf, 
                resamples = bs_df, 
                control = control_resamples(verbose = TRUE))

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.