Extracting variable names from a formula

I am looking to run different regressions on a data set, with the number of variables varying across the models, and return predictions based on a prediction data set. The problem is that I cannot figure out how to use the supplied formulas to generate the prediction data set. The following works, but it ignores which variables are actually in each model and therefore always uses every possible variable (`am`, `gear`, `cyl`) when building the prediction data frame.

suppressMessages(library(tidyverse))

# Candidate model specifications, ordered from the fewest to the most
# interacting factors. A literal `~` expression already evaluates to a
# formula object, so each entry is written directly as a formula.
reg_form_list <- list(
  mpg ~ factor(am),
  mpg ~ factor(am) * factor(gear),
  mpg ~ factor(am) * factor(gear) * factor(cyl)
)

# Fit `reg_form` to `df` with lm() and return predictions over a grid of
# am/gear/cyl values.
#
# Arguments:
#   df        Data frame containing the columns am, gear and cyl plus every
#             variable referenced by `reg_form`.
#   reg_form  Model formula passed to lm().
#
# Returns the prediction grid with a `predicted` column added. Grid rows whose
# am/gear/cyl combination does not occur in `df` are not predicted; they come
# back through the final right_join without a prediction.
#
# Limitation: the grid always uses am, gear and cyl, regardless of which
# variables `reg_form` actually contains.
reg_predict <- function(df, reg_form) {
  # Grid of combinations of am, gear and cyl, each labelled with a key such
  # as "0_3_4".
  predict_df <- expand(df, am, gear, cyl) %>%
    mutate(
      var_combinations = interaction(am, gear, cyl, sep = "_")
    )

  # Same key on the data itself; drop = TRUE discards unused factor levels.
  df <- df %>%
    mutate(
      var_combinations = interaction(am, gear, cyl, drop = TRUE, sep = "_")
    )

  m <- lm(reg_form, data = df)

  # Predict only for grid rows whose key also appears in the data. The
  # prediction vector is passed unnamed, so the rename() below locates its
  # column by matching "predict" in the column name.
  tibble(
    predict(m,
            subset(predict_df, var_combinations %in% df$var_combinations)),
    subset(predict_df, var_combinations %in% df$var_combinations)
  ) %>%
    rename(predicted = contains("predict")) %>%
    # Bring back every grid row, including those that were not predicted.
    right_join(predict_df, by = c("am", "gear", "cyl", "var_combinations"))
}

# Fit every candidate formula to mtcars and collect the prediction tables.
results <- map(
  reg_form_list,
  function(reg_form) reg_predict(mtcars, reg_form)
)
#> Warning in predict.lm(m, subset(predict_df, var_combinations %in%
#> df$var_combinations)): prediction from a rank-deficient fit may be misleading

#> Warning in predict.lm(m, subset(predict_df, var_combinations %in%
#> df$var_combinations)): prediction from a rank-deficient fit may be misleading

Created on 2020-05-26 by the reprex package (v0.3.0)

I thought that using `all.vars()` inside the function (as shown below) might work, but I cannot figure out how to manipulate the returned character vector so that it can be used both in `expand()` and in the `right_join()` at the end.

# Attempted generalisation (incomplete, does not work as written).
reg_predict <- function(df, reg_form) {
  # Names of all variables in the formula, minus the first one (taken here to
  # be the response).
  x_vars <- all.vars(reg_form)[-1]
  # NOTE: x_vars is a character vector of names, not the columns themselves,
  # so passing it bare to expand() does not select the columns it names.
  # This is the step the question is about.
  predict_df <- expand(df, x_vars) %>%
    mutate(
      # Still hard-coded to am/gear/cyl; ideally driven by x_vars as well.
      var_combinations = interaction(am, gear, cyl, sep = "_")
    )
# Remainder of the function was elided in the original post.
.
.
.
}

Any suggestions would be much appreciated. Also, if my basic approach of using a list to hold all the models can be improved, I would love to hear about that as well.

1 Like

I need to take more breaks (and drink more coffee :coffee:)! The problem turned out not to be that hard after all. The issue is mostly that I am just still not that comfortable with !!! and syms.

suppressMessages(library(tidyverse))

# Candidate model specifications, ordered from the fewest to the most
# interacting factors. A literal `~` expression already evaluates to a
# formula object, so each entry is written directly as a formula.
reg_form_list <- list(
  mpg ~ factor(am),
  mpg ~ factor(am) * factor(gear),
  mpg ~ factor(am) * factor(gear) * factor(cyl)
)

# Fit `reg_form` to `df` with lm() and return predictions for every
# combination of the model's predictor values.
#
# Arguments:
#   df        Data frame containing every variable used in `reg_form`.
#   reg_form  Model formula passed to lm(); must name at least one predictor.
#
# Returns a tibble with one row per combination of the predictor values found
# in `df`, holding `predicted`, the predictor columns and `var_combinations`
# (the combination key, e.g. "0_3_4"). Combinations that never occur in `df`
# are kept with `predicted = NA`.
reg_predict <- function(df, reg_form) {
  # Take the predictors from the right-hand side only. Dropping the first
  # element of all.vars(reg_form) misidentifies them whenever the response
  # involves zero or several variables (e.g. `mpg / wt ~ am`).
  x_vars <- all.vars(delete.response(terms(reg_form)))
  if (length(x_vars) == 0) {
    stop("`reg_form` must contain at least one predictor variable.",
         call. = FALSE)
  }

  # Full grid of predictor-value combinations, each labelled with a key.
  predict_df <- expand(df, !!!syms(x_vars)) %>%
    mutate(
      var_combinations = interaction(!!!syms(x_vars), sep = "_")
    )

  # Same key on the data; drop = TRUE keeps only combinations that occur.
  df <- df %>%
    mutate(
      var_combinations = interaction(!!!syms(x_vars), drop = TRUE, sep = "_")
    )

  m <- lm(reg_form, data = df)

  # Predict only for combinations observed in the data. The prediction column
  # is named explicitly rather than located afterwards by pattern matching,
  # which would break if a predictor name contained "predict".
  tibble(
    predicted = predict(m,
            subset(predict_df, var_combinations %in% df$var_combinations)),
    subset(predict_df, var_combinations %in% df$var_combinations)
  ) %>%
    # Restore the unobserved combinations; their `predicted` is NA.
    right_join(predict_df, by = c(x_vars, "var_combinations"))
}

# Fit every candidate formula to mtcars and print the prediction tables.
map(
  reg_form_list,
  function(reg_form) reg_predict(mtcars, reg_form)
)
#> Warning in predict.lm(m, subset(predict_df, var_combinations %in%
#> df$var_combinations)): prediction from a rank-deficient fit may be misleading

#> Warning in predict.lm(m, subset(predict_df, var_combinations %in%
#> df$var_combinations)): prediction from a rank-deficient fit may be misleading
#> [[1]]
#> # A tibble: 2 x 3
#>   predicted    am var_combinations
#>       <dbl> <dbl> <fct>           
#> 1      17.1     0 0               
#> 2      24.4     1 1               
#> 
#> [[2]]
#> # A tibble: 6 x 4
#>   predicted    am  gear var_combinations
#>       <dbl> <dbl> <dbl> <fct>           
#> 1      16.1     0     3 0_3             
#> 2      21.0     0     4 0_4             
#> 3      NA       0     5 0_5             
#> 4      NA       1     3 1_3             
#> 5      26.3     1     4 1_4             
#> 6      21.4     1     5 1_5             
#> 
#> [[3]]
#> # A tibble: 18 x 5
#>    predicted    am  gear   cyl var_combinations
#>        <dbl> <dbl> <dbl> <dbl> <fct>           
#>  1      21.5     0     3     4 0_3_4           
#>  2      19.8     0     3     6 0_3_6           
#>  3      15.0     0     3     8 0_3_8           
#>  4      23.6     0     4     4 0_4_4           
#>  5      18.5     0     4     6 0_4_6           
#>  6      NA       0     4     8 0_4_8           
#>  7      NA       0     5     4 0_5_4           
#>  8      NA       0     5     6 0_5_6           
#>  9      NA       0     5     8 0_5_8           
#> 10      NA       1     3     4 1_3_4           
#> 11      NA       1     3     6 1_3_6           
#> 12      NA       1     3     8 1_3_8           
#> 13      28.0     1     4     4 1_4_4           
#> 14      21.0     1     4     6 1_4_6           
#> 15      NA       1     4     8 1_4_8           
#> 16      28.2     1     5     4 1_5_4           
#> 17      19.7     1     5     6 1_5_6           
#> 18      15.4     1     5     8 1_5_8

Created on 2020-05-26 by the reprex package (v0.3.0)

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.