Many models with overlapping groups


#1

I’m working with a dataset with binary variables defining a number of different but overlapping groups (for example youth, female, certified), as well as arbitrary variables x and y. I’d like to run the same model for each group as well as a combined group (e.g. female AND youth).

While it’s certainly not hard to copy paste the same code for each group: lm(y ~ x, data = filter(df, female == 1)), etc for each group, I’m wondering if there’s a way to do this similar to the Many Models approach in R4DS?

Hopefully this reprex helps:

library(dplyr)
library(tidyr)
library(tibble)

# Make data
df <- data.frame(
  x = rnorm(5),
  y = rnorm(5),
  female = c(0, 0, 1, 1, 0),
  youth = c(1, 0, 1, 0, 1),
  certified = c(0, 0, 1, 0, 1)
) %>% as.tibble()
df
#> # A tibble: 5 x 5
#>            x          y female youth certified
#>        <dbl>      <dbl>  <dbl> <dbl>     <dbl>
#> 1  0.6404664  0.4630325      0     1         0
#> 2  1.1642970  0.2696488      0     0         0
#> 3  0.1881423  0.9081086      1     1         1
#> 4 -0.4378891 -1.7633724      1     0         0
#> 5  1.1185876  0.3726534      0     1         1

# nest
df.nested <- df %>%
  nest(-female, -youth, -certified)

# So I want to fit a model on each group (female, youth, certified), as well as
# an overlapping group (female AND youth).
# Can this be done without copy-pasting for each group?
female_model <- df.nested %>%
  filter(female == 1) %>%
  unnest() %>%
  lm(y ~ x, data = .)
youth_model <- df.nested %>%
  filter(youth == 1) %>%
  unnest() %>%
  lm(y ~ x, data = .)
certified_model <- df.nested %>%
  filter(certified == 1) %>%
  unnest() %>%
  lm(y ~ x, data = .)
femaleXyouth_model <- df.nested %>%
  filter(female == 1,
         youth == 1) %>%
  unnest() %>%
  lm(y ~ x, data = .)

# Ideally I'd end up with a tibble something like this, where model contains the
# lm object for each group
as_tibble(list(
  model = list(
    female_model,
    youth_model,
    certified_model,
    femaleXyouth_model),
  group = c("female", "youth", "certified", "femaleXyouth"),
  date = Sys.Date()))
#> # A tibble: 4 x 3
#>      model        group       date
#>     <list>        <chr>     <date>
#> 1 <S3: lm>       female 2017-10-03
#> 2 <S3: lm>        youth 2017-10-03
#> 3 <S3: lm>    certified 2017-10-03
#> 4 <S3: lm> femaleXyouth 2017-10-03

#2

Answering myself far too quickly, but I guess I could set up a function that is hard-coded to split data into my subgroups and returns a tibble with a data column for each group, and then map the model on to each group.

library(dplyr)
library(tidyr)
library(tibble)
library(purrr)

# Make data
df <- data.frame(
  x = rnorm(5),
  y = rnorm(5),
  female = c(0, 0, 1, 1, 0),
  youth = c(1, 0, 1, 0, 1),
  certified = c(0, 0, 1, 0, 1)
) %>% as.tibble()
df
#> # A tibble: 5 x 5
#>             x          y female youth certified
#>         <dbl>      <dbl>  <dbl> <dbl>     <dbl>
#> 1 -0.73287239  0.5971601      0     1         0
#> 2 -1.96281477 -0.3821349      0     0         0
#> 3 -0.06460562  2.5102180      1     1         1
#> 4  1.68343337 -1.8117090      1     0         0
#> 5  0.67020268 -0.9009155      0     1         1

# Helper function that creates a list where each item is a tibble 
# containing the desired group
test_split <- function(df) {
  as_tibble(list(
    data = list(
      female = filter(df, female == 1),
      youth = filter(df, youth == 1),
      certified = filter(df, certified == 1),
      femaleXyouth = filter(df, female == 1, youth == 1)),
    group = c("female", "youth", "certified", "femaleXyouth"),
    date = Sys.Date()))
}

# Helper function for modeling
test_model <- function(df) {
  lm(y ~ x, data = df)
}

# And go!
df %>%
  test_split() %>%
  mutate(model = map(data, test_model))
#> # A tibble: 4 x 4
#>               data        group       date    model
#>             <list>        <chr>     <date>   <list>
#> 1 <tibble [2 x 5]>       female 2017-10-03 <S3: lm>
#> 2 <tibble [3 x 5]>        youth 2017-10-03 <S3: lm>
#> 3 <tibble [2 x 5]>    certified 2017-10-03 <S3: lm>
#> 4 <tibble [1 x 5]> femaleXyouth 2017-10-03 <S3: lm>

I don’t love that this creates duplicate observations across list-column elements, but I think this is an otherwise pretty straightforward solution.


#3

So you don’t want all overlapping groups?


#4

Not necessarily, I just used femaleXyouth as a quick example.

Also, I may have used “overlapping” in two separate and unclear ways:

  1. An observation can be just female, both female and young, etc. In the case where an observation is female and young, I’d like them to appear in the female model, the youth model, and the femaleXyouth model.
  2. I want also want to create a model containing observations that are female OR young OR both.

#5

Just taking your example with test_split and test_model, we could wrote something shorter using iteration with purrr

library(dplyr, warn.conflicts = F)

# Make data
df <- data_frame(
  x = rnorm(5),
  y = rnorm(5),
  female = c(0, 0, 1, 1, 0),
  youth = c(1, 0, 1, 0, 1),
  certified = c(0, 0, 1, 0, 1)
)
df
#> # A tibble: 5 x 5
#>            x           y female youth certified
#>        <dbl>       <dbl>  <dbl> <dbl>     <dbl>
#> 1  0.7153600 -0.06177455      0     1         0
#> 2  1.5145334  0.12900287      0     0         0
#> 3 -0.6076690  1.11141032      1     1         1
#> 4  0.5183133  1.21030749      1     0         0
#> 5 -1.7340227  1.59981898      0     1         1

library(purrr)
lst(
  female_mod = . %>% filter(female == 1),
  youth_mod = . %>% filter(youth == 1),
  certified_mod  = . %>% filter(certified == 1),
  femaleXyouth = . %>% filter(female == 1, youth == 1)
) %>% 
  map_dfr(~ tidyr::nest(.x(df)), .id = "group") %>% 
  mutate(mod = map(data, ~ lm(y~x, data = .x)))
#> # A tibble: 4 x 3
#>           group             data      mod
#>           <chr>           <list>   <list>
#> 1    female_mod <tibble [2 x 5]> <S3: lm>
#> 2     youth_mod <tibble [3 x 5]> <S3: lm>
#> 3 certified_mod <tibble [2 x 5]> <S3: lm>
#> 4  femaleXyouth <tibble [1 x 5]> <S3: lm>
  1. create a list a function to filter df with
  2. iterate df through this list to get a list of subsetted df, that you nest before combining in a nested df
  3. apply a model on each data by group

It does not solve your problem I think but it is just another way of doing this.


#6

Thanks! I do really like this approach, and lst is new to me.


#7

@ben-e Is your problem solved, or are you still searching for a more general solution.

I thought that sometimes it is good to “physically” create the data, if it doesn’t become too big.
Things will be easier to test and errors become more obvious. Also code written on top of it might become easier (maybe some tidyr magic for the many models…).

So the idea is to create columns for each variable combination that can occur.
This can give quite a lot of headache, but if we just combine the names via utils::combn(),
it turns out that this is a cool example for non-standard evaluation:

  • create the expressions as a string (expression will be a combination of variable names and a binary numeric or logical operator like "+", “*”, “&” or "|")
  • create the names (since expressions might be bad names)
  • evaluate the expressions in the context of your data and assign it to a new object (and name it of course)

Maybe one can adjust the function to take arbitrary integer numbers/sequences for m, to calculate only combinations of specific orders.

#   ____________________________________________________________________________
#   Load libraries
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(purrr))
#   ____________________________________________________________________________
#   create test data
set.seed(123)
df_test <- tibble(a = sample(0:1,6,T),
                  b = sample(0:1,6,T),
                  c = sample(0:1,6,T),
                  d = sample(0:1,6,T))
#   ____________________________________________________________________________
#   Function combine_binary
#   data: data.frame with binary entries (logical or coercable to logical)
#   bin_op: string, binary operator
#   sep: string, used to combine names of binary operators

combine_binary <- function(data, bin_op, sep) {
##  ............................................................................
  #   create combinations
  comb <- seq_along(data) %>% 
    map(~ combn(names(data), m = .x)) %>%
    map(~ as_tibble(.x))
  #   create expressions to evaluate
  comb_expressions <- comb %>% 
    map(~ map_chr(.x, ~ paste(.x, collapse = bin_op))) %>% 
    flatten_chr()
  #   create names for expressions
  comb_names <- comb %>%
    map(~ map_chr(.x, ~ paste(.x, collapse = sep))) %>% 
    flatten_chr()
##  ............................................................................
  #   validate expressions, set names,assign to tibble
  #   and return
  map(comb_expressions,
                       ~ eval(parse(text = .x), envir = data)) %>%
    map_if(is_logical, as.integer) %>% 
    set_names(comb_names) %>% 
    as_tibble()
}
#   ____________________________________________________________________________

And test it:

combine_binary(df_test, bin_op = " & ", sep = "_and_")
#> # A tibble: 6 x 15
#>       a     b     c     d a_and_b a_and_c a_and_d b_and_c b_and_d c_and_d
#>   <int> <int> <int> <int>   <int>   <int>   <int>   <int>   <int>   <int>
#> 1     0     1     1     0       0       0       0       1       0       0
#> 2     1     1     1     1       1       1       1       1       1       1
#> 3     0     1     0     1       0       0       0       0       1       0
#> 4     1     0     1     1       0       1       1       0       0       1
#> 5     1     1     0     1       1       0       1       0       1       0
#> 6     0     0     0     1       0       0       0       0       0       0
#> # ... with 5 more variables: a_and_b_and_c <int>, a_and_b_and_d <int>,
#> #   a_and_c_and_d <int>, b_and_c_and_d <int>, a_and_b_and_c_and_d <int>
combine_binary(df_test, bin_op = " | ", sep = "_or_")
#> # A tibble: 6 x 15
#>       a     b     c     d a_or_b a_or_c a_or_d b_or_c b_or_d c_or_d
#>   <int> <int> <int> <int>  <int>  <int>  <int>  <int>  <int>  <int>
#> 1     0     1     1     0      1      1      0      1      1      1
#> 2     1     1     1     1      1      1      1      1      1      1
#> 3     0     1     0     1      1      0      1      1      1      1
#> 4     1     0     1     1      1      1      1      1      1      1
#> 5     1     1     0     1      1      1      1      1      1      1
#> 6     0     0     0     1      0      0      1      0      1      1
#> # ... with 5 more variables: a_or_b_or_c <int>, a_or_b_or_d <int>,
#> #   a_or_c_or_d <int>, b_or_c_or_d <int>, a_or_b_or_c_or_d <int>