Converting cut-and paste to a calling function

I have written interactive code that works, but I am struggling to write a function that can take the place of my copy-and-paste efforts.

The ingredients:
a data frame with annual numerator data.
Other columns contain demographic data.
I need to compute three year running totals for the numerator
My code is found in the function func_3()

I do not know what goes inside the parentheses in the following cases.

I do not know how to use parameters in this setting. Otherwise, I am OK.
group_by(what do I put in here?)
arrange(what do I put in here?)
mutate(what do I put in here?)
select(what do I put in here?)

library(purrr)
library(dplyr)
library(tidyr)
library(slider)  # for grouping consecutive years


# The sample data
set.seed(2021)

EVENT_YEAR = 2010:2015
RE = c('white', 'black', 'Asian')
City = c('Oakland', 'San Francisco', 'San Jose')
Note = 1:3

demoDF_N = expand.grid(EVENT_YEAR = EVENT_YEAR, RE = RE, City = City, Note = Note)
demoDF_N$Numerator = sample(3:10, 162, replace = TRUE)

Sample calls

# Function calls
res1 = func_3(demoDF_N, demoDF_D, EVENT_YEAR)
res2 = func_3(demoDF_N, demoDF_D, EVENT_YEAR, RE)
res3 = func_3(demoDF_N, demoDF_D, EVENT_YEAR, City)
res4 = func_3(demoDF_N, demoDF_D, EVENT_YEAR, City, RE)

This is the function that needs fixing

func_3 = function(df1, ... ){ 
  result_3_N = df1 %>%    # 3-year running average
    group_by(Note, ...) %>%               # Need help inside all (  )
    arrange(Note,..., EVENT_YEAR) %>%
    mutate(Numerator_UPDATED = slider::slide_dbl(Numerator, sum, .before = 1, .after = 1, .complete = TRUE)) %>%
    select(Note,..., EVENT_YEAR, Numerator_UPDATED) %>% ungroup()
  
  result_3_N = result_3_N %>% rename(Numerator = Numerator_UPDATED) # simple rename
  
  result_3_N = result_3_N %>% filter(!is.na(Numerator))  # filter out rows not based on 3 full years
  
  # Get EVENT_YEAR to display range of years, e.g., 2008-2010
  result_3_N$EVENT_YEAR = 
    paste(as.integer(as.character(result_3_N$EVENT_YEAR)) - 1, '-', 
          as.integer(as.character(result_3_N$EVENT_YEAR)) + 1, sep = '')
  
 result_3_N

}

A few questions for clarification:

  1. Can you show us what you expect the output to be from at least one of the function calls?

  2. Do you want the ... to represent the same data column(s) every time it appears inside the function?

  3. What is demoDF_D? It appears inside each function call, but it's undefined.

For now, here's one way to pass arguments into the function, on the assumption that we want to reference the exact same columns each time ... appears in your original function.

func1 = function(df1, groups){ 
  result_3_N = df1 %>%    
    group_by(Note, across({{groups}})) %>%               
    arrange(Note, across({{groups}}), EVENT_YEAR) %>%
    mutate(Numerator_UPDATED = slide_dbl(Numerator, sum, .before = 1, .after = 1, 
                                         .complete = TRUE)) %>%
    select(Note, {{groups}}, EVENT_YEAR, Numerator_UPDATED) %>% 
    ungroup()
  
  result_3_N
}

res1 = func1(demoDF_N, EVENT_YEAR)
res1
#> # A tibble: 162 x 3
#>     Note EVENT_YEAR Numerator_UPDATED
#>    <int>      <int>             <dbl>
#>  1     1       2010                NA
#>  2     1       2010                23
#>  3     1       2010                20
#>  4     1       2010                21
#>  5     1       2010                20
#>  6     1       2010                20
#>  7     1       2010                21
#>  8     1       2010                22
#>  9     1       2010                NA
#> 10     1       2011                NA
#> # … with 152 more rows

res2 = func1(demoDF_N, c(EVENT_YEAR, RE))
res2
#> # A tibble: 162 x 4
#>     Note EVENT_YEAR RE    Numerator_UPDATED
#>    <int>      <int> <fct>             <dbl>
#>  1     1       2010 white                NA
#>  2     1       2010 white                21
#>  3     1       2010 white                NA
#>  4     1       2010 black                NA
#>  5     1       2010 black                21
#>  6     1       2010 black                NA
#>  7     1       2010 Asian                NA
#>  8     1       2010 Asian                23
#>  9     1       2010 Asian                NA
#> 10     1       2011 white                NA
#> # … with 152 more rows

res3 = func1(demoDF_N, c(EVENT_YEAR, City))
res3 
#> # A tibble: 162 x 4
#>     Note EVENT_YEAR City          Numerator_UPDATED
#>    <int>      <int> <fct>                     <dbl>
#>  1     1       2010 Oakland                      NA
#>  2     1       2010 Oakland                      23
#>  3     1       2010 Oakland                      NA
#>  4     1       2010 San Francisco                NA
#>  5     1       2010 San Francisco                20
#>  6     1       2010 San Francisco                NA
#>  7     1       2010 San Jose                     NA
#>  8     1       2010 San Jose                     22
#>  9     1       2010 San Jose                     NA
#> 10     1       2011 Oakland                      NA
#> # … with 152 more rows

res4 = func1(demoDF_N, c(EVENT_YEAR, City, RE))
res4
#> # A tibble: 162 x 5
#>     Note EVENT_YEAR City          RE    Numerator_UPDATED
#>    <int>      <int> <fct>         <fct>             <dbl>
#>  1     1       2010 Oakland       white                NA
#>  2     1       2010 Oakland       black                NA
#>  3     1       2010 Oakland       Asian                NA
#>  4     1       2010 San Francisco white                NA
#>  5     1       2010 San Francisco black                NA
#>  6     1       2010 San Francisco Asian                NA
#>  7     1       2010 San Jose      white                NA
#>  8     1       2010 San Jose      black                NA
#>  9     1       2010 San Jose      Asian                NA
#> 10     1       2011 Oakland       white                NA
#> # … with 152 more rows

Thanks for asking for a clarification.

I have data (Numerator and Denominator) by year and by various grouping variables.

First I need to produce three-year running totals for the numerator and for the denominator based on various groupings. Once I have the three year totals. I need to join the two tables. Then I perform various computes with basic_compute. The function basic_compute has two parameters – numerator, denominator.

The issue is multiple groupings are possible. I want one function to handle all possible groupings and the possibility that the numerator and denominator data are either in the same data frame or in different data frames..

After looking at Chapter 17, of Advanced R, I came up with the following code with both the numerator data and the denominator in the same data frame (but that is definitely not always the case).
[17 Big picture | Advanced R](https://Advanced R)

The following code is a partial solution, but is not general enough.

library(purrr)
library(dplyr)
library(tidyr)
library(slider)  # for grouping consecutive years


# The sample data
set.seed(2021)

EVENT_YEAR = 2010:2015
RE = c('white', 'black', 'Asian')
City = c('Oakland', 'San Francisco', 'San Jose')
Note = 1:3

demoDF_N = expand.grid(EVENT_YEAR = EVENT_YEAR, RE = RE, City = City, Note = Note)
demoDF_N$Numerator = sample(3:10, 162, replace = TRUE)
demoDF_N$EVENT_YEAR = as.factor(demoDF_N$EVENT_YEAR)
demoDF_N$RE = as.factor(demoDF_N$RE)
demoDF_N$City = as.factor(demoDF_N$City)
demoDF_N$Note = as.factor(demoDF_N$Note)
class(demoDF_N$Note)

demoDF_D = expand.grid(EVENT_YEAR = EVENT_YEAR, RE = RE, City = City, Note = Note)
demoDF_N$Denominator = sample(90:120, 162, replace = TRUE) 



# Function calls
res1 = func_3(demoDF_N, demoDF_D, EVENT_YEAR)
res2 = func_3(demoDF_N, demoDF_D, EVENT_YEAR, RE)
res3 = func_3(demoDF_N, demoDF_D, EVENT_YEAR, City)
res4 = func_3(demoDF_N, demoDF_D, EVENT_YEAR, City, RE)

sum(demoDF$Numerator)


test_function = function(df1, grp1, grp2) {
  grp1 <- enexpr(grp1)
  grp2 <- enexpr(grp2)
  
  result_3_N = df1 %>% # 3-year running average
  group_by(!!grp1)   %>%
  arrange(!!grp1, !!grp2) %>%
  mutate(Numerator_UPDATED = slider::slide_dbl(Numerator, sum, .before = 1, .after = 1, .complete = TRUE)) %>%
  select(grp1, grp2, Numerator_UPDATED) %>% ungroup()
  
  result_3_N = result_3_N %>% rename(Numerator = Numerator_UPDATED) # simple rename
  
  result_3_N = result_3_N %>% filter(!is.na(Numerator))  # filter out rows not based on 3 full years
  
  # Get EVENT_YEAR to display range of years, e.g., 2008-2010
  result_3_N$EVENT_YEAR = 
    paste(as.integer(as.character(result_3_N$EVENT_YEAR)) - 1, '-', 
          as.integer(as.character(result_3_N$EVENT_YEAR)) + 1, sep = '')
  
  
  
  result_3_D = df1 %>% # 3-year running average
    group_by(!!grp1)   %>%
    arrange(!!grp1, !!grp2) %>%
    mutate(Denominator_UPDATED = slider::slide_dbl(Denominator, sum, .before = 1, .after = 1, .complete = TRUE)) %>%
    select(grp1, grp2, Denominator_UPDATED) %>% ungroup()
  
  result_3_D = result_3_D %>% rename(Denominator = Denominator_UPDATED) # simple rename
  
  result_3_D = result_3_D %>% filter(!is.na(Denominator))  # filter out rows not based on 3 full years
  
  # Get EVENT_YEAR to display range of years, e.g., 2008-2010
  result_3_D$EVENT_YEAR = 
    paste(as.integer(as.character(result_3_D$EVENT_YEAR)) - 1, '-', 
          as.integer(as.character(result_3_D$EVENT_YEAR)) + 1, sep = '')
  
  # Trying to join the results; but by = c(...)) NOT working
  result3 = result_3_N %>% right_join(result_3_D, by = c(as.character(grp1), as.character(grp2))) %>%
    replace_na(list(Denominator = 0)) %>% replace_na(list(Numerator = 0))
  
  result3[, c('LCL', 'UCL', 'Rate', 'std_error', 'RSE')] =
    basic_compute(result3$Numerator, result3$Denominator)
  
  result3
  
 }

results3 = test_function(demoDF_N, Note, EVENT_YEAR)
head(results3, n = 30)

Thanks for the suggestion. That is a good start. What happens if the numerator and denominator are in different data frames?

Say I have this code

demoDF_D = expand.grid(EVENT_YEAR = EVENT_YEAR, RE = RE, City = City, Note = Note)
demoDFDN$Denominator = sample(90:120, 162, replace = TRUE)

And then how do I write the by condition in the join? I have to join by the grouping variables.

I made some progress. I added in a second data frame in case the denominator is in a different data frame than the numerator. So the only remaining issue is getting the join condition (by = ) right.

# The sample data
set.seed(2021)

EVENT_YEAR = 2010:2015
RE = c('white', 'black', 'Asian')
City = c('Oakland', 'San Francisco', 'San Jose')
Note = 1:3

# data frame for numerator
demoDF_N = expand.grid(EVENT_YEAR = EVENT_YEAR, RE = RE, City = City, Note = Note)
demoDF_N$Numerator = sample(3:10, 162, replace = TRUE)
demoDF_N$EVENT_YEAR = as.factor(demoDF_N$EVENT_YEAR)
demoDF_N$RE = as.factor(demoDF_N$RE)
demoDF_N$City = as.factor(demoDF_N$City)
demoDF_N$Note = as.factor(demoDF_N$Note)


# data frame for denominator
demoDF_D = expand.grid(EVENT_YEAR = EVENT_YEAR, RE = RE, City = City, Note = Note)
demoDF_D$Denominator = sample(90:120, 162, replace = TRUE)
func1 = function(df1, df2, groups){ 
  result_3_N = df1 %>%    
    group_by(Note, across({{groups}})) %>%               
    arrange(Note, across({{groups}}), EVENT_YEAR) %>%
    mutate(Numerator_UPDATED = slide_dbl(Numerator, sum, .before = 1, .after = 1, 
                                         .complete = TRUE)) %>%
    select(Note, {{groups}}, EVENT_YEAR, Numerator_UPDATED) %>% 
    ungroup()
  result_3_N = result_3_N %>% rename(Numerator = Numerator_UPDATED) # simple rename
  
  result_3_N = result_3_N %>% filter(!is.na(Numerator))  # filter out rows not based on 3 full years
  
  # Get EVENT_YEAR to display range of years, e.g., 2008-2010
  result_3_N$EVENT_YEAR = 
    paste(as.integer(as.character(result_3_N$EVENT_YEAR)) - 1, '-', 
          as.integer(as.character(result_3_N$EVENT_YEAR)) + 1, sep = '')
  
  ###
  result_3_D = df2 %>%    
    group_by(Note, across({{groups}})) %>%               
    arrange(Note, across({{groups}}), EVENT_YEAR) %>%
    mutate(Denominator_UPDATED = slide_dbl(Denominator, sum, .before = 1, .after = 1, 
                                         .complete = TRUE)) %>%
    select(Note, {{groups}}, EVENT_YEAR, Denominator_UPDATED) %>% 
    ungroup()
  result_3_D = result_3_D %>% rename(Denominator = Denominator_UPDATED) # simple rename
  
  result_3_D = result_3_D %>% filter(!is.na(Denominator))  # filter out rows not based on 3 full years
  
  # Get EVENT_YEAR to display range of years, e.g., 2008-2010
  result_3_D$EVENT_YEAR = 
    paste(as.integer(as.character(result_3_D$EVENT_YEAR)) - 1, '-', 
          as.integer(as.character(result_3_D$EVENT_YEAR)) + 1, sep = '')
  
  # Trying to join the results; by = as.character(groups))  THIS IS WRONG!
  result3 = result_3_N %>% right_join(result_3_D, by = as.character(groups)) %>%
    replace_na(list(Denominator = 0)) %>% replace_na(list(Numerator = 0))
  
  
  result3
}

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.