Multiple outputs from one line of dplyr::mutate?

Hi,

I have a question about the new version of dplyr. In the new version of dplyr::summarize(), one can create multiple columns at once. Is that possible through the new dplyr::mutate() too?

For example, lets say I had a table with sales by day:

library(dplyr, warn.conflicts = FALSE)

# this next line creates the table
df1 <- tibble::tribble(
  ~day, ~sales,
  1L,   260L,
  2L,   230L,
  3L,   110L,
  4L,   420L,
  5L,   420L,
  6L,   380L,
  7L,   360L,
  8L,   860L,
  9L,   300L,
  10L,   240L
)

# The next few lines show the table we created earlier
df1
#> # A tibble: 10 x 2
#>      day sales
#>    <int> <int>
#>  1     1   260
#>  2     2   230
#>  3     3   110
#>  4     4   420
#>  5     5   420
#>  6     6   380
#>  7     7   360
#>  8     8   860
#>  9     9   300
#> 10    10   240

Let's say I want to add columns for the difference in sales since yesterday, the day before, and the day before that.

I could do so with three lines in mutate().

df1 %>% mutate(
  # this line creates "lag1" 
  # the lag function looks 'n' rows above, so I run it 
  #   one time for each row
  sales_chg_d1 = sales - lag(sales, n = 1),
  sales_chg_d2 = sales - lag(sales, n = 2),
  sales_chg_d3 = sales - lag(sales, n = 3))
#> # A tibble: 10 x 5
#>      day sales sales_chg_d1 sales_chg_d2 sales_chg_d3
#>    <int> <int>        <int>        <int>        <int>
#>  1     1   260           NA           NA           NA
#>  2     2   230          -30           NA           NA
#>  3     3   110         -120         -150           NA
#>  4     4   420          310          190          160
#>  5     5   420            0          310          190
#>  6     6   380          -40          -40          270
#>  7     7   360          -20          -60          -60
#>  8     8   860          500          480          440
#>  9     9   300         -560          -60          -80
#> 10    10   240          -60         -620         -120

Created on 2020-07-07 by the reprex package (v0.3.0)

Is there any way to run this on three days at once, in one line of mutate? That way, I could run it easily for all 10 days, or something like that. I tried playing around with across(), but that doesn't seem to work.

Thank you!

I'm still on dplyr 0.85 so haven't tried to learn across yet.
Currently I would programmatically create the variables like so

library(tidyverse)

df1 <- tibble::tribble(
  ~day, ~sales,
  1L,   260L,
  2L,   230L,
  3L,   110L,
  4L,   420L,
  5L,   420L,
  6L,   380L,
  7L,   360L,
  8L,   860L,
  9L,   300L,
  10L,   240L
)



new_sales_3 <- map_dfc(1:3,
    ~mutate(df1,
               !!sym(paste0("sales_",.)) := sales-dplyr::lag(sales,n=.)) %>% 
               select(-day,-sales)
    )

df2 <- bind_cols(df1,
                 new_sales_3)

# A tibble: 10 x 5
# day sales sales_1 sales_2 sales_3
# <int> <int>   <int>   <int>   <int>
# 1     1   260      NA      NA      NA
# 2     2   230     -30      NA      NA
# 3     3   110    -120    -150      NA
# 4     4   420     310     190     160
# 5     5   420       0     310     190
# 6     6   380     -40     -40     270
# 7     7   360     -20     -60     -60
# 8     8   860     500     480     440
# 9     9   300    -560     -60     -80
# 10    10   240     -60    -620    -120
1 Like

EDIT: Fixed a bug in my code.

Here's how I'd do it using across().

library(dplyr, warn.conflicts = FALSE)

df1 <- tibble::tribble(
  ~day, ~sales,
  1L, 260L,
  2L, 230L,
  3L, 110L,
  4L, 420L,
  5L, 420L,
  6L, 380L,
  7L, 360L,
  8L, 860L,
  9L, 300L,
  10L, 240L
)

fns_list <- list(
  chg_d1 = ~ .x - lag(.x, 1L),
  chg_d2 = ~ .x - lag(.x, 2L),
  chg_d3 = ~ .x - lag(.x, 3L)
)

mutate(df1, across(.cols = sales, .fns = fns_list, .names = "{col}_{fn}"))
#> # A tibble: 10 x 5
#>      day sales sales_chg_d1 sales_chg_d2 sales_chg_d3
#>    <int> <int>        <int>        <int>        <int>
#>  1     1   260           NA           NA           NA
#>  2     2   230          -30           NA           NA
#>  3     3   110         -120         -150           NA
#>  4     4   420          310          190          160
#>  5     5   420            0          310          190
#>  6     6   380          -40          -40          270
#>  7     7   360          -20          -60          -60
#>  8     8   860          500          480          440
#>  9     9   300         -560          -60          -80
#> 10    10   240          -60         -620         -120

Created on 2020-07-07 by the reprex package (v0.3.0)

My approach isn't as scalable as the one proposed by nirgrahamuk since you need to pre-define the list of functions to apply but it's certainly elegant.

It could be made scalable by writing a function to generate the contents of fns_list on-the-fly but I haven't attempted it.

1 Like

Hi @AJF,

Take a look at this:

# Load libraries ----------------------------------------------------------
library("tidyverse")


# Define example data -----------------------------------------------------
df1 <- tribble(
  ~day, ~sales,
  1,     260,
  2,     230,
  3,     110,
  4,     420,
  5,     420,
  6,     380,
  7,     360,
  8,     860,
  9,     300,
  10,    240
)


# Wrangle data ------------------------------------------------------------
df1 <- sapply(1:nrow(df1), function(i){
  df1 <<- mutate(df1, !!str_c("sales_chg_d", i) := sales - lag(sales, n = i))
})[[nrow(df1)]]

Yielding:

> df1
# A tibble: 10 x 12
     day sales sales_chg_d1 sales_chg_d2 sales_chg_d3 sales_chg_d4 sales_chg_d5 sales_chg_d6
   <dbl> <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
 1     1   260           NA           NA           NA           NA           NA           NA
 2     2   230          -30           NA           NA           NA           NA           NA
 3     3   110         -120         -150           NA           NA           NA           NA
 4     4   420          310          190          160           NA           NA           NA
 5     5   420            0          310          190          160           NA           NA
 6     6   380          -40          -40          270          150          120           NA
 7     7   360          -20          -60          -60          250          130          100
 8     8   860          500          480          440          440          750          630
 9     9   300         -560          -60          -80         -120         -120          190
10    10   240          -60         -620         -120         -140         -180         -180
# … with 4 more variables: sales_chg_d7 <dbl>, sales_chg_d8 <dbl>, sales_chg_d9 <dbl>,
#   sales_chg_d10 <dbl>

It works, but I'm not completely happy with using sapply, perhaps someone has a more tidy solution?

Hope it helps :slightly_smiling_face:

1 Like

Dear Advanced R Friends,

Would it be possible for you to add some more comments (for intermediate-beginners like me) in order to better understand what are you doing ?
This would be helpful to learn something new instead of staring at your magnificent code and trying figure it out by myself, please.
best.

2 Likes

@Andrzej across() is a new function introduced in dplyr 1.0.0. I'd recommend reading the colwise vignette as my solution is just a straightforward application of the function's features.

Good point! The weird looking sapply at the end of my suggestion is equivalent to:

for( i in 1:nrow(df1) ){
  df1 <- mutate(df1, !!str_c("sales_chg_d", i) := sales - lag(sales, n = i))
}

Which I probably should have used, since it clearer and more concise even if using a for-loop :+1:

I think this would do it

fns_list_gen <- function(parm){
  map(1:parm,
     ~  as.formula(paste0("chg_d",.x," ~ .x - lag(.x, ",.x,"L)"))
  )
}

Close but not quite. That generates a two-sided formula. chg_d1, chg_d2 etc. are the list element names, not the LHS. But your attempt encouraged me to not be lazy and actually give it a shot. :wink:

library(dplyr, warn.conflicts = FALSE)
library(purrr)

fns_list_gen <- function(n_lags) {
  fns_list <- map(1:n_lags, ~ as.formula(paste0(" ~ .x - lag(.x, ", .x, "L)")))
  set_names(fns_list, nm = paste0("chg_d", 1:n_lags))
}

fns_list <- fns_list_gen(n_lags = 4)

df1 <- tibble::tribble(
  ~day, ~sales,
  1L, 260L,
  2L, 230L,
  3L, 110L,
  4L, 420L,
  5L, 420L,
  6L, 380L,
  7L, 360L,
  8L, 860L,
  9L, 300L,
  10L, 240L
)

mutate(df1, across(.cols = sales, .fns = fns_list, .names = "{col}_{fn}"))
#> # A tibble: 10 x 6
#>      day sales sales_chg_d1 sales_chg_d2 sales_chg_d3 sales_chg_d4
#>    <int> <int>        <int>        <int>        <int>        <int>
#>  1     1   260           NA           NA           NA           NA
#>  2     2   230          -30           NA           NA           NA
#>  3     3   110         -120         -150           NA           NA
#>  4     4   420          310          190          160           NA
#>  5     5   420            0          310          190          160
#>  6     6   380          -40          -40          270          150
#>  7     7   360          -20          -60          -60          250
#>  8     8   860          500          480          440          440
#>  9     9   300         -560          -60          -80         -120
#> 10    10   240          -60         -620         -120         -140

Created on 2020-07-08 by the reprex package (v0.3.0)

2 Likes

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.