How to improve a group_by() %>% mutate_at() workflow that includes a condition?

DavoWW · February 24, 2019, 7:19am

I have a workflow that involves using mutate_at() on a grouped tibble, where the mutate_at() contains an if() condition to generate NA values in multiple columns.
The condition tests whether a certain factor level in one column is present or not.
In this reprex the tibble 'df3' contains my desired output, but to get it to work I have had to use two steps.
It works with column indices (rather than names) but not completely - I still have to specify column 'dd' in the second step.
So, is it possible to do it in one step, and would using if_else() be any help?

Many thanks for any suggestions.

library(tidyverse)
set.seed(1984)
df <- tibble(bb=factor(rep(c(1,2,3,4), each=5)),
             cc=factor(c(rep(c("s1","s2","s3","s3","s3","s1","s3","s3","s4","s5"), times=2))),
             dd=c(seq(11,30,1)),
             ee=rnorm(n=20, mean=20, sd=4),
             ff=rnorm(n=20, mean=50, sd=5))
df
#> # A tibble: 20 x 5
#>    bb    cc       dd    ee    ff
#>    <fct> <fct> <dbl> <dbl> <dbl>
#>  1 1     s1       11  21.6  56.2
#>  2 1     s2       12  18.7  52.1
#>  3 1     s3       13  22.5  46.8
#>  4 1     s3       14  12.6  54.5
#>  5 1     s3       15  23.8  49.9
#>  6 2     s1       16  24.8  51.2
#>  7 2     s3       17  22.2  50.5
#>  8 2     s3       18  16.7  54.1
#>  9 2     s4       19  17.9  47.1
#> 10 2     s5       20  25.7  55.0
#> 11 3     s1       21  21.1  44.5
#> 12 3     s2       22  21.2  42.6
#> 13 3     s3       23  23.3  53.0
#> 14 3     s3       24  24.1  54.8
#> 15 3     s3       25  20.1  51.5
#> 16 4     s1       26  26.9  45.7
#> 17 4     s3       27  20.5  62.5
#> 18 4     s3       28  12.9  48.6
#> 19 4     s4       29  20.5  48.3
#> 20 4     s5       30  13.1  51.2
str(df)
#> Classes 'tbl_df', 'tbl' and 'data.frame':    20 obs. of  5 variables:
#>  $ bb: Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 2 2 2 2 2 ...
#>  $ cc: Factor w/ 5 levels "s1","s2","s3",..: 1 2 3 3 3 1 3 3 4 5 ...
#>  $ dd: num  11 12 13 14 15 16 17 18 19 20 ...
#>  $ ee: num  21.6 18.7 22.5 12.6 23.8 ...
#>  $ ff: num  56.2 52.1 46.8 54.5 49.9 ...

# mutate one column first
df %>%
  group_by(bb) %>%
  mutate_at(vars(dd),
            list(~if(length(cc[cc=="s2"]) < 1) {NA} else {.})) -> df2
df2
#> # A tibble: 20 x 5
#> # Groups:   bb [4]
#>    bb    cc       dd    ee    ff
#>    <fct> <fct> <dbl> <dbl> <dbl>
#>  1 1     s1       11  21.6  56.2
#>  2 1     s2       12  18.7  52.1
#>  3 1     s3       13  22.5  46.8
#>  4 1     s3       14  12.6  54.5
#>  5 1     s3       15  23.8  49.9
#>  6 2     s1       NA  24.8  51.2
#>  7 2     s3       NA  22.2  50.5
#>  8 2     s3       NA  16.7  54.1
#>  9 2     s4       NA  17.9  47.1
#> 10 2     s5       NA  25.7  55.0
#> 11 3     s1       21  21.1  44.5
#> 12 3     s2       22  21.2  42.6
#> 13 3     s3       23  23.3  53.0
#> 14 3     s3       24  24.1  54.8
#> 15 3     s3       25  20.1  51.5
#> 16 4     s1       NA  26.9  45.7
#> 17 4     s3       NA  20.5  62.5
#> 18 4     s3       NA  12.9  48.6
#> 19 4     s4       NA  20.5  48.3
#> 20 4     s5       NA  13.1  51.2

# mutate the other RHS columns
df2 %>%
  group_by(bb) %>%
  mutate_at(vars(ee,ff),
            list(~if(is.na(dd)[1]) {NA} else {.})) -> df3
df3
#> # A tibble: 20 x 5
#> # Groups:   bb [4]
#>    bb    cc       dd    ee    ff
#>    <fct> <fct> <dbl> <dbl> <dbl>
#>  1 1     s1       11  21.6  56.2
#>  2 1     s2       12  18.7  52.1
#>  3 1     s3       13  22.5  46.8
#>  4 1     s3       14  12.6  54.5
#>  5 1     s3       15  23.8  49.9
#>  6 2     s1       NA  NA    NA  
#>  7 2     s3       NA  NA    NA  
#>  8 2     s3       NA  NA    NA  
#>  9 2     s4       NA  NA    NA  
#> 10 2     s5       NA  NA    NA  
#> 11 3     s1       21  21.1  44.5
#> 12 3     s2       22  21.2  42.6
#> 13 3     s3       23  23.3  53.0
#> 14 3     s3       24  24.1  54.8
#> 15 3     s3       25  20.1  51.5
#> 16 4     s1       NA  NA    NA  
#> 17 4     s3       NA  NA    NA  
#> 18 4     s3       NA  NA    NA  
#> 19 4     s4       NA  NA    NA  
#> 20 4     s5       NA  NA    NA

# Does this work using column indices?
df %>%
  group_by(bb) %>%
  mutate_at(vars(3),
            list(~if(length(cc[cc=="s2"]) < 1) {NA} else {.})) -> df2
df2
#> # A tibble: 20 x 5
#> # Groups:   bb [4]
#>    bb    cc       dd    ee    ff
#>    <fct> <fct> <dbl> <dbl> <dbl>
#>  1 1     s1       11  21.6  56.2
#>  2 1     s2       12  18.7  52.1
#>  3 1     s3       13  22.5  46.8
#>  4 1     s3       14  12.6  54.5
#>  5 1     s3       15  23.8  49.9
#>  6 2     s1       NA  24.8  51.2
#>  7 2     s3       NA  22.2  50.5
#>  8 2     s3       NA  16.7  54.1
#>  9 2     s4       NA  17.9  47.1
#> 10 2     s5       NA  25.7  55.0
#> 11 3     s1       21  21.1  44.5
#> 12 3     s2       22  21.2  42.6
#> 13 3     s3       23  23.3  53.0
#> 14 3     s3       24  24.1  54.8
#> 15 3     s3       25  20.1  51.5
#> 16 4     s1       NA  26.9  45.7
#> 17 4     s3       NA  20.5  62.5
#> 18 4     s3       NA  12.9  48.6
#> 19 4     s4       NA  20.5  48.3
#> 20 4     s5       NA  13.1  51.2

# mutate the extra columns by index
df2 %>%
  group_by(bb) %>%
  mutate_at(vars(4,5),
            list(~if(is.na(dd)[1]) {NA} else {.})) -> df3

df3
#> # A tibble: 20 x 5
#> # Groups:   bb [4]
#>    bb    cc       dd    ee    ff
#>    <fct> <fct> <dbl> <dbl> <dbl>
#>  1 1     s1       11  21.6  56.2
#>  2 1     s2       12  18.7  52.1
#>  3 1     s3       13  22.5  46.8
#>  4 1     s3       14  12.6  54.5
#>  5 1     s3       15  23.8  49.9
#>  6 2     s1       NA  NA    NA  
#>  7 2     s3       NA  NA    NA  
#>  8 2     s3       NA  NA    NA  
#>  9 2     s4       NA  NA    NA  
#> 10 2     s5       NA  NA    NA  
#> 11 3     s1       21  21.1  44.5
#> 12 3     s2       22  21.2  42.6
#> 13 3     s3       23  23.3  53.0
#> 14 3     s3       24  24.1  54.8
#> 15 3     s3       25  20.1  51.5
#> 16 4     s1       NA  NA    NA  
#> 17 4     s3       NA  NA    NA  
#> 18 4     s3       NA  NA    NA  
#> 19 4     s4       NA  NA    NA  
#> 20 4     s5       NA  NA    NA

^{Created on 2019-02-24 by the reprex package (v0.2.1)}

cderv · February 24, 2019, 10:24am

Using mutate_at with a case_when can get you what you want I think

library(tidyverse)
#> Warning: le package 'tibble' a été compilé avec la version R 3.5.2
#> Warning: le package 'purrr' a été compilé avec la version R 3.5.2
#> Warning: le package 'stringr' a été compilé avec la version R 3.5.2
set.seed(1984)
df <- tibble(bb=factor(rep(c(1,2,3,4), each=5)),
             cc=factor(c(rep(c("s1","s2","s3","s3","s3","s1","s3","s3","s4","s5"), times=2))),
             dd=c(seq(11,30,1)),
             ee=rnorm(n=20, mean=20, sd=4),
             ff=rnorm(n=20, mean=50, sd=5))

df %>%
  group_by(bb) %>%
  mutate_at(vars(dd, ee, ff),
            ~ case_when(any(cc == "s2") ~ .))
#> # A tibble: 20 x 5
#> # Groups:   bb [4]
#>    bb    cc       dd    ee    ff
#>    <fct> <fct> <dbl> <dbl> <dbl>
#>  1 1     s1       11  21.6  56.2
#>  2 1     s2       12  18.7  52.1
#>  3 1     s3       13  22.5  46.8
#>  4 1     s3       14  12.6  54.5
#>  5 1     s3       15  23.8  49.9
#>  6 2     s1       NA  NA    NA  
#>  7 2     s3       NA  NA    NA  
#>  8 2     s3       NA  NA    NA  
#>  9 2     s4       NA  NA    NA  
#> 10 2     s5       NA  NA    NA  
#> 11 3     s1       21  21.1  44.5
#> 12 3     s2       22  21.2  42.6
#> 13 3     s3       23  23.3  53.0
#> 14 3     s3       24  24.1  54.8
#> 15 3     s3       25  20.1  51.5
#> 16 4     s1       NA  NA    NA  
#> 17 4     s3       NA  NA    NA  
#> 18 4     s3       NA  NA    NA  
#> 19 4     s4       NA  NA    NA  
#> 20 4     s5       NA  NA    NA

^{Created on 2019-02-24 by the reprex package (v0.2.1)}

You need last dplyr version for the mutate_at without funs I think.

DavoWW · February 26, 2019, 2:32am

Thanks for the solution Christophe - very nice!

It took me a while to work out that there is some "short-hand" in your solution, since this is not obvious (to me) in the ?case_when help.
The "full" code would be (using the latest version of dplyr):

df %>%
  group_by(bb) %>%
  mutate_at(vars(dd, ee, ff),
            list(~ case_when(
                    any(cc == "s2") ~ .,
                    FALSE ~ NA_real_)))

The reverse logic also works:

df %>%
  group_by(bb) %>%
  mutate_at(vars(c(3:5)),
            ~ case_when(
              !any(cc == "s2") ~ NA_real_,
              TRUE ~ .))

But this non-default approach will fail if the result columns are of mixed type. Its great that the defaults detect this and insert the appropriate NAs:

set.seed(1984)
df_2 <- tibble(bb=factor(rep(c(1,2,3,4), each=5)),
             cc=factor(c(rep(c("s1","s2","s3","s3","s3","s1","s3","s3","s4","s5"), times=2))),
             dd=c(seq(11,30,1)),
             ee=rnorm(n=20, mean=20, sd=4),
             ff=factor(rep(c("pqr", "xyz"), times=10)))   # Changed column type
df_2
str(df_2)

df_2 %>%
  group_by(bb) %>%
  mutate_at(vars(dd, ee, ff),
            ~ case_when(any(cc == "s2") ~ .))

Thanks again.

system · March 5, 2019, 2:32am

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.