Elegant solution for cross-column summary function?

Do you know of an elegant solution to the case, where you have a numeric subset of columns in your data and then you want a new column, which holds the column name of the max of the aforementioned subset?

Example:

# Library
library("tidyverse")

# Make some data
set.seed(247350)
d <- tibble(id = str_c("id_", 1:20),
            v1 = rnorm(20),
            v2 = rnorm(20),
            v3 = rnorm(20),
            v4 = rnorm(20),
            v5 = rnorm(20))

# Non-elegant solution 1 - rowwise
d %>%
  select(v1:v5) %>%
  rowwise %>%
  mutate(is_max = colnames(d)[which.max(c(v1, v2, v3, v4, v5))+1]) %>% 
  ungroup

# Non-elegant solution 2 - pivotting
d %>%
  pivot_longer(cols = -id, names_to = "variable", values_to = "value") %>%
  group_by(id) %>%
  mutate(is_max = variable[which.max(value)]) %>% 
  ungroup %>% 
  pivot_wider(names_from = variable, values_from = value)

I"m not sure what you mean by elegant, but I liked this solution:

library(tidyverse)
d <- tibble(id = str_c("id_", 1:20),
            v1 = rnorm(20),
            v2 = rnorm(20),
            v3 = rnorm(20),
            v4 = rnorm(20),
            v5 = rnorm(20))

d %>% 
  pivot_longer(-1) %>% 
  group_by(id) %>% 
  filter(value == max(value)) %>% 
  inner_join(d)
#> Joining, by = "id"
#> # A tibble: 20 x 8
#> # Groups:   id [20]
#>    id    name   value      v1      v2      v3      v4      v5
#>    <chr> <chr>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
#>  1 id_1  v3     1.09  -1.32   -0.324   1.09    0.647  -0.365 
#>  2 id_2  v5     0.705 -2.11   -2.59   -1.48    0.475   0.705 
#>  3 id_3  v4     0.821  0.649  -2.19   -0.715   0.821  -0.703 
#>  4 id_4  v2    -0.560 -2.05   -0.560  -1.20   -1.20   -0.690 
#>  5 id_5  v5     0.841 -0.0216 -0.696  -0.978   0.455   0.841 
#>  6 id_6  v4     1.54  -0.371   0.146   0.711   1.54   -0.339 
#>  7 id_7  v1     2.39   2.39    0.773   1.01   -0.305  -0.569 
#>  8 id_8  v1     1.42   1.42   -2.15    1.05   -0.108   0.617 
#>  9 id_9  v2    -0.138 -0.835  -0.138  -0.438  -0.831  -0.200 
#> 10 id_10 v3     1.84  -0.617   0.264   1.84    0.0989  0.993 
#> 11 id_11 v2     2.21   0.431   2.21    1.01    0.739  -0.259 
#> 12 id_12 v3     1.73  -1.44    0.358   1.73   -0.958  -1.89  
#> 13 id_13 v5     1.04   0.630  -0.502  -0.0405 -0.360   1.04  
#> 14 id_14 v1     0.325  0.325  -0.994  -2.66   -0.207  -0.817 
#> 15 id_15 v1     1.90   1.90   -0.331   0.422   1.19   -0.366 
#> 16 id_16 v2     1.88   0.647   1.88    0.798   1.41   -0.165 
#> 17 id_17 v4     0.651 -0.987  -0.0923 -0.988   0.651   0.0388
#> 18 id_18 v4     0.731  0.616  -0.348   0.374   0.731   0.481 
#> 19 id_19 v1     1.56   1.56    0.182   0.0765 -0.715  -0.497 
#> 20 id_20 v1     1.52   1.52   -0.172  -0.845  -1.10   -0.144

Created on 2020-03-06 by the reprex package (v0.3.0)

2 Likes

How about this:

d %>% 
  mutate(is_max = apply(.[-1], 1, function(x) names(x)[which.max(x)]))
Click to see output
  id         v1     v2      v3      v4      v5 is_max
   <chr>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl> <chr> 
 1 id_1   0.0151 -1.10  -1.28   -0.378  -1.22   v1    
 2 id_2   1.31   -0.586 -1.49   -1.31   -0.687  v1    
 3 id_3   1.74    0.106  2.03   -1.06    2.44   v5    
 4 id_4   0.449  -1.39  -0.175  -1.02    0.214  v1    
 5 id_5  -0.0120 -2.83   1.32    0.723  -1.10   v3    
 6 id_6  -0.472   0.435  1.93    0.0546  0.176  v3    
 7 id_7   1.85   -0.860  0.492  -0.493  -0.465  v1    
 8 id_8  -0.979   1.15  -0.0938  0.596   2.39   v5    
 9 id_9   0.354   0.373 -0.388   0.728  -0.218  v4    
10 id_10  0.0955 -1.70  -0.208   0.866  -0.0983 v4    
11 id_11  0.518   0.175  0.685   0.973  -0.265  v4    
12 id_12 -1.60    0.200 -0.347  -0.670  -1.91   v2    
13 id_13  1.22    1.11   0.302   0.0744  1.12   v1    
14 id_14  2.28    1.15   0.353   0.352   0.431  v1    
15 id_15 -0.609   1.02   0.611   0.0351  0.846  v2    
16 id_16  0.336   0.543 -0.333  -0.386   0.769  v5    
17 id_17  0.971  -0.923  0.973   0.960   0.439  v3    
18 id_18 -1.27    0.554  1.11    0.702   0.198  v3    
19 id_19 -1.25    0.642 -0.0938 -1.59    0.643  v5    
20 id_20  1.55    0.542  1.36   -0.398   1.42   v1 

Inspired by @joels's solution to try to translate it into a purrr version :slight_smile: :

library(tidyverse)
d <- tibble(id = str_c("id_", 1:20),
            v1 = rnorm(20),
            v2 = rnorm(20),
            v3 = rnorm(20),
            v4 = rnorm(20),
            v5 = rnorm(20))

d %>% 
  mutate(
    max_col = 
      d %>% select(-1) %>% 
      pmap(c) %>% map(which.max) %>% map_chr(names)) 

Created on 2020-03-06 by the reprex package (v0.3.0)

Output
#> # A tibble: 20 x 7
#>    id         v1     v2      v3      v4        v5 max_col
#>    <chr>   <dbl>  <dbl>   <dbl>   <dbl>     <dbl> <chr>  
#>  1 id_1  -0.261  -0.905 -0.391  -0.742  -0.000760 v5     
#>  2 id_2  -1.12   -0.640  1.18   -1.78   -1.94     v3     
#>  3 id_3   0.444  -0.349  0.454  -0.424  -0.750    v3     
#>  4 id_4   1.47   -0.307 -0.631   1.27    0.496    v1     
#>  5 id_5  -1.38   -0.548 -0.745  -0.183   0.922    v5     
#>  6 id_6  -1.73   -0.820 -0.216  -0.754  -0.324    v3     
#>  7 id_7   1.31   -1.45  -0.0633 -1.74   -0.984    v1     
#>  8 id_8  -0.165  -1.14  -1.26    0.213  -0.367    v4     
#>  9 id_9  -0.230  -1.69  -1.77   -0.0239  1.88     v5     
#> 10 id_10 -0.240   0.251 -0.153  -0.501   0.598    v5     
#> 11 id_11  0.0615  0.938 -2.17    2.06   -0.341    v4     
#> 12 id_12 -1.24   -1.06   0.0820 -1.45    0.219    v5     
#> 13 id_13  1.56   -1.68   0.645  -0.813  -0.228    v1     
#> 14 id_14 -1.32   -0.668 -1.03    0.446  -0.257    v4     
#> 15 id_15  0.450   1.21  -1.13   -0.162   2.12     v5     
#> 16 id_16 -0.0846 -1.75   1.12   -0.955   0.996    v3     
#> 17 id_17  1.61    0.859  0.160   0.248   0.463    v1     
#> 18 id_18  0.420  -0.174  0.856   1.06    1.15     v5     
#> 19 id_19  0.202   0.213  0.792  -0.409  -0.516    v3     
#> 20 id_20  1.90    0.457 -0.692  -0.753   0.304    v1

This, I find to be the more elegant solution - Thanks for input :+1:

1 Like

Here another solution with the new across funtion (currently dev branch)

library(dplyr)

max_col <- function(df, ties.method = "random"){
  names <- names(df)
  names[max.col(df, ties.method = ties.method)]
}

df <- tibble(a = c(1, 0, 0),
             b = c(0, 1, 1),
             c = c(0, 0, NA),
             d = c("e", "f", "g"))

# Old
df %>% mutate(max = max_col(df %>% select_if(is.numeric)))
#> # A tibble: 3 x 5
#>       a     b     c d     max  
#>   <dbl> <dbl> <dbl> <chr> <chr>
#> 1     1     0     0 e     a    
#> 2     0     1     0 f     b    
#> 3     0     1    NA g     <NA>
# New
df %>% mutate(max = max_col(across(is.numeric)))
#> # A tibble: 3 x 5
#>       a     b     c d     max  
#>   <dbl> <dbl> <dbl> <chr> <chr>
#> 1     1     0     0 e     a    
#> 2     0     1     0 f     b    
#> 3     0     1    NA g     <NA>
> #
> # Just a and b
> df %>% mutate(max = max_col(across(c(a, b))))
# A tibble: 3 x 5
      a     b     c d     max  
  <dbl> <dbl> <dbl> <chr> <chr>
1     1     0     0 e     a    
2     0     1     0 f     b    
3     0     1    NA g     b    
1 Like

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.