Leon
March 6, 2020, 9:11pm
1
Do you know of an elegant solution to the case, where you have a numeric subset of columns in your data and then you want a new column, which holds the column name of the max of the aforementioned subset?
Example:
# Library
library("tidyverse")
# Make some data
set.seed(247350)
d <- tibble(id = str_c("id_", 1:20),
v1 = rnorm(20),
v2 = rnorm(20),
v3 = rnorm(20),
v4 = rnorm(20),
v5 = rnorm(20))
# Non-elegant solution 1 - rowwise
d %>%
select(v1:v5) %>%
rowwise %>%
mutate(is_max = colnames(d)[which.max(c(v1, v2, v3, v4, v5))+1]) %>%
ungroup
# Non-elegant solution 2 - pivotting
d %>%
pivot_longer(cols = -id, names_to = "variable", values_to = "value") %>%
group_by(id) %>%
mutate(is_max = variable[which.max(value)]) %>%
ungroup %>%
pivot_wider(names_from = variable, values_from = value)
I"m not sure what you mean by elegant, but I liked this solution:
library(tidyverse)
d <- tibble(id = str_c("id_", 1:20),
v1 = rnorm(20),
v2 = rnorm(20),
v3 = rnorm(20),
v4 = rnorm(20),
v5 = rnorm(20))
d %>%
pivot_longer(-1) %>%
group_by(id) %>%
filter(value == max(value)) %>%
inner_join(d)
#> Joining, by = "id"
#> # A tibble: 20 x 8
#> # Groups: id [20]
#> id name value v1 v2 v3 v4 v5
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id_1 v3 1.09 -1.32 -0.324 1.09 0.647 -0.365
#> 2 id_2 v5 0.705 -2.11 -2.59 -1.48 0.475 0.705
#> 3 id_3 v4 0.821 0.649 -2.19 -0.715 0.821 -0.703
#> 4 id_4 v2 -0.560 -2.05 -0.560 -1.20 -1.20 -0.690
#> 5 id_5 v5 0.841 -0.0216 -0.696 -0.978 0.455 0.841
#> 6 id_6 v4 1.54 -0.371 0.146 0.711 1.54 -0.339
#> 7 id_7 v1 2.39 2.39 0.773 1.01 -0.305 -0.569
#> 8 id_8 v1 1.42 1.42 -2.15 1.05 -0.108 0.617
#> 9 id_9 v2 -0.138 -0.835 -0.138 -0.438 -0.831 -0.200
#> 10 id_10 v3 1.84 -0.617 0.264 1.84 0.0989 0.993
#> 11 id_11 v2 2.21 0.431 2.21 1.01 0.739 -0.259
#> 12 id_12 v3 1.73 -1.44 0.358 1.73 -0.958 -1.89
#> 13 id_13 v5 1.04 0.630 -0.502 -0.0405 -0.360 1.04
#> 14 id_14 v1 0.325 0.325 -0.994 -2.66 -0.207 -0.817
#> 15 id_15 v1 1.90 1.90 -0.331 0.422 1.19 -0.366
#> 16 id_16 v2 1.88 0.647 1.88 0.798 1.41 -0.165
#> 17 id_17 v4 0.651 -0.987 -0.0923 -0.988 0.651 0.0388
#> 18 id_18 v4 0.731 0.616 -0.348 0.374 0.731 0.481
#> 19 id_19 v1 1.56 1.56 0.182 0.0765 -0.715 -0.497
#> 20 id_20 v1 1.52 1.52 -0.172 -0.845 -1.10 -0.144
Created on 2020-03-06 by the reprex package (v0.3.0)
2 Likes
joels
March 6, 2020, 10:36pm
3
How about this:
d %>%
mutate(is_max = apply(.[-1], 1, function(x) names(x)[which.max(x)]))
Click to see output
id v1 v2 v3 v4 v5 is_max
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 id_1 0.0151 -1.10 -1.28 -0.378 -1.22 v1
2 id_2 1.31 -0.586 -1.49 -1.31 -0.687 v1
3 id_3 1.74 0.106 2.03 -1.06 2.44 v5
4 id_4 0.449 -1.39 -0.175 -1.02 0.214 v1
5 id_5 -0.0120 -2.83 1.32 0.723 -1.10 v3
6 id_6 -0.472 0.435 1.93 0.0546 0.176 v3
7 id_7 1.85 -0.860 0.492 -0.493 -0.465 v1
8 id_8 -0.979 1.15 -0.0938 0.596 2.39 v5
9 id_9 0.354 0.373 -0.388 0.728 -0.218 v4
10 id_10 0.0955 -1.70 -0.208 0.866 -0.0983 v4
11 id_11 0.518 0.175 0.685 0.973 -0.265 v4
12 id_12 -1.60 0.200 -0.347 -0.670 -1.91 v2
13 id_13 1.22 1.11 0.302 0.0744 1.12 v1
14 id_14 2.28 1.15 0.353 0.352 0.431 v1
15 id_15 -0.609 1.02 0.611 0.0351 0.846 v2
16 id_16 0.336 0.543 -0.333 -0.386 0.769 v5
17 id_17 0.971 -0.923 0.973 0.960 0.439 v3
18 id_18 -1.27 0.554 1.11 0.702 0.198 v3
19 id_19 -1.25 0.642 -0.0938 -1.59 0.643 v5
20 id_20 1.55 0.542 1.36 -0.398 1.42 v1
Inspired by @joels 's solution to try to translate it into a purrr
version :
library(tidyverse)
d <- tibble(id = str_c("id_", 1:20),
v1 = rnorm(20),
v2 = rnorm(20),
v3 = rnorm(20),
v4 = rnorm(20),
v5 = rnorm(20))
d %>%
mutate(
max_col =
d %>% select(-1) %>%
pmap(c) %>% map(which.max) %>% map_chr(names))
Created on 2020-03-06 by the reprex package (v0.3.0)
Output
#> # A tibble: 20 x 7
#> id v1 v2 v3 v4 v5 max_col
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 id_1 -0.261 -0.905 -0.391 -0.742 -0.000760 v5
#> 2 id_2 -1.12 -0.640 1.18 -1.78 -1.94 v3
#> 3 id_3 0.444 -0.349 0.454 -0.424 -0.750 v3
#> 4 id_4 1.47 -0.307 -0.631 1.27 0.496 v1
#> 5 id_5 -1.38 -0.548 -0.745 -0.183 0.922 v5
#> 6 id_6 -1.73 -0.820 -0.216 -0.754 -0.324 v3
#> 7 id_7 1.31 -1.45 -0.0633 -1.74 -0.984 v1
#> 8 id_8 -0.165 -1.14 -1.26 0.213 -0.367 v4
#> 9 id_9 -0.230 -1.69 -1.77 -0.0239 1.88 v5
#> 10 id_10 -0.240 0.251 -0.153 -0.501 0.598 v5
#> 11 id_11 0.0615 0.938 -2.17 2.06 -0.341 v4
#> 12 id_12 -1.24 -1.06 0.0820 -1.45 0.219 v5
#> 13 id_13 1.56 -1.68 0.645 -0.813 -0.228 v1
#> 14 id_14 -1.32 -0.668 -1.03 0.446 -0.257 v4
#> 15 id_15 0.450 1.21 -1.13 -0.162 2.12 v5
#> 16 id_16 -0.0846 -1.75 1.12 -0.955 0.996 v3
#> 17 id_17 1.61 0.859 0.160 0.248 0.463 v1
#> 18 id_18 0.420 -0.174 0.856 1.06 1.15 v5
#> 19 id_19 0.202 0.213 0.792 -0.409 -0.516 v3
#> 20 id_20 1.90 0.457 -0.692 -0.753 0.304 v1
Leon
March 7, 2020, 8:53am
5
This, I find to be the more elegant solution - Thanks for input
1 Like
Here another solution with the new across funtion (currently dev branch)
library(dplyr)
max_col <- function(df, ties.method = "random"){
names <- names(df)
names[max.col(df, ties.method = ties.method)]
}
df <- tibble(a = c(1, 0, 0),
b = c(0, 1, 1),
c = c(0, 0, NA),
d = c("e", "f", "g"))
# Old
df %>% mutate(max = max_col(df %>% select_if(is.numeric)))
#> # A tibble: 3 x 5
#> a b c d max
#> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 1 0 0 e a
#> 2 0 1 0 f b
#> 3 0 1 NA g <NA>
# New
df %>% mutate(max = max_col(across(is.numeric)))
#> # A tibble: 3 x 5
#> a b c d max
#> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 1 0 0 e a
#> 2 0 1 0 f b
#> 3 0 1 NA g <NA>
> #
> # Just a and b
> df %>% mutate(max = max_col(across(c(a, b))))
# A tibble: 3 x 5
a b c d max
<dbl> <dbl> <dbl> <chr> <chr>
1 1 0 0 e a
2 0 1 0 f b
3 0 1 NA g b
1 Like
Leon
Closed
March 14, 2020, 2:12pm
7
This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.