I don't think your expected output is correct. Why the last element of max_P3
is 366
, and not 10
?
If that's a typo, here's a working but ugly (and almost surely non-generalisable) solution:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(purrr)
df_data <- tibble("V1" = c(1, 2, 5, 4, 366),
"V2" = c(2, 2, 5, 4, 366),
"V3" = c(10, 20, 10, 20, 10),
"V4" = c(2, 10, 31, 2, 2),
"V5" = c(4, 10, 31, 2, 2),
"NAME_of_NEWCOLUMN" = c("max_P1", "max_P1", "max_P2", "max_P2", "max_P3"),
"COLS_to_SELECT" = c("V1, V2", "V1, V2", "V3, V4, V5", "V3, V4, V5", "V3, V5"))
df_data %>%
group_split(COLS_to_SELECT) %>%
map(.f = ~ .x %>%
mutate(!!unique(x = .x$NAME_of_NEWCOLUMN) := do.call(what = pmax,
args = .x[unlist(x = strsplit(x = COLS_to_SELECT,
split = ", ")[[1]])]))) %>%
bind_rows() %>%
as.data.frame() # just to print all columns
#> V1 V2 V3 V4 V5 NAME_of_NEWCOLUMN COLS_to_SELECT max_P1 max_P2 max_P3
#> 1 1 2 10 2 4 max_P1 V1, V2 2 NA NA
#> 2 2 2 20 10 10 max_P1 V1, V2 2 NA NA
#> 3 5 5 10 31 31 max_P2 V3, V4, V5 NA 31 NA
#> 4 4 4 20 2 2 max_P2 V3, V4, V5 NA 20 NA
#> 5 366 366 10 2 2 max_P3 V3, V5 NA NA 10
Created on 2019-09-10 by the reprex package (v0.3.0)
I'm looking forward to elegant solutions to this question.
Edit
@sothy, adding a small modification to use map_dfr
(not that it makes it much elegant, but it's little faster and takes less memory):
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(purrr)
df_data <- tibble("V1" = c(1, 2, 5, 4, 366),
"V2" = c(2, 2, 5, 4, 366),
"V3" = c(10, 20, 10, 20, 10),
"V4" = c(2, 10, 31, 2, 2),
"V5" = c(4, 10, 31, 2, 2),
"NAME_of_NEWCOLUMN" = c("max_P1", "max_P1", "max_P2", "max_P2", "max_P3"),
"COLS_to_SELECT" = c("V1, V2", "V1, V2", "V3, V4, V5", "V3, V4, V5", "V3, V5"))
df_data %>%
group_split(COLS_to_SELECT) %>%
map_dfr(.f = ~ mutate(.data = .x,
!!unique(x = .x$NAME_of_NEWCOLUMN) := do.call(what = pmax,
args = .x[strsplit(x = COLS_to_SELECT,
split = ", ")[[1]]])))
#> # A tibble: 5 x 10
#> V1 V2 V3 V4 V5 NAME_of_NEWCOLU… COLS_to_SELECT max_P1
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
#> 1 1 2 10 2 4 max_P1 V1, V2 2
#> 2 2 2 20 10 10 max_P1 V1, V2 2
#> 3 5 5 10 31 31 max_P2 V3, V4, V5 NA
#> 4 4 4 20 2 2 max_P2 V3, V4, V5 NA
#> 5 366 366 10 2 2 max_P3 V3, V5 NA
#> # … with 2 more variables: max_P2 <dbl>, max_P3 <dbl>
Created on 2019-09-13 by the reprex package (v0.3.0)
benchmark
library(bench)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(purrr)
df_data <- tibble("V1" = c(1, 2, 5, 4, 366),
"V2" = c(2, 2, 5, 4, 366),
"V3" = c(10, 20, 10, 20, 10),
"V4" = c(2, 10, 31, 2, 2),
"V5" = c(4, 10, 31, 2, 2),
"NAME_of_NEWCOLUMN" = c("max_P1", "max_P1", "max_P2", "max_P2", "max_P3"),
"COLS_to_SELECT" = c("V1, V2", "V1, V2", "V3, V4, V5", "V3, V4, V5", "V3, V5"))
mark(soln1 =
{
df_data %>%
group_split(COLS_to_SELECT) %>%
map(.f = ~ .x %>%
mutate(!!unique(x = .x$NAME_of_NEWCOLUMN) := do.call(what = pmax,
args = .x[strsplit(x = COLS_to_SELECT,
split = ", ")[[1]]]))) %>%
bind_rows()
},
soln2 =
{
df_data %>%
group_split(COLS_to_SELECT) %>%
map_dfr(.f = ~ mutate(.data = .x,
!!unique(x = .x$NAME_of_NEWCOLUMN) := do.call(what = pmax,
args = .x[strsplit(x = COLS_to_SELECT,
split = ", ")[[1]]])))
},
iterations = 10000,
check = TRUE)
#> # A tibble: 2 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 soln1 997µs 1.06ms 915. 668.2KB 16.9
#> 2 soln2 829µs 877.75µs 1103. 31.7KB 16.5
Created on 2019-09-13 by the reprex package (v0.3.0)