get rowmax of column, where columnnames are passed as a variable

sothy · September 10, 2019, 12:55pm

Hi everyone,

Can't get my mind around to solve the following problem:

I have a dataframe, where I want to extract the row-maximum of a selection of columns and create an new column.
The columnnames of the columns to be selected and the name of the new_column are stored as a variable.

# input dataframe
df_data <- tibble("V1" = c(1, 2, 5, 4, 366),
                  "V2" = c(2, 2, 5, 4, 366),
                  "V3" = c(10, 20, 10, 20, 10),
                  "V4" = c(2, 10, 31, 2, 2),
                  "V5" = c(4, 10, 31, 2, 2),
                  "NAME_of_NEWCOLUMN" = c("max_P1", "max_P1", "max_P2", "max_P2", "max_P3"),
                  "COLS_to_SELECT" = c("V1, V2", "V1, V2", "V3, V4, V5", "V3, V4, V5", "V3, V5"))


# dataframe with desired output
df_data_output <- tibble("V1" = c(1, 2, 5, 4, 366),
                  "V2" = c(2, 2, 5, 4, 366),
                  "V3" = c(10, 20, 10, 20, 10),
                  "V4" = c(2, 10, 31, 2, 2),
                  "V5" = c(4, 10, 31, 2, 2),
                  "NAME_of_NEWCOLUMN" = c("max_P1", "max_P1", "max_P2", "max_P2", "max_P3"),
                  "COLS_to_SELECT" = c("V1, V2", "V1, V2", "V3, V4, V5", "V3, V4, V5", "V3, V5"),
                  "max_P1" = c(2, 2, NA, NA, NA),
                  "max_P2" = c(NA, NA, 31, 20, NA),
                  "max_P3" = c(NA, NA, NA, NA, 366))

My unsuccessfull approach was to solve this problem with map as follows (I was simply trying to reduce the problem to select and get the sum, before tackling the actual problem):

df_data$COLS_to_SELECT %>% as.list() %>% 
  map(~ df_data %>% select(!!.x) %>% rowsum(.))

Any suggestions are welcome.

--
Greetings from Switzerland, Sothy.

Yarnabrina · September 10, 2019, 4:36pm

I don't think your expected output is correct. Why the last element of max_P3 is 366, and not 10?

If that's a typo, here's a working but ugly (and almost surely non-generalisable) solution:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(purrr)

df_data <- tibble("V1" = c(1, 2, 5, 4, 366),
                  "V2" = c(2, 2, 5, 4, 366),
                  "V3" = c(10, 20, 10, 20, 10),
                  "V4" = c(2, 10, 31, 2, 2),
                  "V5" = c(4, 10, 31, 2, 2),
                  "NAME_of_NEWCOLUMN" = c("max_P1", "max_P1", "max_P2", "max_P2", "max_P3"),
                  "COLS_to_SELECT" = c("V1, V2", "V1, V2", "V3, V4, V5", "V3, V4, V5", "V3, V5"))

df_data %>%
  group_split(COLS_to_SELECT) %>%
  map(.f = ~ .x %>%
        mutate(!!unique(x = .x$NAME_of_NEWCOLUMN) := do.call(what = pmax,
                                                             args = .x[unlist(x = strsplit(x = COLS_to_SELECT,
                                                                                           split = ", ")[[1]])]))) %>%
  bind_rows() %>%
  as.data.frame() # just to print all columns
#>    V1  V2 V3 V4 V5 NAME_of_NEWCOLUMN COLS_to_SELECT max_P1 max_P2 max_P3
#> 1   1   2 10  2  4            max_P1         V1, V2      2     NA     NA
#> 2   2   2 20 10 10            max_P1         V1, V2      2     NA     NA
#> 3   5   5 10 31 31            max_P2     V3, V4, V5     NA     31     NA
#> 4   4   4 20  2  2            max_P2     V3, V4, V5     NA     20     NA
#> 5 366 366 10  2  2            max_P3         V3, V5     NA     NA     10

^{Created on 2019-09-10 by the reprex package (v0.3.0)}

I'm looking forward to elegant solutions to this question.

Edit

@sothy, adding a small modification to use map_dfr (not that it makes it much elegant, but it's little faster and takes less memory):

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(purrr)

df_data <- tibble("V1" = c(1, 2, 5, 4, 366),
                  "V2" = c(2, 2, 5, 4, 366),
                  "V3" = c(10, 20, 10, 20, 10),
                  "V4" = c(2, 10, 31, 2, 2),
                  "V5" = c(4, 10, 31, 2, 2),
                  "NAME_of_NEWCOLUMN" = c("max_P1", "max_P1", "max_P2", "max_P2", "max_P3"),
                  "COLS_to_SELECT" = c("V1, V2", "V1, V2", "V3, V4, V5", "V3, V4, V5", "V3, V5"))

df_data %>%
  group_split(COLS_to_SELECT) %>%
  map_dfr(.f = ~ mutate(.data = .x,
                        !!unique(x = .x$NAME_of_NEWCOLUMN) := do.call(what = pmax,
                                                                      args = .x[strsplit(x = COLS_to_SELECT,
                                                                                         split = ", ")[[1]]])))
#> # A tibble: 5 x 10
#>      V1    V2    V3    V4    V5 NAME_of_NEWCOLU… COLS_to_SELECT max_P1
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <chr>            <chr>           <dbl>
#> 1     1     2    10     2     4 max_P1           V1, V2              2
#> 2     2     2    20    10    10 max_P1           V1, V2              2
#> 3     5     5    10    31    31 max_P2           V3, V4, V5         NA
#> 4     4     4    20     2     2 max_P2           V3, V4, V5         NA
#> 5   366   366    10     2     2 max_P3           V3, V5             NA
#> # … with 2 more variables: max_P2 <dbl>, max_P3 <dbl>

^{Created on 2019-09-13 by the reprex package (v0.3.0)}

benchmark

library(bench)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(purrr)

df_data <- tibble("V1" = c(1, 2, 5, 4, 366),
                  "V2" = c(2, 2, 5, 4, 366),
                  "V3" = c(10, 20, 10, 20, 10),
                  "V4" = c(2, 10, 31, 2, 2),
                  "V5" = c(4, 10, 31, 2, 2),
                  "NAME_of_NEWCOLUMN" = c("max_P1", "max_P1", "max_P2", "max_P2", "max_P3"),
                  "COLS_to_SELECT" = c("V1, V2", "V1, V2", "V3, V4, V5", "V3, V4, V5", "V3, V5"))

mark(soln1 =
       {
         df_data %>%
           group_split(COLS_to_SELECT) %>%
           map(.f = ~ .x %>%
                 mutate(!!unique(x = .x$NAME_of_NEWCOLUMN) := do.call(what = pmax,
                                                                      args = .x[strsplit(x = COLS_to_SELECT,
                                                                                         split = ", ")[[1]]]))) %>%
           bind_rows()
       },
     soln2 =
       {
         df_data %>%
           group_split(COLS_to_SELECT) %>%
           map_dfr(.f = ~ mutate(.data = .x,
                                 !!unique(x = .x$NAME_of_NEWCOLUMN) := do.call(what = pmax,
                                                                               args = .x[strsplit(x = COLS_to_SELECT,
                                                                                                  split = ", ")[[1]]])))
       },
     iterations = 10000,
     check = TRUE)
#> # A tibble: 2 x 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 soln1         997µs   1.06ms      915.   668.2KB     16.9
#> 2 soln2         829µs 877.75µs     1103.    31.7KB     16.5

^{Created on 2019-09-13 by the reprex package (v0.3.0)}

sothy · September 11, 2019, 8:43am

Hi Anirban,

Thank you for your solution, it works so far.

But I am also curious, if a more elegant way to solve this problem exists... I am sure, that most data scientists have faced similar problems.

What makes this problem interesting to me is, that solving this in a way, without leaving the dataframe-space (e.g. without splitting and binding) means, that I would be able to dynamically create new features with metainformation about the dataframe stored inside the dataframe (e.g. columnnames to filter are stored as a variable in COLS_to_SELECT).

system · October 2, 2019, 8:50am

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.