Create all possible combinations of a data frame

Yarnabrina · March 23, 2019, 10:25am

Hi!

I've a data frame with n rows, and I want to apply a function on all possible combinations of k rows of this data frame.

If I can create another data frame which has \binom{n}{k} rows corresponding to the all possible row combinations,then I can simply use my function using apply on the rows.

My question is how to create such a data frame.

Based on this answer on SO, I can do this using base R. (provided below)

But I'm trying to learn tidyverse and hence I wonder whether there's a way to do this in tidyverse.

I can find the relevant row numbers from the output of tidyr::crossing(., .) [which creates all possible pairs] and extract only those. But I suppose there's a better way and at least for me, the pattern of the row indices is not that obvious. For k = 2, it's pretty easy. But I fail to find patterns for higher values of k.

Any suggestions will be appreciated.

# example dataset
input <- data.frame(stringsAsFactors = FALSE,
                    s = letters[1:6],
                    C = LETTERS[1:6])
input
#>   s C
#> 1 a A
#> 2 b B
#> 3 c C
#> 4 d D
#> 5 e E
#> 6 f F

# for example
k <- 3

# all possible combinations of the row indices of size k
combinations <- combn(x = seq_len(length.out = nrow(x = input)),
                      m = k)

# what I want
expected_output <- data.frame(stringsAsFactors = FALSE,
                              s1 = c("a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "b", "b",
                                     "b", "b", "b", "b", "c", "c", "c", "d"),
                              C1 = c("A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "B", "B",
                                     "B", "B", "B", "B", "C", "C", "C", "D"),
                              s2 = c("b", "b", "b", "b", "c", "c", "c", "d", "d", "e", "c", "c",
                                     "c", "d", "d", "e", "d", "d", "e", "e"),
                              C2 = c("B", "B", "B", "B", "C", "C", "C", "D", "D", "E", "C", "C",
                                     "C", "D", "D", "E", "D", "D", "E", "E"),
                              s3 = c("c", "d", "e", "f", "d", "e", "f", "e", "f", "f", "d", "e",
                                     "f", "e", "f", "f", "e", "f", "f", "f"),
                              C3 = c("C", "D", "E", "F", "D", "E", "F", "E", "F", "F", "D", "E",
                                     "F", "E", "F", "F", "E", "F", "F", "F"))
expected_output
#>    s1 C1 s2 C2 s3 C3
#> 1   a  A  b  B  c  C
#> 2   a  A  b  B  d  D
#> 3   a  A  b  B  e  E
#> 4   a  A  b  B  f  F
#> 5   a  A  c  C  d  D
#> 6   a  A  c  C  e  E
#> 7   a  A  c  C  f  F
#> 8   a  A  d  D  e  E
#> 9   a  A  d  D  f  F
#> 10  a  A  e  E  f  F
#> 11  b  B  c  C  d  D
#> 12  b  B  c  C  e  E
#> 13  b  B  c  C  f  F
#> 14  b  B  d  D  e  E
#> 15  b  B  d  D  f  F
#> 16  b  B  e  E  f  F
#> 17  c  C  d  D  e  E
#> 18  c  C  d  D  f  F
#> 19  c  C  e  E  f  F
#> 20  d  D  e  E  f  F

# two ways via base R

# I need to reorder the columns
output_1 <- as.data.frame(x = t(x = apply(X = combinations,
                                          MARGIN = 2,
                                          FUN = function(counter)
                                          {
                                            return(unlist(x = input[counter, ]))
                                          })))
output_1
#>    s1 s2 s3 C1 C2 C3
#> 1   a  b  c  A  B  C
#> 2   a  b  d  A  B  D
#> 3   a  b  e  A  B  E
#> 4   a  b  f  A  B  F
#> 5   a  c  d  A  C  D
#> 6   a  c  e  A  C  E
#> 7   a  c  f  A  C  F
#> 8   a  d  e  A  D  E
#> 9   a  d  f  A  D  F
#> 10  a  e  f  A  E  F
#> 11  b  c  d  B  C  D
#> 12  b  c  e  B  C  E
#> 13  b  c  f  B  C  F
#> 14  b  d  e  B  D  E
#> 15  b  d  f  B  D  F
#> 16  b  e  f  B  E  F
#> 17  c  d  e  C  D  E
#> 18  c  d  f  C  D  F
#> 19  c  e  f  C  E  F
#> 20  d  e  f  D  E  F

# I need to rename the columns
output_2 <- as.data.frame(x = t(x = apply(X = combinations,
                                          MARGIN = 2,
                                          FUN = function(counter)
                                          {
                                            rbind(sapply(X = counter,
                                                         FUN = function(t)
                                                         {
                                                           unlist(x = input[t, ])
                                                         }))
                                          })))
output_2
#>    V1 V2 V3 V4 V5 V6
#> 1   a  A  b  B  c  C
#> 2   a  A  b  B  d  D
#> 3   a  A  b  B  e  E
#> 4   a  A  b  B  f  F
#> 5   a  A  c  C  d  D
#> 6   a  A  c  C  e  E
#> 7   a  A  c  C  f  F
#> 8   a  A  d  D  e  E
#> 9   a  A  d  D  f  F
#> 10  a  A  e  E  f  F
#> 11  b  B  c  C  d  D
#> 12  b  B  c  C  e  E
#> 13  b  B  c  C  f  F
#> 14  b  B  d  D  e  E
#> 15  b  B  d  D  f  F
#> 16  b  B  e  E  f  F
#> 17  c  C  d  D  e  E
#> 18  c  C  d  D  f  F
#> 19  c  C  e  E  f  F
#> 20  d  D  e  E  f  F

^{Created on 2019-03-23 by the reprex package (v0.2.1)}

cderv · March 24, 2019, 10:03am

Here is a way using tidyverse. It is not more efficient I guess but it could be easier to read and to clarify the step. Main task is the reshaping to meet you expected output shape.

library(tidyverse)
input <- data.frame(stringsAsFactors = FALSE,
                    s = letters[1:6],
                    C = LETTERS[1:6])

# for example
k <- 3

# col order for the result (depends on k)
res_col_order <- seq_len(k) %>% 
  map(~ paste0(c("s", "C"), .x)) %>% 
  flatten_chr()

# all possible combinations of the row indices of size k
# RESULTS AS A LIST NOT A MATRIX HERE
combinations <- combn(x = seq_len(length.out = nrow(x = input)),
                      m = k, simplify = FALSE)

# map a function on that list of combination
res <- combinations %>%
  # map and row bind the result
  map_dfr(~ {
    input %>%
      # select the rows
      slice(.x) %>%
      # add an id for each selected combination
      rownames_to_column("id") %>% 
      # reshape the data to long format
      gather("letter", "value", -id) %>% 
      # create the column name
      unite(letter, letter, id, sep = "") %>% 
      # reshape to wide format
      spread(letter, value) %>%
      # order the column
      select_at(res_col_order)
  })


# expected output
expected_output <- data.frame(stringsAsFactors = FALSE,
                              s1 = c("a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "b", "b",
                                     "b", "b", "b", "b", "c", "c", "c", "d"),
                              C1 = c("A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "B", "B",
                                     "B", "B", "B", "B", "C", "C", "C", "D"),
                              s2 = c("b", "b", "b", "b", "c", "c", "c", "d", "d", "e", "c", "c",
                                     "c", "d", "d", "e", "d", "d", "e", "e"),
                              C2 = c("B", "B", "B", "B", "C", "C", "C", "D", "D", "E", "C", "C",
                                     "C", "D", "D", "E", "D", "D", "E", "E"),
                              s3 = c("c", "d", "e", "f", "d", "e", "f", "e", "f", "f", "d", "e",
                                     "f", "e", "f", "f", "e", "f", "f", "f"),
                              C3 = c("C", "D", "E", "F", "D", "E", "F", "E", "F", "F", "D", "E",
                                     "F", "E", "F", "F", "E", "F", "F", "F"))

identical(res, expected_output)
#> [1] TRUE

^{Created on 2019-03-24 by the reprex package (v0.2.1)}

There is also a way with less steps but more cryptic and advanced as it uses the power of purrr::pmap(). Basically, it does the same kind of reshaping but in one way.

library(tidyverse)
input <- data.frame(stringsAsFactors = FALSE,
                    s = letters[1:6],
                    C = LETTERS[1:6])

# for example
k <- 3

# col order for the result (depends on k)
res_col_order <- seq_len(k) %>% 
  map(~ paste0(c("s", "C"), .x)) %>% 
  flatten_chr()

# all possible combinations of the row indices of size k
# RESULTS AS A LIST NOT A MATRIX
combinations <- combn(x = seq_len(length.out = nrow(x = input)),
                      m = k, simplify = FALSE)

# map a function on that list of combination
res <- combinations %>%
  map_dfr(~ {
    input %>%
      # select the rows
      slice(.x) %>%
      # create a tibble from each row and column bind the result
      pmap_dfc(tibble) %>%
      # set the correct names by replacing all names
      set_names(res_col_order)
  })
res
#> # A tibble: 20 x 6
#>    s1    C1    s2    C2    s3    C3   
#>    <chr> <chr> <chr> <chr> <chr> <chr>
#>  1 a     A     b     B     c     C    
#>  2 a     A     b     B     d     D    
#>  3 a     A     b     B     e     E    
#>  4 a     A     b     B     f     F    
#>  5 a     A     c     C     d     D    
#>  6 a     A     c     C     e     E    
#>  7 a     A     c     C     f     F    
#>  8 a     A     d     D     e     E    
#>  9 a     A     d     D     f     F    
#> 10 a     A     e     E     f     F    
#> 11 b     B     c     C     d     D    
#> 12 b     B     c     C     e     E    
#> 13 b     B     c     C     f     F    
#> 14 b     B     d     D     e     E    
#> 15 b     B     d     D     f     F    
#> 16 b     B     e     E     f     F    
#> 17 c     C     d     D     e     E    
#> 18 c     C     d     D     f     F    
#> 19 c     C     e     E     f     F    
#> 20 d     D     e     E     f     F

^{Created on 2019-03-24 by the reprex package (v0.2.1)}

Yarnabrina · March 24, 2019, 1:41pm

Thanks for the response!

I was hoping for a direct approach via tidyverse. But if I have to use combn, then I guess I'll prefer base approach, as I understand what it's doing much better, as of now. Actually, these two methods are slower (see below in reprex), and I won't gain anything other than learning a new alternative (which is important, I understand).

# functions to check the required time

## for my approach
base_timing <- function(input_dataframe,
                        no_groups)
{
  base_time <- system.time(expr = {
    # all possible combinations of the row indices of size no_group
    combinations_matrix <- combn(x = nrow(x = input_dataframe),
                                 m = no_groups)
    
    # I need to rename the columns
    output_base_r <- data.frame(t(x = apply(X = combinations_matrix,
                                            MARGIN = 2,
                                            FUN = function(counter) 
                                            {
                                              rbind(sapply(X = counter,
                                                           FUN = function(t)
                                                           {
                                                             unlist(x = input_dataframe[t, ])
                                                           }))
                                            })),
                                stringsAsFactors = FALSE)
    
    # setting column names
    colnames(x = output_base_r) <- c(do.call(what = rbind,
                                             args = lapply(X = colnames(x = input_dataframe),
                                                           FUN = function(x)
                                                           {
                                                             paste0(x, seq(no_groups))
                                                           })))
  })
  
  return(list(output = output_base_r,
              time = base_time))
}

## for map approach (first one of yours)
map_timing <- function(input_dataframe,
                       no_groups)
{
  # loading tidyverse
  library(tidyverse)
  
  tidy_map_time <- system.time({
    # col order for the result (depends on no of groups)
    res_col_order <- seq_len(length.out = no_groups) %>% 
      map(~ paste0(colnames(x = input_dataframe), .x)) %>% 
      flatten_chr()
    
    # all possible combinations of the row indices of size k
    # RESULTS AS A LIST NOT A MATRIX HERE
    combinations_list <- combn(x = nrow(x = input_dataframe),
                               m = no_groups,
                               simplify = FALSE)
    
    # map a function on that list of combination
    output_tidy_map <- combinations_list %>%
      # map and row bind the result
      map_dfr(~ {
        input_dataframe %>%
          # select the rows
          slice(.x) %>%
          # add an id for each selected combination
          rownames_to_column("id") %>% 
          # reshape the data to long format
          gather("letter", "value", -id) %>% 
          # create the column name
          unite(letter, letter, id, sep = "") %>% 
          # reshape to wide format
          spread(letter, value) %>%
          # order the column
          select_at(res_col_order)
      })
  })
  
  return(list(output = output_tidy_map,
              time = tidy_map_time))
}
  
## for pmap approach (second one of yours)
pmap_timing <- function(input_dataframe,
                        no_groups)
{
  # loading tidyverse
  library(tidyverse)
  
  tidy_pmap_time <- system.time({
    # col order for the result (depends on no of groups)
    res_col_order <- seq_len(length.out = no_groups) %>% 
      map(~ paste0(colnames(x = input_dataframe), .x)) %>% 
      flatten_chr()
    
    # all possible combinations of the row indices of size k
    # RESULTS AS A LIST NOT A MATRIX HERE
    combinations_list <- combn(x = nrow(x = input_dataframe),
                               m = no_groups,
                               simplify = FALSE)
    
    # map a function on that list of combination
    output_tidy_pmap <- combinations_list %>%
      map_dfr(~ {
        input_dataframe %>%
          # select the rows
          slice(.x) %>%
          # create a tibble from each row and column bind the result
          pmap_dfc(tibble) %>%
          # set the correct names by replacing all names
          set_names(res_col_order)
      }) %>%
      as.data.frame()
  })
  
  return(list(output = output_tidy_pmap,
              time = tidy_pmap_time))
}

# original example
input_1 <- data.frame(stringsAsFactors = FALSE,
                      s = letters[1:6],
                      C = LETTERS[1:6])
k_1 <- 3
base_1 <- base_timing(input_dataframe = input_1,
                      no_groups = k_1)
map_1 <- map_timing(input_dataframe = input_1,
                    no_groups = k_1)
pmap_1 <- pmap_timing(input_dataframe = input_1,
                      no_groups = k_1)
all.equal(target = base_1$output,
          current = map_1$output)
#> [1] TRUE
all.equal(target = base_1$output,
          current = pmap_1$output)
#> [1] TRUE
list(base = base_1$time,
     map = map_1$time,
     pmap = pmap_1$time)
#> $base
#>    user  system elapsed 
#>    0.02    0.00    0.02 
#> 
#> $map
#>    user  system elapsed 
#>    0.30    0.00    0.31 
#> 
#> $pmap
#>    user  system elapsed 
#>    0.03    0.00    0.03

# small example
input_2 <- data.frame(stringsAsFactors = FALSE,
                      small = month.abb,
                      full = month.name)
k_2 <- 7
base_2 <- base_timing(input_dataframe = input_2,
                      no_groups = k_2)
map_2 <- map_timing(input_dataframe = input_2,
                    no_groups = k_2)
pmap_2 <- pmap_timing(input_dataframe = input_2,
                      no_groups = k_2)
all.equal(target = base_2$output,
          current = map_2$output)
#> [1] TRUE
all.equal(target = base_2$output,
          current = pmap_2$output)
#> [1] TRUE
list(base = base_2$time,
     map = map_2$time,
     pmap = pmap_2$time)
#> $base
#>    user  system elapsed 
#>    0.33    0.00    0.33 
#> 
#> $map
#>    user  system elapsed 
#>    9.33    0.02    9.49 
#> 
#> $pmap
#>    user  system elapsed 
#>    4.39    0.00    4.50

# numeric example
input_3 <- data.frame(stringsAsFactors = FALSE,
                      a = 1:10,
                      b = 10:1)
k_3 <- 5
base_3 <- base_timing(input_dataframe = input_3,
                      no_groups = k_3)
map_3 <- map_timing(input_dataframe = input_3,
                    no_groups = k_3)
pmap_3 <- pmap_timing(input_dataframe = input_3,
                      no_groups = k_3)
all.equal(target = base_3$output,
          current = map_3$output)
#> [1] TRUE
all.equal(target = base_3$output,
          current = pmap_3$output)
#> [1] TRUE
list(base = base_3$time,
     map = map_3$time,
     pmap = pmap_3$time)
#> $base
#>    user  system elapsed 
#>    0.06    0.00    0.06 
#> 
#> $map
#>    user  system elapsed 
#>    2.78    0.00    2.78 
#> 
#> $pmap
#>    user  system elapsed 
#>       1       0       1

^{Created on 2019-03-24 by the reprex package (v0.2.1)}

As you can see, base approach is better than both tidy approaches, and pmap is better than map. Can these two be modified to perform better? I tried with a dataframe of 20 rows and 7 groups, and the time difference is really significant.

> input <- data.frame(stringsAsFactors = FALSE,
+                     a = 1:20,
+                     b = 20:1)
> k <- 7
> base_time <- base_timing(input_dataframe = input,
+                          no_groups = k)
> pmap_time <- pmap_timing(input_dataframe = input,
+                          no_groups = k)
> all.equal(target = base_time$output,
+           current = pmap_time$output)
[1] TRUE
> list(base = base_time$time,
+      pmap = pmap_time$time)
$base
   user  system elapsed 
  30.80    0.03   31.23 

$pmap
   user  system elapsed 
 394.09    0.27  397.22

Sorry, it's not copy-paste friendly. Please use reprex::reprex_rescue() for this part.

nathania · March 24, 2019, 4:23pm

Hi! You can also try this tidyverse alternative.

rows <- input %>% group_by_all() %>% group_split()
row_combinations <- t(combn(x = 1:nrow(input), m = k)) %>% as_tibble()
row_combinations %>%
  mutate_all(~ map(., ~ pluck(rows, .x))) %>% 
  unnest()

Full reprex below. I'm not sure how it performs relative to base R, but I typically prefer the most readable and/or compact option unless I'm optimizing a bottleneck. Cheers!

library(tidyverse)

input <- data.frame(stringsAsFactors = FALSE,
                    s = letters[1:6],
                    C = LETTERS[1:6])
expected_output <- tibble(s1 = c("a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "b", "b",
                                 "b", "b", "b", "b", "c", "c", "c", "d"),
                          C1 = c("A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "B", "B",
                                 "B", "B", "B", "B", "C", "C", "C", "D"),
                          s2 = c("b", "b", "b", "b", "c", "c", "c", "d", "d", "e", "c", "c",
                                 "c", "d", "d", "e", "d", "d", "e", "e"),
                          C2 = c("B", "B", "B", "B", "C", "C", "C", "D", "D", "E", "C", "C",
                                 "C", "D", "D", "E", "D", "D", "E", "E"),
                          s3 = c("c", "d", "e", "f", "d", "e", "f", "e", "f", "f", "d", "e",
                                 "f", "e", "f", "f", "e", "f", "f", "f"),
                          C3 = c("C", "D", "E", "F", "D", "E", "F", "E", "F", "F", "D", "E",
                                     "F", "E", "F", "F", "E", "F", "F", "F"))
k <- 3

rows <- input %>% group_by_all() %>% group_split()
row_combinations <- t(combn(x = 1:nrow(input), m = k)) %>% as_tibble()
row_combinations %>%
  mutate_all(~ map(., ~ pluck(rows, .x))) %>% 
  unnest() %>% 
  set_names(names(expected_output)) %>% 
  all.equal(expected_output)
#> [1] TRUE

^{Created on 2019-03-24 by the reprex package (v0.2.1)}

Yarnabrina · March 24, 2019, 5:52pm

This one's great, thanks!

It's much faster than the previous solution by Christophe. Actually, I couldn't totally follow his approaches, but I can follow yours.

It turns out to be faster than the base-R approach too. See below:

# functions to check the required time

## for my approach
base_timing <- function(input_dataframe,
                        no_groups)
{
  base_time <- system.time(expr = {
    # all possible combinations of the row indices of size no_group
    combinations_matrix <- combn(x = nrow(x = input_dataframe),
                                 m = no_groups)
    
    # I need to rename the columns
    output_base_r <- data.frame(t(x = apply(X = combinations_matrix,
                                            MARGIN = 2,
                                            FUN = function(counter) 
                                            {
                                              rbind(sapply(X = counter,
                                                           FUN = function(t)
                                                           {
                                                             unlist(x = input_dataframe[t, ])
                                                           }))
                                            })),
                                stringsAsFactors = FALSE)
    
    # setting column names
    colnames(x = output_base_r) <- c(do.call(what = rbind,
                                             args = lapply(X = colnames(x = input_dataframe),
                                                           FUN = function(x)
                                                           {
                                                             paste0(x, seq(no_groups))
                                                           })))
  })
  
  return(list(output = output_base_r,
              time = base_time))
}

## for pluck approach
pluck_timing <- function(input_dataframe,
                         no_groups)
{
  # loading tidyverse
  library(tidyverse)
  
  tidy_pluck_time <- system.time({
    
    # col order for the result (depends on no of groups)
    res_col_order <- seq_len(length.out = no_groups) %>%
      map(~ paste0(colnames(x = input_dataframe), .x)) %>%
      flatten_chr()
    
    rows <- input_dataframe %>%
      group_by_all() %>%
      group_split()
    
    row_combinations <-
      t(x = combn(x = nrow(x = input_dataframe),
                  m = no_groups)) %>%
      as_tibble()
    
    output_tidy_pluck <-
      row_combinations %>%
      mutate_all(~ map(., ~ pluck(rows, .x))) %>% 
      unnest() %>%
      set_names(res_col_order) %>%
      as.data.frame()
  })
  
  return(list(output = output_tidy_pluck,
              time = tidy_pluck_time))
}

# numeric example
input_1 <- data.frame(a = 1:20,
                      b = 20:1)
k_1 <- 7
base_1 <- base_timing(input_dataframe = input_1,
                      no_groups = k_1)
pluck_1 <- pluck_timing(input_dataframe = input_1,
                        no_groups = k_1)
#> Warning: `as_tibble.matrix()` requires a matrix with column names or a `.name_repair` argument. Using compatibility `.name_repair`.
#> This warning is displayed once per session.
all.equal(target = base_1$output,
          current = pluck_1$output)
#> [1] TRUE
list(base = base_1$time,
     pluck = pluck_1$time)
#> $base
#>    user  system elapsed 
#>   28.60    0.05   28.92 
#> 
#> $pluck
#>    user  system elapsed 
#>    9.27    0.03    9.40

# original example
input_2 <- data.frame(stringsAsFactors = FALSE,
                      small = letters[1:10],
                      capital = LETTERS[1:10])
k_2 <- 5
base_2 <- base_timing(input_dataframe = input_2,
                      no_groups = k_2)
pluck_2 <- pluck_timing(input_dataframe = input_2,
                        no_groups = k_2)
all.equal(target = base_2$output,
          current = pluck_2$output)
#> [1] TRUE
list(base = base_2$time,
     pluck = pluck_2$time)
#> $base
#>    user  system elapsed 
#>    0.06    0.01    0.08 
#> 
#> $pluck
#>    user  system elapsed 
#>    0.03    0.00    0.04

^{Created on 2019-03-24 by the reprex package (v0.2.1)}

Thanks, once again.

Yarnabrina · March 31, 2019, 5:52pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.