unnesting a list-column while preserving the position in the list

I am struggling with a simple question.
Consider this example:

> tibble(tricky = '[10,20,30,40]') %>% 
+   mutate(mylist = map(tricky, ~jsonlite::parse_json(.x)))
# A tibble: 1 x 2
  tricky        mylist    
  <chr>         <list>    
1 [10,20,30,40] <list [4]>

As you can see, I am able to convert the JSON array [10,20,30,40] into a list-column.

Here the idea is that element 10 corresponds to index position 1, 20 corresponds to index position 2, etc. In other words, the order in the original tricky string is extremely important.

I am trying to unlist that variable so that the tibble contains both the element and its index position:

# A tibble: 4 x 2
    idx value
  <dbl> <dbl>
1     1    10
2     2    20
3     3    30
4     4    40

How can I do that? I tried some variations, like

tibble(tricky = '[10,20,30,40]') %>% 
  mutate(mylist = map(tricky, ~jsonlite::parse_json(.x))) %>% 
  mutate(test = map(mylist, ~unlist(.x, use.names = TRUE)))

with no success. Thanks!
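
(Edit: for anyone reading later, here is a sketch of one direction that gets both columns, using tibble::enframe(), which labels an unnamed vector by its position. The column names idx and value are just my choices.)

library(tidyverse)

tibble(tricky = '[10,20,30,40]') %>% 
  mutate(mylist = map(tricky, ~jsonlite::parse_json(.x))) %>% 
  # enframe() turns a vector into a two-column tibble; for an unnamed
  # vector the first column is the element's position
  mutate(tab = map(mylist, ~enframe(unlist(.x), name = "idx", value = "value"))) %>% 
  unnest(tab) %>% 
  select(idx, value)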

Are any of these useful?

library(magrittr)

temp <- tibble::tibble(tricky = '[10,20,30,40]') %>%
  dplyr::mutate(tricky = list(jsonlite::parse_json(json = tricky)))

temp %>%
  tidyr::unnest() %>%
  tidyr::unnest()
#> # A tibble: 4 x 1
#>   tricky
#>    <int>
#> 1     10
#> 2     20
#> 3     30
#> 4     40

unlist(x = temp) # you can use use.names = FALSE here
#> tricky1 tricky2 tricky3 tricky4 
#>      10      20      30      40
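
With use.names = FALSE you would get a bare, unnamed vector instead (a quick sketch):

unlist(x = temp, use.names = FALSE)
#> [1] 10 20 30 40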

Created on 2019-05-23 by the reprex package (v0.3.0)

Thanks @Yarnabrina! Now I realize my question was not clear enough. Consider this different example, with several rows and a grouping variable:


   tibble::tibble(tricky = c('[10,20,30,40]',
                            '[100,200,300,400]',
                            '[2,4,6,8]'),
                 group = c('a', 'a', 'b'))
# A tibble: 3 x 2
  tricky            group
  <chr>             <chr>
1 [10,20,30,40]     a    
2 [100,200,300,400] a    
3 [2,4,6,8]         b

Now, to achieve what I want, I was able to use the following:


  tibble::tibble(tricky = c('[10,20,30,40]',
                            '[100,200,300,400]',
                            '[2,4,6,8]'),
                 group = c('a', 'a', 'b')) %>% 
    mutate(mylist = map(tricky, ~jsonlite::parse_json(.x))) %>% 
    unnest(mylist) %>% 
    mutate(mylist = as.numeric(mylist)) %>% 
    group_by(tricky) %>% 
    mutate(idx = row_number()) %>% 
    ungroup()
  # A tibble: 12 x 4
   tricky            group mylist   idx
   <chr>             <chr>  <dbl> <int>
 1 [10,20,30,40]     a         10     1
 2 [10,20,30,40]     a         20     2
 3 [10,20,30,40]     a         30     3
 4 [10,20,30,40]     a         40     4
 5 [100,200,300,400] a        100     1
 6 [100,200,300,400] a        200     2
 7 [100,200,300,400] a        300     3
 8 [100,200,300,400] a        400     4
 9 [2,4,6,8]         b          2     1
10 [2,4,6,8]         b          4     2
11 [2,4,6,8]         b          6     3
12 [2,4,6,8]         b          8     4

However, this looks clunky and inefficient to me. Also, the as.numeric(mylist) step seems odd and, importantly, I am not entirely sure that grouping by tricky will preserve the row order so that my index is the right one; if the same tricky string ever appears in two rows, they would end up in a single group (see the sketch at the end of this post).

You see what I mean? Can we do better here?
Thanks!!!
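
(Edit: to illustrate my worry about duplicates, a sketch that groups by an explicit row id instead of tricky; the name rid is just illustrative.)

library(tidyverse)

tibble(tricky = c('[10,20,30,40]',
                  '[10,20,30,40]')) %>%   # the same string twice
  mutate(rid = row_number(),
         mylist = map(tricky, ~jsonlite::parse_json(.x))) %>% 
  unnest(mylist) %>% 
  mutate(mylist = as.numeric(mylist)) %>% 
  # grouping by rid keeps the two identical strings in separate groups,
  # so each row gets idx 1 to 4 instead of sharing 1 to 8
  group_by(rid) %>% 
  mutate(idx = row_number()) %>% 
  ungroup()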

Sorry, but I can't do any better than this:

library(magrittr)

tibble::tibble(tricky = c('[10,20,30,40]',
                          '[100,200,300,400]',
                          '[2,4,6,8]'),
               group = c('a', 'a', 'b')) %>%
  dplyr::group_by(tricky) %>%
  dplyr::mutate(mylist = list(jsonlite::parse_json(json = tricky,
                                                   simplifyVector = TRUE))) %>%
  tidyr::unnest() %>%
  dplyr::mutate(idx = dplyr::row_number()) %>%
  dplyr::ungroup()
#> # A tibble: 12 x 4
#>    tricky            group mylist   idx
#>    <chr>             <chr>  <int> <int>
#>  1 [10,20,30,40]     a         10     1
#>  2 [10,20,30,40]     a         20     2
#>  3 [10,20,30,40]     a         30     3
#>  4 [10,20,30,40]     a         40     4
#>  5 [100,200,300,400] a        100     1
#>  6 [100,200,300,400] a        200     2
#>  7 [100,200,300,400] a        300     3
#>  8 [100,200,300,400] a        400     4
#>  9 [2,4,6,8]         b          2     1
#> 10 [2,4,6,8]         b          4     2
#> 11 [2,4,6,8]         b          6     3
#> 12 [2,4,6,8]         b          8     4

I hope others will provide a better way.

Interesting. Thanks! I think my solution is a bit faster, but I am pretty sure something much better can be done here....

library(microbenchmark)

mydf <- tibble::tibble(tricky = c('[10,20,30,40]',
                          '[100,200,300,400]',
                          '[2,4,6,8]') ,
               group = c('a', 'a', 'b')) 

#make the tibble bigger
mydf_big <- mydf  %>% slice(rep(1:n(), each=100)) %>% 
  mutate(ID = row_number())

My proposal:

> microbenchmark(
+ mydf_big %>% mutate(mylist = map(tricky, ~parse_json(.x))) %>% unnest(mylist) %>% 
+   mutate(mylist = as.numeric(mylist)) %>% group_by(ID) %>% 
+   mutate(idx = row_number()) %>% ungroup()
+ )
Unit: milliseconds
                                                                                                                                                                                          expr
 mydf_big %>% mutate(mylist = map(tricky, ~parse_json(.x))) %>%      unnest(mylist) %>% mutate(mylist = as.numeric(mylist)) %>%      group_by(ID) %>% mutate(idx = row_number()) %>% ungroup()
     min      lq     mean   median      uq     max neval
 10.2174 10.6551 11.88908 10.95675 11.2491 42.7496   100

@Yarnabrina's proposal:

> microbenchmark(
+   mydf_big %>% group_by(ID) %>%
+     mutate(mylist = list(parse_json(json = tricky,
+                                                      simplifyVector = TRUE))) %>%
+     unnest() %>%
+     mutate(idx = row_number()) %>%
+     ungroup()
+ )
Unit: milliseconds
                                                                                                                                                                      expr
 mydf_big %>% group_by(ID) %>% mutate(mylist = list(parse_json(json = tricky,      simplifyVector = TRUE))) %>% unnest() %>% mutate(idx = row_number()) %>%      ungroup()
     min      lq     mean   median       uq     max neval
 14.4293 15.1698 16.76994 15.45455 15.83285 47.7144   100

How about this: We turn tricky into standard R string vectors and then unnest.

library(tidyverse)

x = tibble::tibble(tricky = c('[20,10,30,40]',
                              '[100,200,400,300]',
                              '[2,4,6,8]'),
                   group = c('a', 'a', 'b'))

x$tricky.parse = map(x$tricky, 
                     ~str_split(.x, "\\[|\\]|,") %>% unlist %>% .[. != ""])

x = x %>% unnest
x
#> # A tibble: 12 x 3
#>    tricky            group tricky.parse
#>    <chr>             <chr> <chr>       
#>  1 [20,10,30,40]     a     20          
#>  2 [20,10,30,40]     a     10          
#>  3 [20,10,30,40]     a     30          
#>  4 [20,10,30,40]     a     40          
#>  5 [100,200,400,300] a     100         
#>  6 [100,200,400,300] a     200         
#>  7 [100,200,400,300] a     400         
#>  8 [100,200,400,300] a     300         
#>  9 [2,4,6,8]         b     2           
#> 10 [2,4,6,8]         b     4           
#> 11 [2,4,6,8]         b     6           
#> 12 [2,4,6,8]         b     8

Thanks! I tried it and, surprisingly, it is much slower... :sob:

I tried it on mydf_big and I'm getting median times of 7, 18, and 39 ms for my, your, and Yarnabrina's methods, respectively.

Ha! This is quite good. I tried with str_split, so perhaps that's the reason why. I'll try again at home. Thanks for helping out!!!

Oh, I also notice you don't have the indexes. How many ms do you add by including them?

I tested the code on (I hope) a more realistic data frame and my code is definitely slower:

set.seed(2)
big_df = tibble(tricky = replicate(1e4, paste0("[", paste(sample(10:99, sample(3:6, 1)),  collapse=","), "]")),
                group = sample(letters, 1e4, replace=TRUE)) %>% 
  mutate(ID = row_number())

microbenchmark(

  von_olaf = big_df %>% 
    mutate(mylist = map(tricky, ~parse_json(.x))) %>% 
    unnest(mylist) %>% 
    mutate(mylist = as.numeric(mylist)) %>% 
    group_by(ID) %>% 
    mutate(idx = row_number()) %>% ungroup(),

  yarnabrina = big_df %>% group_by(ID) %>%
    mutate(mylist = list(parse_json(json = tricky,
                                    simplifyVector = TRUE))) %>%
    unnest() %>%
    mutate(idx = row_number()) %>%
    ungroup(),
  
  joels = {
    big_df$ID =  1:nrow(big_df)
    big_df$tricky.parse = map(big_df$tricky, 
                              ~str_split(.x, "\\[|\\]|,") %>% 
                                unlist %>% .[. != ""])
    big_df %>% 
      unnest() %>% 
      group_by(ID) %>%
      mutate(idx = 1:n())
  },
  times=10
)
Unit: milliseconds
      expr      min       lq     mean   median       uq       max neval cld
  von_olaf 130.0069 133.5388 138.9852 135.0734 144.8285  159.0802    10 a  
yarnabrina 260.5363 306.8618 308.1676 311.3855 318.3167  329.1660    10  b 
     joels 850.7023 857.7821 897.5298 883.7812 901.4701 1070.7251    10   c

Well, I really don't understand the time difference between my code and von_olaf's. I simplified the output of parse_json and used one unnest, while the other code doesn't simplify at first, then does one unnest and then converts to numeric. Why does that create a difference of around 0.1 seconds? Anyway, I really shouldn't comment, as I have absolutely zero idea about JSON.

I made a small change in my code to improve the time just a little (still slower than von_olaf). And I added an ungroup() at the end of joels' code.

Benchmark on my laptop:
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(jsonlite)
library(microbenchmark)
library(purrr)
#> 
#> Attaching package: 'purrr'
#> The following object is masked from 'package:jsonlite':
#> 
#>     flatten
library(stringr)
library(tidyr)

set.seed(seed = 31547)

big_df <- tibble(tricky = replicate(n = 1e+4,
                                    expr = paste0("[",
                                                  paste(sample(x = 10:99,
                                                               size = sample(x = 3:6,
                                                                             size = 1)),
                                                             collapse = ","),
                                                  "]")),
                 group = sample(x = letters,
                                size = 1e+4,
                                replace = TRUE)) %>% 
  mutate(ID = row_number())

microbenchmark(
  
  von_olaf = big_df %>% 
    mutate(mylist = map(.x = tricky,
                        .f = ~ parse_json(json = .x))) %>% 
    unnest(mylist) %>% 
    mutate(mylist = as.numeric(x = mylist)) %>% 
    group_by(ID) %>% 
    mutate(idx = row_number()) %>%
    ungroup(),
  
  yarnabrina = big_df %>%
    mutate(mylist = map(.x = tricky,
                        .f = ~ parse_json(json = .,
                                          simplifyVector = TRUE))) %>%
    unnest(mylist) %>%
    group_by(ID) %>%
    mutate(idx = row_number()) %>%
    ungroup(),
  
  joels = {
    big_df$ID =  1:nrow(x = big_df)
    big_df$mylist = map(.x = big_df$tricky, 
                              .f = ~ str_split(string = .x,
                                               pattern = "\\[|\\]|,") %>%
                                unlist %>%
                                .[. != ""])
    big_df %>%
      unnest() %>%
      group_by(ID) %>%
      mutate(idx = 1:n()) %>%
      ungroup()
  },
  
  times = 10
)
#> Unit: milliseconds
#>        expr       min        lq      mean    median        uq       max
#>    von_olaf  235.4389  244.9595  252.0547  250.9641  257.7325  273.0128
#>  yarnabrina  341.0141  350.3662  377.3627  362.6797  410.0881  455.1565
#>       joels 1737.4052 1774.3023 1857.8967 1856.3894 1911.9649 1994.7617
#>  neval
#>     10
#>     10
#>     10

And, though not relevant to this post, may I request any one of you to explain the difference between . and .x inside a function in purrr?

There is no real difference when you have one argument that is changing, but with map2(), for example, you'll need to use .x and .y, respectively.

. comes from magrittr (i.e., %>%) and is a placeholder for the result of the previous step.
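
A quick sketch to illustrate:

library(purrr)

# with one varying argument, . and .x are interchangeable in a purrr lambda
map_dbl(1:3, ~ .x * 2)
#> [1] 2 4 6

# with two varying arguments you need to tell them apart, hence .x and .y
map2_dbl(1:3, 4:6, ~ .x + .y)
#> [1] 5 7 9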

That's really fun, isn't it? Thanks @Yarnabrina for microbenchmarking again. Perhaps this is the best we can do here. I really don't know how to streamline the code further, except perhaps by using furrr::future_map() (a sketch below).
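
For the record, a sketch of the furrr swap (untested for speed here; assumes the furrr and future packages are installed, and it only pays off when the parsing itself dominates):

library(dplyr)
library(furrr)
library(future)
library(tidyr)

plan(multisession)  # start parallel workers

big_df %>% 
  mutate(mylist = future_map(tricky, ~jsonlite::parse_json(.x))) %>% 
  unnest(mylist) %>% 
  mutate(mylist = as.numeric(mylist)) %>% 
  group_by(ID) %>% 
  mutate(idx = row_number()) %>% 
  ungroup()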

I came up with this, but I don't know whether it is faster or slower:

tab <- tibble::tibble(tricky = c('[10,20,30,40]',
                          '[100,200,300,400]',
                          '[2,4,6,8]'),
               group = c('a', 'a', 'b'))

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(purrr)
library(tidyr)

tab %>%
  mutate(my_list = map(tricky, jsonlite::fromJSON),
         my_idx = map(my_list, seq_along)
  ) %>%
  unnest()
#> # A tibble: 12 x 4
#>    tricky            group my_list my_idx
#>    <chr>             <chr>   <int>  <int>
#>  1 [10,20,30,40]     a          10      1
#>  2 [10,20,30,40]     a          20      2
#>  3 [10,20,30,40]     a          30      3
#>  4 [10,20,30,40]     a          40      4
#>  5 [100,200,300,400] a         100      1
#>  6 [100,200,300,400] a         200      2
#>  7 [100,200,300,400] a         300      3
#>  8 [100,200,300,400] a         400      4
#>  9 [2,4,6,8]         b           2      1
#> 10 [2,4,6,8]         b           4      2
#> 11 [2,4,6,8]         b           6      3
#> 12 [2,4,6,8]         b           8      4

Created on 2019-05-25 by the reprex package (v0.3.0.9000)

That's a really cool use of seq_along. But unfortunately, it's slower. See the benchmark (created in an RStudio Cloud project) below:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(jsonlite)
library(microbenchmark)
library(purrr)
#> 
#> Attaching package: 'purrr'
#> The following object is masked from 'package:jsonlite':
#> 
#>     flatten
library(stringr)
library(tidyr)

set.seed(seed = 31547)

big_df <- tibble(tricky = replicate(n = 1e+4,
                                    expr = paste0("[",
                                                  paste(sample(x = 10:99,
                                                               size = sample(x = 3:6,
                                                                             size = 1)),
                                                        collapse = ","),
                                                  "]")),
                 group = sample(x = letters,
                                size = 1e+4,
                                replace = TRUE)) %>% 
  mutate(ID = row_number())

microbenchmark(
  
  von_olaf = big_df %>% 
    mutate(mylist = map(.x = tricky,
                        .f = ~ parse_json(json = .x))) %>% 
    unnest(mylist) %>% 
    mutate(mylist = as.numeric(x = mylist)) %>% 
    group_by(ID) %>% 
    mutate(idx = row_number()) %>%
    ungroup(),
  
  yarnabrina = big_df %>%
    mutate(mylist = map(.x = tricky,
                        .f = parse_json,
                        simplifyVector = TRUE)) %>%
    unnest(mylist) %>%
    group_by(ID) %>%
    mutate(idx = row_number()) %>%
    ungroup(),
  
  cderv = big_df %>%
    mutate(my_list = map(.x = tricky,
                         .f = fromJSON),
           my_idx = map(.x = my_list,
                        .f = seq_along)) %>%
    unnest(),
  
  joels = {
    big_df$ID =  1:nrow(x = big_df)
    big_df$mylist = map(.x = big_df$tricky, 
                        .f = ~ str_split(string = .x,
                                         pattern = "\\[|\\]|,") %>%
                          unlist %>%
                          .[. != ""])
    big_df %>%
      unnest() %>%
      group_by(ID) %>%
      mutate(idx = 1:n()) %>%
      ungroup()
  },
  
  times = 10
)
#> Unit: milliseconds
#>        expr      min       lq     mean    median        uq       max neval
#>    von_olaf 126.2928 130.8220 144.9810  138.1143  144.8329  208.5256    10
#>  yarnabrina 172.0955 179.6934 188.6127  181.2032  189.6586  239.0419    10
#>       cderv 256.7838 267.8682 290.5427  295.7280  306.4159  317.4832    10
#>       joels 964.3267 977.8859 998.4823 1004.4720 1012.0781 1038.3356    10

OK, you're looking at optimizing milliseconds here! Not my league. :wink:
Some hints: parse_json() will be faster than fromJSON() here. Also, the less you let R guess, the better off you'll be.
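
For example, a rough sketch (absolute timings will vary from machine to machine):

microbenchmark::microbenchmark(
  parse_json = jsonlite::parse_json('[10,20,30,40]', simplifyVector = TRUE),
  fromJSON   = jsonlite::fromJSON('[10,20,30,40]')
)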

Personally, I prefer clarity of the code and readability for others. I find it easier to maintain, and I am willing to let some 100 ms go for that. :smile:

Thanks everyone! This was really interesting. Can we agree I have some bragging rights here??? :wink:
