Coalesce rows and remove NAs but keep all non-NA rows for each unique group

RuReady · October 7, 2018, 10:37pm

Hi,

I want collapse rows in columns Month & Value together while removing all NAs

library(dplyr)

dat <- structure(list(Group = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 
          2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3), Month = c("Jan", 
          "Feb", "Mar", "Jun", NA, NA, NA, NA, "Jan", "Mar", NA, NA, NA, 
          NA, NA, NA, "Feb", "Mar", "Jun", NA, NA, NA, NA, NA), Value = c(NA, 
          NA, NA, NA, "4", "5", "6", "4", NA, NA, NA, NA, "3", "2", NA, 
          NA, NA, NA, NA, NA, "8", "7", "4", NA)), row.names = c(NA, -24L
          ), class = c("tbl_df", "tbl", "data.frame"))
dat
#> # A tibble: 24 x 3
#>    Group Month Value
#>    <dbl> <chr> <chr>
#>  1     1 Jan   <NA> 
#>  2     1 Feb   <NA> 
#>  3     1 Mar   <NA> 
#>  4     1 Jun   <NA> 
#>  5     1 <NA>  4    
#>  6     1 <NA>  5    
#>  7     1 <NA>  6    
#>  8     1 <NA>  4    
#>  9     2 Jan   <NA> 
#> 10     2 Mar   <NA> 
#> # ... with 14 more rows

If I use coalesce_all_columns function below, it only keeps one row for each unique Group which is not what I'm looking for

coalesce_all_columns <- function(df) {
  return(coalesce(!!! as.list(df)))
}

dat %>% 
  group_by(Group) %>%
  summarise_all(coalesce_all_columns)
#> # A tibble: 3 x 3
#>   Group Month Value
#>   <dbl> <chr> <chr>
#> 1     1 Jan   4    
#> 2     2 Jan   3    
#> 3     3 Feb   8

I can get the desired output if I use data.table package but I'm also interested in a dplyr solution

library(data.table)
setDT(dat)[, lapply(.SD, na.omit), by = Group]

#>    Group Month Value
#> 1:     1   Jan     4
#> 2:     1   Feb     5
#> 3:     1   Mar     6
#> 4:     1   Jun     4
#> 5:     2   Jan     3
#> 6:     2   Mar     2
#> 7:     3   Feb     8
#> 8:     3   Mar     7
#> 9:     3   Jun     4

Any help is appreciated. Thank you!

MikeBadescu · October 7, 2018, 11:40pm

Is this what you have in mind? Please note that the solution will not work if the number of non-missing values in Month and Value do not match.

library(dplyr)

dat <- structure(list(
    Group = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 
              2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3), 
    Month = c("Jan", 
              "Feb", "Mar", "Jun", NA, NA, NA, NA, "Jan", "Mar", NA, NA, NA, 
              NA, NA, NA, "Feb", "Mar", "Jun", NA, NA, NA, NA, NA), 
    Value = c(NA, 
              NA, NA, NA, "4", "5", "6", "4", NA, NA, NA, NA, "3", "2", NA, 
              NA, NA, NA, NA, NA, "8", "7", "4", NA)), row.names = c(NA, -24L
              ), class = c("tbl_df", "tbl", "data.frame"))

coalesce_all_columns <- function(df) {
    
    tibble(
        Group = df$Group[1],
        Month = na.omit(df$Month),
        Value = na.omit(df$Value)
    )
}

dat %>% 
    group_by(Group) %>%
    do(coalesce_all_columns(.)) %>%
    ungroup()
#> # A tibble: 9 x 3
#>   Group Month Value
#>   <dbl> <chr> <chr>
#> 1     1 Jan   4    
#> 2     1 Feb   5    
#> 3     1 Mar   6    
#> 4     1 Jun   4    
#> 5     2 Jan   3    
#> 6     2 Mar   2    
#> 7     3 Feb   8    
#> 8     3 Mar   7    
#> 9     3 Jun   4

^{Created on 2018-10-07 by the reprex package (v0.2.1)}

RuReady · October 8, 2018, 1:15am

Thank you Mike!

That works but can you make the function more generic such as accepting any column names (not just Month or Value) or any number of columns (3, 4, etc) just like the data.table solution I posted?

MikeBadescu · October 8, 2018, 3:23am

It is not clear what is the ultimate goal and there are several paths:

provide the group columns (and apply na.omit to all the other columns)
provide the "coalesce" columns (but too many to type)

I picked an intermediary approach where the group columns are reasonably detected if not provided as an (character vector) argument. If you wish, you can add tidyeval (although I recommend to look at the tidyselect package). However, the big concern is the "un-matching" data in "coalesce" columns.

library(dplyr)

dat <- structure(list(
    Group = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 
              2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3), 
    Month = c("Jan", 
              "Feb", "Mar", "Jun", NA, NA, NA, NA, "Jan", "Mar", NA, NA, NA, 
              NA, NA, NA, "Feb", "Mar", "Jun", NA, NA, NA, NA, NA), 
    Value = c(NA, 
              NA, NA, NA, "4", "5", "6", "4", NA, NA, NA, NA, "3", "2", NA, 
              NA, NA, NA, NA, NA, "8", "7", "4", NA)), row.names = c(NA, -24L
              ), class = c("tbl_df", "tbl", "data.frame"))


coalesce_all_columns <- function(df, group_vars = NULL) {
    
    if (is.null(group_vars)) {
        group_vars <- 
            df %>%
            purrr::keep(~ dplyr::n_distinct(.x) == 1L) %>% 
            names()
    }
    
    msk <- colnames(df) %in% group_vars
    same_df <- df[1L, msk, drop = FALSE]
    coal_df <- df[, !msk, drop = FALSE] %>%
        purrr::map_dfc(na.omit)
    
    cbind(same_df, coal_df)
}


dat %>%
    group_by(Group) %>%
    do(coalesce_all_columns(.)) %>%
    ungroup()
#> # A tibble: 9 x 3
#>   Group Month Value
#>   <dbl> <chr> <chr>
#> 1     1 Jan   4    
#> 2     1 Feb   5    
#> 3     1 Mar   6    
#> 4     1 Jun   4    
#> 5     2 Jan   3    
#> 6     2 Mar   2    
#> 7     3 Feb   8    
#> 8     3 Mar   7    
#> 9     3 Jun   4


dat %>%
    group_by(Group) %>%
    do(coalesce_all_columns(., "Group")) %>%
    ungroup()
#> # A tibble: 9 x 3
#>   Group Month Value
#>   <dbl> <chr> <chr>
#> 1     1 Jan   4    
#> 2     1 Feb   5    
#> 3     1 Mar   6    
#> 4     1 Jun   4    
#> 5     2 Jan   3    
#> 6     2 Mar   2    
#> 7     3 Feb   8    
#> 8     3 Mar   7    
#> 9     3 Jun   4

^{Created on 2018-10-07 by the reprex package (v0.2.1)}

jcblum · October 8, 2018, 5:01am

If you're OK with a tidyr + dplyr + purr solution (as opposed to just dplyr), how about this?

library(tidyverse)

dat <- structure(list(Group = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
2, 3, 3, 3, 3, 3, 3, 3, 3), Month = c("Jan", "Feb", "Mar", "Jun", NA, NA, NA,
NA, "Jan", "Mar", NA, NA, NA, NA, NA, NA, "Feb", "Mar", "Jun", NA, NA, NA, NA,
NA), Value = c(NA, NA, NA, NA, "4", "5", "6", "4", NA, NA, NA, NA, "3", "2",
NA, NA, NA, NA, NA, NA, "8", "7", "4", NA)), row.names = c(NA, -24L ), class =
c("tbl_df", "tbl", "data.frame"))

dat %>% 
  nest(-Group) %>% 
  mutate(data = map(data, ~ map_dfc(., na.omit))) %>% 
  unnest()
#> # A tibble: 9 x 3
#>   Group Month Value
#>   <dbl> <chr> <chr>
#> 1     1 Jan   4    
#> 2     1 Feb   5    
#> 3     1 Mar   6    
#> 4     1 Jun   4    
#> 5     2 Jan   3    
#> 6     2 Mar   2    
#> 7     3 Feb   8    
#> 8     3 Mar   7    
#> 9     3 Jun   4

^{Created on 2018-10-08 by the reprex package (v0.2.1)}

RuReady · October 8, 2018, 3:21pm

Thank you again Mike! I'll go with @jcblum's solution because it's simpler

RuReady · October 8, 2018, 3:22pm

Perfect !!! Thank you !!!

jcblum · October 8, 2018, 5:02pm

I'm assuming you know for sure that there are never going to be any mismatches between the length of the non-NA values per column per group (like, maybe you're cleaning up some sort of join?). It's an interesting puzzle trying to sort out how to handle it if the columns are ragged, but like @MikeBadescu said, the possibilities quickly multiply.

Here's a naive solution to the ragged column situation that just pads out the shorter columns with NAs. Note that in cases where there was already a fully filled-in row, the correspondence will be broken! data.table does something different, recycling values from the shorter column instead of filling with NAs. The correspondence between values in rows that started out complete still gets broken.

library(tidyverse)

dat2 <- tribble(
  ~Group, ~Month, ~Value,
   1    , "Jan" ,   NA  ,   
   1    , "Feb" ,   NA  ,   
   1    , "Mar" ,   NA  ,   
   1    , "Jun" ,   NA  ,   
   1    ,   NA  ,  "4"  ,    
   1    ,   NA  ,  "5"  ,   
   1    ,   NA  ,  "6"  ,  
   1    ,   NA  ,   NA  ,
        
   2    , "Jan" ,   NA  ,   
   2    , "Mar" ,   NA  ,   
   2    ,   NA  ,   NA  ,   
   2    ,   NA  ,   NA  ,   
   2    ,   NA  ,  "3"  ,   
   2    ,   NA  ,  "2"  ,    
   2    ,   NA  ,  "1"  ,   
   2    ,   NA  ,   NA  ,
        
   3    , "Feb" ,   NA  ,   
   3    , "Mar" ,   NA  ,   
   3    , "Jun" ,   NA  ,    
   3    ,   NA  ,   NA  ,   
   3    ,   NA  ,  "8"  ,    
   3    , "Jul" ,  "7"  ,    
   3    ,   NA  ,  "4"  ,    
   3    , "Dec" ,   NA  )

# Collapse each column independently, and fill out
# the shorter columns with trailing NAs
collapse_fill_na <- function(dfr) {
  collapsed <- map(dfr, na.omit)
  max_len <- max(map_int(collapsed, length))
  map_if(collapsed,
         ~ length(.) < max_len,
         ~ c(., rep(NA_character_, max_len - length(.)))) %>% 
    bind_cols()
}

dat2 %>% 
  nest(-Group) %>% 
  mutate(data = map(data, collapse_fill_na)) %>% 
  unnest()
#> # A tibble: 12 x 3
#>    Group Month Value
#>    <dbl> <chr> <chr>
#>  1     1 Jan   4    
#>  2     1 Feb   5    
#>  3     1 Mar   6    
#>  4     1 Jun   <NA> 
#>  5     2 Jan   3    
#>  6     2 Mar   2    
#>  7     2 <NA>  1    
#>  8     3 Feb   8    
#>  9     3 Mar   7    
#> 10     3 Jun   4    
#> 11     3 Jul   <NA> 
#> 12     3 Dec   <NA>

# data.table recycles values instead (with a warning)
library(data.table)
#> 
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:dplyr':
#> 
#>     between, first, last
#> The following object is masked from 'package:purrr':
#> 
#>     transpose
setDT(dat2)[, lapply(.SD, na.omit), by = Group]
#> Warning in `[.data.table`(setDT(dat2), , lapply(.SD, na.omit), by = Group):
#> Column 2 of result for group 1 is length 3 but the longest column in this
#> result is 4. Recycled leaving remainder of 1 items. This warning is once
#> only for the first group with this issue.
#>     Group Month Value
#>  1:     1   Jan     4
#>  2:     1   Feb     5
#>  3:     1   Mar     6
#>  4:     1   Jun     4
#>  5:     2   Jan     3
#>  6:     2   Mar     2
#>  7:     2   Jan     1
#>  8:     3   Feb     8
#>  9:     3   Mar     7
#> 10:     3   Jun     4
#> 11:     3   Jul     8
#> 12:     3   Dec     7

^{Created on 2018-10-08 by the reprex package (v0.2.1)}

RuReady · October 9, 2018, 5:20pm

Yes you were right. There would never be any mismatches between the length of the non-NA values per column per group because those were the results from tidyr::spread. But thank you for a throughout answer!