Missing Data in one table from another

I have 2 data tables. One has lots of variables for many groups. Each observation has a unique ID. The other contains only a few variables for a few groups. It has the same IDs as the first table. The variables in the second table are present in the first but are always missing for the groups in the second. I need to replace the missing values in table 1 with those in table 2. There needs to be some sort of join but I can not figure out the correct way to do it. In the example below df1 + df2 need to become df3.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
df1 <- structure(list(id = 1:10, 
                      group = c("a", "a", "a", "b", "b", 
                                "c", "c", "c", "d", "d"), 
                      var1 = c(NA, NA, NA, NA, NA, 58L, 
                               NA, 23L, NA, 30L), 
                      var2 = c(NA, NA, NA, NA, NA, 84L,
                               NA, 89L, NA, 91L), 
                      var3 = c("O", "X", "H", "S", "T",
                               "E", "U", "L", "I", "B"), 
                      var4 = c("t", "v", "u", "p", "s",
                               "m", "k", "f", "e", "g")), 
                 row.names = c(NA, -10L), 
                 class = c("tbl_df", "tbl", "data.frame"))

df2 <- structure(list(id = 1:5, 
                      group = c("a", "a", "a", "b", "b"), 
                      var1 = c(2L, 8L, 46L, 14L, 92L), 
                      var2 = c(18L, 15L, 2L, 63L, 28L)), 
                 row.names = c(NA, -5L), 
                 class = c("tbl_df", "tbl", "data.frame"))

df3 <- structure(list(id = 1:10, 
                      group = c("a", "a", "a", "b", "b", 
                                "c", "c", "c", "d", "d"), 
                      var1 = c(2L, 8L, 46L, 14L, 92L, 58L, 
                               NA, 23L, NA, 30L), 
                      var2 = c(18L, 15L, 2L, 63L, 28L, 84L,
                               NA, 89L, NA, 91L), 
                      var3 = c("O", "X", "H", "S", "T",
                               "E", "U", "L", "I", "B"), 
                      var4 = c("t", "v", "u", "p", "s",
                               "m", "k", "f", "e", "g")), 
                 row.names = c(NA, -10L), 
                 class = c("tbl_df", "tbl", "data.frame"))
df1
#> # A tibble: 10 x 6
#>       id group  var1  var2 var3  var4 
#>    <int> <chr> <int> <int> <chr> <chr>
#>  1     1 a        NA    NA O     t    
#>  2     2 a        NA    NA X     v    
#>  3     3 a        NA    NA H     u    
#>  4     4 b        NA    NA S     p    
#>  5     5 b        NA    NA T     s    
#>  6     6 c        58    84 E     m    
#>  7     7 c        NA    NA U     k    
#>  8     8 c        23    89 L     f    
#>  9     9 d        NA    NA I     e    
#> 10    10 d        30    91 B     g
df2
#> # A tibble: 5 x 4
#>      id group  var1  var2
#>   <int> <chr> <int> <int>
#> 1     1 a         2    18
#> 2     2 a         8    15
#> 3     3 a        46     2
#> 4     4 b        14    63
#> 5     5 b        92    28
df3 
#> # A tibble: 10 x 6
#>       id group  var1  var2 var3  var4 
#>    <int> <chr> <int> <int> <chr> <chr>
#>  1     1 a         2    18 O     t    
#>  2     2 a         8    15 X     v    
#>  3     3 a        46     2 H     u    
#>  4     4 b        14    63 S     p    
#>  5     5 b        92    28 T     s    
#>  6     6 c        58    84 E     m    
#>  7     7 c        NA    NA U     k    
#>  8     8 c        23    89 L     f    
#>  9     9 d        NA    NA I     e    
#> 10    10 d        30    91 B     g
1 Like

I would use a left join and then replace the original if it is NA.

library(dplyr)

df1 <- structure(list(id = 1:10, 
                      group = c("a", "a", "a", "b", "b", 
                                "c", "c", "c", "d", "d"), 
                      var1 = c(NA, NA, NA, NA, NA, 58L, 
                               NA, 23L, NA, 30L), 
                      var2 = c(NA, NA, NA, NA, NA, 84L,
                               NA, 89L, NA, 91L), 
                      var3 = c("O", "X", "H", "S", "T",
                               "E", "U", "L", "I", "B"), 
                      var4 = c("t", "v", "u", "p", "s",
                               "m", "k", "f", "e", "g")), 
                 row.names = c(NA, -10L), 
                 class = c("tbl_df", "tbl", "data.frame"))

df2 <- structure(list(id = 1:5, 
                      group = c("a", "a", "a", "b", "b"), 
                      var1 = c(2L, 8L, 46L, 14L, 92L), 
                      var2 = c(18L, 15L, 2L, 63L, 28L)), 
                 row.names = c(NA, -5L), 
                 class = c("tbl_df", "tbl", "data.frame"))

df3 <- structure(list(id = 1:10, 
                      group = c("a", "a", "a", "b", "b", 
                                "c", "c", "c", "d", "d"), 
                      var1 = c(2L, 8L, 46L, 14L, 92L, 58L, 
                               NA, 23L, NA, 30L), 
                      var2 = c(18L, 15L, 2L, 63L, 28L, 84L,
                               NA, 89L, NA, 91L), 
                      var3 = c("O", "X", "H", "S", "T",
                               "E", "U", "L", "I", "B"), 
                      var4 = c("t", "v", "u", "p", "s",
                               "m", "k", "f", "e", "g")), 
                 row.names = c(NA, -10L), 
                 class = c("tbl_df", "tbl", "data.frame"))

df3_candidate <- left_join(df1, df2, by = "id") #notice .x and .y suffixes!
df3_candidate <- df3_candidate %>% mutate(var1 = ifelse(is.na(var1.x), var1.y, var1.x),
                                          var2 = ifelse(is.na(var2.x), var2.y, var2.x)) %>% 
  select(id, group = group.x, var1, var2, var3, var4)
df3_candidate
#> # A tibble: 10 x 6
#>       id group  var1  var2 var3  var4 
#>    <int> <chr> <int> <int> <chr> <chr>
#>  1     1 a         2    18 O     t    
#>  2     2 a         8    15 X     v    
#>  3     3 a        46     2 H     u    
#>  4     4 b        14    63 S     p    
#>  5     5 b        92    28 T     s    
#>  6     6 c        58    84 E     m    
#>  7     7 c        NA    NA U     k    
#>  8     8 c        23    89 L     f    
#>  9     9 d        NA    NA I     e    
#> 10    10 d        30    91 B     g
df3
#> # A tibble: 10 x 6
#>       id group  var1  var2 var3  var4 
#>    <int> <chr> <int> <int> <chr> <chr>
#>  1     1 a         2    18 O     t    
#>  2     2 a         8    15 X     v    
#>  3     3 a        46     2 H     u    
#>  4     4 b        14    63 S     p    
#>  5     5 b        92    28 T     s    
#>  6     6 c        58    84 E     m    
#>  7     7 c        NA    NA U     k    
#>  8     8 c        23    89 L     f    
#>  9     9 d        NA    NA I     e    
#> 10    10 d        30    91 B     g

Created on 2019-09-12 by the reprex package (v0.2.1)

1 Like

Hi @carolineL. I would like to suggest a script that not using join for your reference. The coalesce can compare vectors and return the first non NA value.

library(dplyr)

df1 <- structure(list(id = 1:10, 
                      group = c("a", "a", "a", "b", "b", 
                                "c", "c", "c", "d", "d"), 
                      var1 = c(NA, NA, NA, NA, NA, 58L, 
                               NA, 23L, NA, 30L), 
                      var2 = c(NA, NA, NA, NA, NA, 84L,
                               NA, 89L, NA, 91L), 
                      var3 = c("O", "X", "H", "S", "T",
                               "E", "U", "L", "I", "B"), 
                      var4 = c("t", "v", "u", "p", "s",
                               "m", "k", "f", "e", "g")), 
                 row.names = c(NA, -10L), 
                 class = c("tbl_df", "tbl", "data.frame"))

df2 <- structure(list(id = 1:5, 
                      group = c("a", "a", "a", "b", "b"), 
                      var1 = c(2L, 8L, 46L, 14L, 92L), 
                      var2 = c(18L, 15L, 2L, 63L, 28L)), 
                 row.names = c(NA, -5L), 
                 class = c("tbl_df", "tbl", "data.frame"))

df3 <- structure(list(id = 1:10, 
                      group = c("a", "a", "a", "b", "b", 
                                "c", "c", "c", "d", "d"), 
                      var1 = c(2L, 8L, 46L, 14L, 92L, 58L, 
                               NA, 23L, NA, 30L), 
                      var2 = c(18L, 15L, 2L, 63L, 28L, 84L,
                               NA, 89L, NA, 91L), 
                      var3 = c("O", "X", "H", "S", "T",
                               "E", "U", "L", "I", "B"), 
                      var4 = c("t", "v", "u", "p", "s",
                               "m", "k", "f", "e", "g")), 
                 row.names = c(NA, -10L), 
                 class = c("tbl_df", "tbl", "data.frame"))

union_all(df2, df1) %>%
  group_by(id, group) %>%
  summarise_all(~{coalesce(.x[1], .x[2])})
#> # A tibble: 10 x 6
#> # Groups:   id [?]
#>       id group  var1  var2 var3  var4 
#>    <int> <chr> <int> <int> <chr> <chr>
#>  1     1 a         2    18 O     t    
#>  2     2 a         8    15 X     v    
#>  3     3 a        46     2 H     u    
#>  4     4 b        14    63 S     p    
#>  5     5 b        92    28 T     s    
#>  6     6 c        58    84 E     m    
#>  7     7 c        NA    NA U     k    
#>  8     8 c        23    89 L     f    
#>  9     9 d        NA    NA I     e    
#> 10    10 d        30    91 B     g

Created on 2019-09-13 by the reprex package (v0.3.0)

2 Likes

Thank you for you suggestion - FJCC. I considered this. While I only included 2 variables to the keep the reprex simple - my actual variable group is 10 and so it can become a little unruly/redundant etc.