dplyr : performing a union of matches within 2 columns using group_split

yeshipants · May 1, 2019, 12:46am

I need my dataframe put into items of a list so each potential person is in a 'bucket', based on 2 columns ID and ID2_value . Then I'll perform user validation manually to confirm same person or not.

multiple rows with the same ID2_value is potentially the same person - they need to be put together. multiple rows with the same ID is definitely the same person.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

dat <- structure(list(fname_c1 = structure(c(5L, 3L, 4L, 1L, 4L, 5L, 2L, 2L),
                                           .Label = c("BOB", "GARY", "HANS", "PODRICK", "STEFAN"),
                                           class = "factor"),
                      ID = structure(c(1L, 4L, 2L, 5L, 2L, 1L, 2L, 3L),
                                     .Label = c("100", "104", "110", "205", "600"),
                                     class = "factor"),
                      ID2 = structure(c(1L, 1L, 2L, 2L, 1L, 2L, 3L, 3L),
                                      .Label = c("MEDICAL#1", "MEDICAL#2", "MEDICAL#4"),
                                      class = "factor"),
                      ID2_value = structure(c(4L, 4L, 3L, 2L, 5L, 1L, 6L, 6L),
                                            .Label = c("1001", "2005", "5003", "60", "89", "AAB"),
                                            class = "factor"),
                      by = c(1957L, 1957L, 1933L, 1960L, 1933L, 1957L, 1933L, 1933L),
                      bm = c(9L, 6L, 6L, 9L, 9L, 6L, 6L, 6L),
                      bd = c(2L, 7L, 7L, 2L, 2L, 1L, 1L,1L)
)
)

dat <- as_tibble(dat)



dat %>% 
  group_split(ID, ID2_value) 
#> [[1]]
#> # A tibble: 1 x 7
#>   fname_c1 ID    ID2       ID2_value    by    bm    bd
#>   <fct>    <fct> <fct>     <fct>     <int> <int> <int>
#> 1 STEFAN   100   MEDICAL#2 1001       1957     6     1
#> 
#> [[2]]
#> # A tibble: 1 x 7
#>   fname_c1 ID    ID2       ID2_value    by    bm    bd
#>   <fct>    <fct> <fct>     <fct>     <int> <int> <int>
#> 1 STEFAN   100   MEDICAL#1 60         1957     9     2
#> 
#> [[3]]
#> # A tibble: 1 x 7
#>   fname_c1 ID    ID2       ID2_value    by    bm    bd
#>   <fct>    <fct> <fct>     <fct>     <int> <int> <int>
#> 1 PODRICK  104   MEDICAL#2 5003       1933     6     7
#> 
#> [[4]]
#> # A tibble: 1 x 7
#>   fname_c1 ID    ID2       ID2_value    by    bm    bd
#>   <fct>    <fct> <fct>     <fct>     <int> <int> <int>
#> 1 PODRICK  104   MEDICAL#1 89         1933     9     2
#> 
#> [[5]]
#> # A tibble: 1 x 7
#>   fname_c1 ID    ID2       ID2_value    by    bm    bd
#>   <fct>    <fct> <fct>     <fct>     <int> <int> <int>
#> 1 GARY     104   MEDICAL#4 AAB        1933     6     1
#> 
#> [[6]]
#> # A tibble: 1 x 7
#>   fname_c1 ID    ID2       ID2_value    by    bm    bd
#>   <fct>    <fct> <fct>     <fct>     <int> <int> <int>
#> 1 GARY     110   MEDICAL#4 AAB        1933     6     1
#> 
#> [[7]]
#> # A tibble: 1 x 7
#>   fname_c1 ID    ID2       ID2_value    by    bm    bd
#>   <fct>    <fct> <fct>     <fct>     <int> <int> <int>
#> 1 HANS     205   MEDICAL#1 60         1957     6     7
#> 
#> [[8]]
#> # A tibble: 1 x 7
#>   fname_c1 ID    ID2       ID2_value    by    bm    bd
#>   <fct>    <fct> <fct>     <fct>     <int> <int> <int>
#> 1 BOB      600   MEDICAL#2 2005       1960     9     2

However, the output I'm looking for is this :

fname_c1   ID         ID2    ID2_value   by    bm    bd 
  <fct>   <fct>      <fct>    <fct>     <int> <int> <int>  
1 STEFAN   100       MEDICAL#1  60        1957     9     2   
2 HANS     205       MEDICAL#1  60        1957     6     7   
3 STEFAN   100       MEDICAL#2  1001      1957     6     1   

fname_c1   ID         ID2    ID2_value   by    bm    bd   
4 BOB      600       MEDICAL#2  2005      1960     9     2   

fname_c1   ID         ID2    ID2_value   by    bm    bd   
5 PODRICK  104       MEDICAL#1  89        1933     9     2  
6 PODRICK  104       MEDICAL#2  5003      1933     6     7  
7 GARY     104       MEDICAL#4  AAB       1933     6     1  
8 GARY     110       MEDICAL#4  AAB       1933     6     1

andresrcs · May 1, 2019, 7:31pm

This produces the desired output, but it's not an optimal solution, hopefully would clarify the logic so other people could give you a better solution.

library(tidyverse)
library(fuzzyjoin)
dat <- structure(list(fname_c1 = structure(c(5L, 3L, 4L, 1L, 4L, 5L, 2L, 2L),
                                           .Label = c("BOB", "GARY", "HANS", "PODRICK", "STEFAN"),
                                           class = "factor"),
                      ID = structure(c(1L, 4L, 2L, 5L, 2L, 1L, 2L, 3L),
                                     .Label = c("100", "104", "110", "205", "600"),
                                     class = "factor"),
                      ID2 = structure(c(1L, 1L, 2L, 2L, 1L, 2L, 3L, 3L),
                                      .Label = c("MEDICAL#1", "MEDICAL#2", "MEDICAL#4"),
                                      class = "factor"),
                      ID2_value = structure(c(4L, 4L, 3L, 2L, 5L, 1L, 6L, 6L),
                                            .Label = c("1001", "2005", "5003", "60", "89", "AAB"),
                                            class = "factor"),
                      by = c(1957L, 1957L, 1933L, 1960L, 1933L, 1957L, 1933L, 1933L),
                      bm = c(9L, 6L, 6L, 9L, 9L, 6L, 6L, 6L),
                      bd = c(2L, 7L, 7L, 2L, 2L, 1L, 1L,1L)
)
)

dat <- as_tibble(dat)

ids <- dat %>% 
    group_by(ID2_value) %>% 
    summarise(id_regex = paste0(ID, collapse = "|"), n = n()) %>% 
    filter(n > 1) %>% 
    select(id_regex) %>% 
    mutate(group = row_number())

dat %>% 
    regex_left_join(ids, by = c(ID = "id_regex")) %>% 
    arrange(group) %>% 
    mutate(group = if_else(is.na(group), row_number(), group)) %>% 
    select(-id_regex) %>%
    group_split(group)
#> [[1]]
#> # A tibble: 3 x 8
#>   fname_c1 ID    ID2       ID2_value    by    bm    bd group
#>   <fct>    <fct> <fct>     <fct>     <int> <int> <int> <int>
#> 1 STEFAN   100   MEDICAL#1 60         1957     9     2     1
#> 2 HANS     205   MEDICAL#1 60         1957     6     7     1
#> 3 STEFAN   100   MEDICAL#2 1001       1957     6     1     1
#> 
#> [[2]]
#> # A tibble: 4 x 8
#>   fname_c1 ID    ID2       ID2_value    by    bm    bd group
#>   <fct>    <fct> <fct>     <fct>     <int> <int> <int> <int>
#> 1 PODRICK  104   MEDICAL#2 5003       1933     6     7     2
#> 2 PODRICK  104   MEDICAL#1 89         1933     9     2     2
#> 3 GARY     104   MEDICAL#4 AAB        1933     6     1     2
#> 4 GARY     110   MEDICAL#4 AAB        1933     6     1     2
#> 
#> [[3]]
#> # A tibble: 1 x 8
#>   fname_c1 ID    ID2       ID2_value    by    bm    bd group
#>   <fct>    <fct> <fct>     <fct>     <int> <int> <int> <int>
#> 1 BOB      600   MEDICAL#2 2005       1960     9     2     8

yeshipants · May 2, 2019, 7:44am

Thanks! HANS and STEFAN are in the same group because they have the same ID of 100.

yeshipants · May 2, 2019, 10:57am

Oh sorry, I meant HANS and STEFAN have the same ID2_value of 60, that's why they are matched.
I basically need all potential same-person records together.
And they are considered potentially same if they have the same ID and ID2_value.
So it's like a union of matching.
Match ID Union Match ID2_value

andresrcs · May 2, 2019, 12:46pm

Maybe the confusion comes from the use of "and" in your statement, I think "Or" it's more appropriate.

they have the same ID or ID2_value

Yarnabrina · May 2, 2019, 12:56pm

I completely agree with this comment. I completely misunderstood the problem. I've deleted my earlier answers because those are irrelevant after this information.

Based on this SO answer, a way to do what you want is as follows:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(igraph)
#> 
#> Attaching package: 'igraph'
#> The following objects are masked from 'package:dplyr':
#> 
#>     as_data_frame, groups, union
#> The following objects are masked from 'package:stats':
#> 
#>     decompose, spectrum
#> The following object is masked from 'package:base':
#> 
#>     union

dat <- tibble(fname_c1 = factor(x = c(5L, 3L, 4L, 1L, 4L, 5L, 2L, 2L),
                                labels = c("BOB", "GARY", "HANS", "PODRICK", "STEFAN")),
              ID = factor(x = c(1L, 4L, 2L, 5L, 2L, 1L, 2L, 3L),
                          labels = c("100", "104", "110", "205", "600")),
              ID2 = factor(x = c(1L, 1L, 2L, 2L, 1L, 2L, 3L, 3L),
                           labels = c("MEDICAL#1", "MEDICAL#2", "MEDICAL#4")),
              ID2_value = factor(x = c(4L, 4L, 3L, 2L, 5L, 1L, 6L, 6L),
                                 labels = c("1001", "2005", "5003", "60", "89", "AAB")),
              by = c(1957L, 1957L, 1933L, 1960L, 1933L, 1957L, 1933L, 1933L),
              bm = c(9L, 6L, 6L, 9L, 9L, 6L, 6L, 6L),
              bd = c(2L, 7L, 7L, 2L, 2L, 1L, 1L,1L))

indices <- clusters(graph = graph_from_data_frame(d = data.frame(dat$ID, dat$ID2_value)))$membership

temp1 <- data.frame(ID = names(x = indices),
                    temp2 = indices) %>%
  filter(ID %in% dat$ID) %>%
  mutate(ID = droplevels(x = ID))

dat %>%
  left_join(temp1) %>%
  group_split(temp2, keep = FALSE)
#> Joining, by = "ID"
#> [[1]]
#> # A tibble: 3 x 7
#>   fname_c1 ID    ID2       ID2_value    by    bm    bd
#>   <fct>    <fct> <fct>     <fct>     <int> <int> <int>
#> 1 STEFAN   100   MEDICAL#1 60         1957     9     2
#> 2 HANS     205   MEDICAL#1 60         1957     6     7
#> 3 STEFAN   100   MEDICAL#2 1001       1957     6     1
#> 
#> [[2]]
#> # A tibble: 4 x 7
#>   fname_c1 ID    ID2       ID2_value    by    bm    bd
#>   <fct>    <fct> <fct>     <fct>     <int> <int> <int>
#> 1 PODRICK  104   MEDICAL#2 5003       1933     6     7
#> 2 PODRICK  104   MEDICAL#1 89         1933     9     2
#> 3 GARY     104   MEDICAL#4 AAB        1933     6     1
#> 4 GARY     110   MEDICAL#4 AAB        1933     6     1
#> 
#> [[3]]
#> # A tibble: 1 x 7
#>   fname_c1 ID    ID2       ID2_value    by    bm    bd
#>   <fct>    <fct> <fct>     <fct>     <int> <int> <int>
#> 1 BOB      600   MEDICAL#2 2005       1960     9     2

yeshipants · May 15, 2019, 12:18am

Sorry, second language

system · June 5, 2019, 12:18am

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.