How can I join two tables with an OR statement in R using dplyr's join functions?

E.g. Join df1 on df2 with the key:
df1_ColumnA == df2_ColumnA OR df1_ColumnA == df2_ColumnB?

library(dplyr)

df1 = data.frame(V1 = c('A', 'B', 'C','D'), v2 = c(1,2,3,4))
df2 = data.frame(VA = c('A', 'B', 'C'), VB = c('D', 'E', 'F'), vC = c(4,5,6))

## Join on column 1
df3 = df1 %>% left_join(df2, by = c('V1' = 'VA'))

## join on column 1 AND column 2
df4 = df1 %>% left_join(df2, by = c('V1' = 'VA', 'V1' = 'VB'))

This is not working:

## join on column 1 OR column 2
df4 = df1 %>% left_join(df2, by = c('V1' = 'VA' | 'V1' = 'VB'))

Edit: expected output

   V1 v2 VA VB vC
1  A  1  A  D  4
2  B  2  B  E  5
3  C  3  C  F  6
4  D  4  A  D  4

Note: This question has been asked on SO here by Koot6133

This is a little hacky but you can use fuzzyjoin::regex_left_join()

library(dplyr)
library(fuzzyjoin)

df1 = data.frame(V1 = c('A', 'B', 'C','D'), v2 = c(1,2,3,4))
df2 = data.frame(VA = c('A', 'B', 'C'), VB = c('D', 'E', 'F'), vC = c(4,5,6))

df1 %>%  
    regex_left_join(df2 %>%
                         rowwise() %>% 
                         mutate(key = paste(VA, VB, sep = "|")),
                     by = c(V1 = "key")) %>% 
    select(-key) %>% 
    arrange(V1)
#>   V1 v2 VA VB vC
#> 1  A  1  A  D  4
#> 2  B  2  B  E  5
#> 3  C  3  C  F  6
#> 4  D  4  A  D  4

Created on 2019-08-16 by the reprex package (v0.3.0.9000)

3 Likes

This answers the question as asked, but might have a problem generalizing to other situations:

library(dplyr)
library(tidyr)
library(purrr)

df1 = data.frame(V1 = c('A', 'B', 'C', 'D'), v2 = c(1,2,3,4))
df2 = data.frame(VA = c('A', 'B', 'C'), VB = c('D', 'E', 'F'), vC = c(4,5,6))

joined_df <- df1 %>%
  nest_join(df2, by = c("V1" = "VA"), keep = TRUE, name = "first_join_condition") %>%
  nest_join(df2, by = c("V1" = "VB"), keep = TRUE, name = "second_join_condition") %>%
  mutate(ored_join_condition = map2(first_join_condition, second_join_condition, function(x, y) if(nrow(x) == 0) y else x)) %>%
  select(-first_join_condition, -second_join_condition) %>%
  unnest(ored_join_condition) %>%
  full_join(df1, by = c("V1", "v2")) # add back any rows from df1 which didn't match on either condition

I'm not entirely sure, but it feels like this will cause problems if there are multiple lines in df2 which match on both conditions (they would appear duplicated in the output). This will work as a starting point, though!

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.