mutate data frame to check if data duplicate between two dataframe

I have two data frame, this is just a sample , database have approx 1 million of records.
can have name, email, alphanumeric code etc.

data1<-data.frame(
  'ID 1' = c(86364,"ARV_2612","AGH_2212","IND_2622","CHG_2622"),
  sector = c(3,3,1,2,5),
  name=c("nhug","hugy","mjuk","ghtr","kuld"),
  'Enternal code'=c(1,1,1,1,3),
  col3=c(1,1,0,0,0),
  col4=c(1,0,0,0,0),
  col5=c(1,0,1,1,1)
)

data2<-data.frame(
  'ID 1' = c(53265,"ARV_7362",76354,"IND_2622","CHG_9762"),
  sector = c(3,3,1,2,5),
  name=c("nhug","hugy","mjuk","ghtr","kuld"),
  'Enternal code'=c(1,1,1,1,3),
  col3=c(1,1,0,0,0),
  col4=c(1,0,0,0,0),
  col5=c(1,0,1,1,1)
)
data2 %>% mutate(
  duplicated = factor(if_else(`ID 1` %in% 
                                pull(data1, `ID 1`),
                              1, 
                              0)))

new to r, now i am looking for a function to mutate my one data frame (data2) like. if I give column names of data1 and data2 to find if the values or string already exist in other data and mutate a new column to 1,0 for true and false.
the function would be like

func(data1 = "name",data2="name",mutated_com="name_exist")

the mutated data frame would be like

External.ID sector col1 Enternal.code col3 col4 col5 duplicate
53265 3 1 1 1 1 1 0
ARV_7362 3 1 1 1 0 0 0
76354 1 0 1 0 0 1 0
IND_2622 2 0 1 0 0 1 1
CHG_9762 5 0 3 0 0 1 1

How's this?

library(dplyr)

data1 <- tibble(
  "ID 1" = c(86364, "ARV_2612", "AGH_2212", "IND_2622", "CHG_2622"),
  sector = c(3, 3, 1, 2, 5),
  name = c("nhug", "hugy", "mjuk", "ghtr", "kuld"),
  "Enternal code" = c(1, 1, 1, 1, 3),
  col3 = c(1, 1, 0, 0, 0),
  col4 = c(1, 0, 0, 0, 0),
  col5 = c(1, 0, 1, 1, 1)
)

data2 <- tibble(
  "ID 1" = c(86364, "ARV_7362", 76354, "IND_2622", "CHG_9762"),
  sector = c(3, 3, 1, 2, 5),
  name = c("nhug", "hugy", "mjuk", "ghtr", "kulg"), # edited to help with testing
  "Enternal code" = c(1, 1, 1, 1, 3),
  col3 = c(1, 1, 0, 0, 0),
  col4 = c(1, 0, 0, 0, 0),
  col5 = c(1, 0, 1, 1, 1)
)

check_duplicated <- function(df1, df2, check_var) {
  check_against <- df1 %>%
    dplyr::pull(check_var)

  dplyr::if_else(         # base ifelse is fine but if_else is supposed to be quicker
    dplyr::pull(dplyr::rowwise(df2), check_var) %in% check_against,
    1, 0
  )
}

check_vars <- c("ID 1", "name")
data2 %>%
  select(1:3) %>%         # just to help final output display fully
  dplyr::mutate(across(
    all_of(check_vars),
    ~ check_duplicated(
      df1 = data1,
      df2 = data2,
      check_var = cur_column()
    ),
    .names = "{.col}_duplicated"
  ))
#> # A tibble: 5 x 5
#>   `ID 1`   sector name  `ID 1_duplicated` name_duplicated
#>   <chr>     <dbl> <chr>             <dbl>           <dbl>
#> 1 86364         3 nhug                  1               1
#> 2 ARV_7362      3 hugy                  0               1
#> 3 76354         1 mjuk                  0               1
#> 4 IND_2622      2 ghtr                  1               1
#> 5 CHG_9762      5 kulg                  0               0

Created on 2020-09-24 by the reprex package (v0.3.0)

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.