Determine overlap between two dataframes based on identifier and value

Hello,

I want to compare df_a against df_b and determine the overlap. The name identifies the product while the value determines the number you have. Summing the values add to 75 each. I want to calculate the overlap between df_a and df_b but it needs to be weighted as even though both sets contain e the value might be 2 vs 3 (which means that overlap is not 100% identical). Where both sets contain say f and the value is in both cases 4 it means that the overlap there is identical.

Is there a quick way to measure overlap or a package or something that can help? Do note that both sets do not contain all the elements nor are of the same length.


df_a <-
  data.frame(
  stringsAsFactors = FALSE,
                  name = c("a","b","c","d","e",
                         "f","g","h","i","j","k","l","m","n","o","p",
                         "q","r","s","t","u","v","w","x","y","z","aa",
                         "bb","cc"),
                  value = c(1,4,4,4,1,1,1,
                         1,1,2,4,1,3,2,3,3,4,2,2,4,4,
                         3,4,4,1,3,3,2,3)
  )

  
df_b <-
data.frame(
  stringsAsFactors = FALSE,
                name = c("e","f","g","h","i","j",
                       "k","l","m","n","o","p","q","r","s","t","u","v",
                       "w","x","y","z","aa","bb","cc","dd","ee"),
                value = c(1,4,4,4,1,4,1,4,
                       1,2,3,1,3,2,3,4,1,2,2,3,4,3,
                       4,4,4,3,3)
)



sum(df_a$value)
#> [1] 75
sum(df_b$value)
#> [1] 75

Created on 2021-04-24 by the reprex package (v0.3.0)

Can you say more about exactly how the overlap is calculated? It's not clear.

Hi :slight_smile:

I found a solution with vecsets to sort it.

library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'dplyr' was built under R version 4.0.5
library(vecsets)
#> Warning: package 'vecsets' was built under R version 4.0.5

df_a <-
  data.frame(
    stringsAsFactors = FALSE,
    name = c("a","b","c","d","e",
             "f","g","h","i","j","k","l","m","n","o","p",
             "q","r","s","t","u","v","w","x","y","z","aa",
             "bb","cc"),
    value = c(1,4,4,4,1,1,1,
              1,1,2,4,1,3,2,3,3,4,2,2,4,4,
              3,4,4,1,3,3,2,3)
  )


df_b <-
  data.frame(
    stringsAsFactors = FALSE,
    name = c("e","f","g","h","i","j",
             "k","l","m","n","o","p","q","r","s","t","u","v",
             "w","x","y","z","aa","bb","cc","dd","ee"),
    value = c(1,4,4,4,1,4,1,4,
              1,2,3,1,3,2,3,4,1,2,2,3,4,3,
              4,4,4,3,3)
  )



sum(df_a$value)
#> [1] 75
sum(df_b$value)
#> [1] 75

set_a <- rep(df_a$name,df_a$value)
set_a
#>  [1] "a"  "b"  "b"  "b"  "b"  "c"  "c"  "c"  "c"  "d"  "d"  "d"  "d"  "e"  "f" 
#> [16] "g"  "h"  "i"  "j"  "j"  "k"  "k"  "k"  "k"  "l"  "m"  "m"  "m"  "n"  "n" 
#> [31] "o"  "o"  "o"  "p"  "p"  "p"  "q"  "q"  "q"  "q"  "r"  "r"  "s"  "s"  "t" 
#> [46] "t"  "t"  "t"  "u"  "u"  "u"  "u"  "v"  "v"  "v"  "w"  "w"  "w"  "w"  "x" 
#> [61] "x"  "x"  "x"  "y"  "z"  "z"  "z"  "aa" "aa" "aa" "bb" "bb" "cc" "cc" "cc"
set_b <- rep(df_b$name,df_b$value)
set_b
#>  [1] "e"  "f"  "f"  "f"  "f"  "g"  "g"  "g"  "g"  "h"  "h"  "h"  "h"  "i"  "j" 
#> [16] "j"  "j"  "j"  "k"  "l"  "l"  "l"  "l"  "m"  "n"  "n"  "o"  "o"  "o"  "p" 
#> [31] "q"  "q"  "q"  "r"  "r"  "s"  "s"  "s"  "t"  "t"  "t"  "t"  "u"  "v"  "v" 
#> [46] "w"  "w"  "x"  "x"  "x"  "y"  "y"  "y"  "y"  "z"  "z"  "z"  "aa" "aa" "aa"
#> [61] "aa" "bb" "bb" "bb" "bb" "cc" "cc" "cc" "cc" "dd" "dd" "dd" "ee" "ee" "ee"

a_diff <- 
  vsetdiff(set_a,set_b, multiple = TRUE) %>% length()

b_diff <-
  vsetdiff(set_b,set_a, multiple = TRUE) %>% length()

total_diff <- a_diff + b_diff

total_agreement_unduplicated <-
  vintersect(set_a,set_b, multiple = TRUE)  %>% length()
total_agreement_unduplicated
#> [1] 47

similarity <- (length(set_a) + length(set_b) - total_diff)/ (length(set_a) + length(set_b)) * 100
similarity
#> [1] 62.66667

Created on 2021-04-24 by the reprex package (v0.3.0)

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.