search and categorize

PalomaLlorente · December 6, 2019, 2:40pm

I need to find info from a data.frame into another data.frame. The idea is to see if the info is included in both data.frames and if not detect that. To expose my issue I have created a short reprex:

segment_name<-data.frame(stringsAsFactors=FALSE,
Segment_1 = c("XILTI_ARTAT", "XILTI_ARTAT", "XILTI_ARTAT", "XILTI_ARTAT",
"XILTI_ARTAT", "XILTI_ARTAT", "XILTI_ARTAT", NA, NA),
Segment_2 = c("ZAG_VBA", "ZMR_ORBIS", "ZWN_PIREK", "VERCE_MLP.A",
"VERED_BMN.A", "VERED_WSN", "VEREV_GIMEX", NA, NA)
)
conflict<- data.frame(stringsAsFactors=FALSE,
Conflict_1 = c("XILTI_ARTAT", "VEREV_PITAL", "XILTI_ARTAT", "IFDPS_DFTGS",
"ASGSH_SADCU", "XILTI_ARTAT", "ASDCI_ASDED", "FOFNV_VSDFO",
"VERCE_MLP.A"),
Conflict_2 = c("VEREV_GIMEX", "VKWES_ACOFS", "ZAG_VBA", "SCVVR_VLOCS",
"BFDND_VLOSN", "VERED_BMN.A", "QFDHS_DCVSO", "SDVSJ_SDCVS",
"XILTI_ARTAT")
)
result<- data.frame(stringsAsFactors=FALSE,
Conflict_1 = c("XILTI_ARTAT", "VEREV_PITAL", "XILTI_ARTAT",
"IFDPS_DFTGS", "ASGSH_SADCU", "XILTI_ARTAT",
"ASDCI_ASDED", "FOFNV_VSDFO", "VERCE_MLP.A"),
Conflict_2 = c("VEREV_GIMEX", "VKWES_ACOFS", "ZAG_VBA", "SCVVR_VLOCS",
"BFDND_VLOSN", "VERED_BMN.A", "QFDHS_DCVSO",
"SDVSJ_SDCVS", "XILTI_ARTAT"),
CATEGORIZATION = c("FOUND", "NOT FOUND", "FOUND", "NOT FOUND", "NOT FOUND",
"FOUND", "NOT FOUND", "NOT FOUND", "FOUND")
)

Info from the first two columns of both data.frames are ligated and should be analyzed together, that is to say:
Segment_1+Segment_2 = XILTI_ARTAT + ZAG_VBA
This combination is the one that should be searched into the data.frame "conflict", whose two columns are also ligated: Conflict_1+Conflict_2.
Important to note that combinations should be searched also in an opposite way, that is to say for the example above: Segment_1+Segment_2 = XILTI_ARTAT + ZAG_VBA is the same combination that Segment_2+Segment_1 = ZAG_VBA + XILTI_ARTAT and therefore, both possibilities should be searched in the data.frame "conflict".
The solution is presented in the 3rd data.frame, "result". There, combination Segment1+Segment2 was searched in Conflict1+Conflict2. As can be seen in the column "CATEGORIZATION", combinations were identified as "Found" and "Not Found". Finally, "Founds" were identified in another column, also showed in the reprex.

FJCC · December 6, 2019, 3:26pm

I did not consider the case of an entry being found more than once.

segment_name<-data.frame(stringsAsFactors=FALSE,
                         Segment_1 = c("XILTI_ARTAT", "XILTI_ARTAT", "XILTI_ARTAT", "XILTI_ARTAT",
                                       "XILTI_ARTAT", "XILTI_ARTAT", "XILTI_ARTAT", NA, NA),
                         Segment_2 = c("ZAG_VBA", "ZMR_ORBIS", "ZWN_PIREK", "VERCE_MLP.A",
                                       "VERED_BMN.A", "VERED_WSN", "VEREV_GIMEX", NA, NA)
)
conflict<- data.frame(stringsAsFactors=FALSE,
                      Conflict_1 = c("XILTI_ARTAT", "VEREV_PITAL", "XILTI_ARTAT", "IFDPS_DFTGS",
                                     "ASGSH_SADCU", "XILTI_ARTAT", "ASDCI_ASDED", "FOFNV_VSDFO",
                                     "VERCE_MLP.A"),
                      Conflict_2 = c("VEREV_GIMEX", "VKWES_ACOFS", "ZAG_VBA", "SCVVR_VLOCS",
                                     "BFDND_VLOSN", "VERED_BMN.A", "QFDHS_DCVSO", "SDVSJ_SDCVS",
                                     "XILTI_ARTAT")
)
result<- data.frame(stringsAsFactors=FALSE,
                    Conflict_1 = c("XILTI_ARTAT", "VEREV_PITAL", "XILTI_ARTAT",
                                   "IFDPS_DFTGS", "ASGSH_SADCU", "XILTI_ARTAT",
                                   "ASDCI_ASDED", "FOFNV_VSDFO", "VERCE_MLP.A"),
                    Conflict_2 = c("VEREV_GIMEX", "VKWES_ACOFS", "ZAG_VBA", "SCVVR_VLOCS",
                                   "BFDND_VLOSN", "VERED_BMN.A", "QFDHS_DCVSO",
                                   "SDVSJ_SDCVS", "XILTI_ARTAT"),
                    CATEGORIZATION = c("FOUND", "NOT FOUND", "FOUND", "NOT FOUND", "NOT FOUND",
                                       "FOUND", "NOT FOUND", "NOT FOUND", "FOUND")
)
Found <- vector("numeric", length = nrow(segment_name))
for (i in 1:nrow(segment_name)){
  Result <- which((segment_name[2, 1] == conflict[,1] & segment_name[i, 2] == conflict[,2]) |
    (segment_name[i, 1] == conflict[,2] & segment_name[i, 2] == conflict[,1]))
  if (length(Result) > 0) Found[i] <- Result
}
Found <- Found[Found > 0]
conflict$CATEGORIZATION <- "NOT FOUND"
conflict[Found, "CATEGORIZATION"] <- "FOUND"
conflict
#>    Conflict_1  Conflict_2 CATEGORIZATION
#> 1 XILTI_ARTAT VEREV_GIMEX          FOUND
#> 2 VEREV_PITAL VKWES_ACOFS      NOT FOUND
#> 3 XILTI_ARTAT     ZAG_VBA          FOUND
#> 4 IFDPS_DFTGS SCVVR_VLOCS      NOT FOUND
#> 5 ASGSH_SADCU BFDND_VLOSN      NOT FOUND
#> 6 XILTI_ARTAT VERED_BMN.A          FOUND
#> 7 ASDCI_ASDED QFDHS_DCVSO      NOT FOUND
#> 8 FOFNV_VSDFO SDVSJ_SDCVS      NOT FOUND
#> 9 VERCE_MLP.A XILTI_ARTAT          FOUND
result
#>    Conflict_1  Conflict_2 CATEGORIZATION
#> 1 XILTI_ARTAT VEREV_GIMEX          FOUND
#> 2 VEREV_PITAL VKWES_ACOFS      NOT FOUND
#> 3 XILTI_ARTAT     ZAG_VBA          FOUND
#> 4 IFDPS_DFTGS SCVVR_VLOCS      NOT FOUND
#> 5 ASGSH_SADCU BFDND_VLOSN      NOT FOUND
#> 6 XILTI_ARTAT VERED_BMN.A          FOUND
#> 7 ASDCI_ASDED QFDHS_DCVSO      NOT FOUND
#> 8 FOFNV_VSDFO SDVSJ_SDCVS      NOT FOUND
#> 9 VERCE_MLP.A XILTI_ARTAT          FOUND

^{Created on 2019-12-06 by the reprex package (v0.3.0.9000)}

andresrcs · December 6, 2019, 4:15pm

Another approach

library(tidyverse)

segment_name <- data.frame(stringsAsFactors=FALSE,
                         Segment_1 = c("XILTI_ARTAT", "XILTI_ARTAT", "XILTI_ARTAT", "XILTI_ARTAT",
                                       "XILTI_ARTAT", "XILTI_ARTAT", "XILTI_ARTAT", NA, NA),
                         Segment_2 = c("ZAG_VBA", "ZMR_ORBIS", "ZWN_PIREK", "VERCE_MLP.A",
                                       "VERED_BMN.A", "VERED_WSN", "VEREV_GIMEX", NA, NA)
)
conflict <- data.frame(stringsAsFactors=FALSE,
                      Conflict_1 = c("XILTI_ARTAT", "VEREV_PITAL", "XILTI_ARTAT", "IFDPS_DFTGS",
                                     "ASGSH_SADCU", "XILTI_ARTAT", "ASDCI_ASDED", "FOFNV_VSDFO",
                                     "VERCE_MLP.A"),
                      Conflict_2 = c("VEREV_GIMEX", "VKWES_ACOFS", "ZAG_VBA", "SCVVR_VLOCS",
                                     "BFDND_VLOSN", "VERED_BMN.A", "QFDHS_DCVSO", "SDVSJ_SDCVS",
                                     "XILTI_ARTAT")
)
conflict %>% 
    mutate(Categorization = map(paste0(Conflict_1, Conflict_2, "|", Conflict_2, Conflict_1),
                                ~ if_else(any(str_detect(paste0(segment_name$Segment_1, segment_name$Segment_2), pattern = .x)),
                                          true = "FOUND", 
                                          false = "NOT FOUND")))
#>    Conflict_1  Conflict_2 Categorization
#> 1 XILTI_ARTAT VEREV_GIMEX          FOUND
#> 2 VEREV_PITAL VKWES_ACOFS      NOT FOUND
#> 3 XILTI_ARTAT     ZAG_VBA          FOUND
#> 4 IFDPS_DFTGS SCVVR_VLOCS      NOT FOUND
#> 5 ASGSH_SADCU BFDND_VLOSN      NOT FOUND
#> 6 XILTI_ARTAT VERED_BMN.A          FOUND
#> 7 ASDCI_ASDED QFDHS_DCVSO      NOT FOUND
#> 8 FOFNV_VSDFO SDVSJ_SDCVS      NOT FOUND
#> 9 VERCE_MLP.A XILTI_ARTAT          FOUND

PalomaLlorente · December 10, 2019, 6:29pm

Hi!
First of all, thank you for your help.
I have just tried the code you share with me and unfortunately, a message appears in the console: "Error in exists(dbname) : variable names are limited to 10000 bytes". I need to compare around 35000 rows, I don´t know if this message is about that; if so, could you think about another option? "FJCC" code apparently works but I would like to have another code just to compare solutions and be sure about it. Thanks in advance!

andresrcs · December 13, 2019, 1:37am

This error message is odd, I can't think of a reason for my code to produce it, could you provide a reproducible example for this issue?

PalomaLlorente · December 13, 2019, 3:43pm

I couldn´t properly generate a reprex, hope this works. When running @andresrcs code, the error that appears is: "Error in parse(text = dbname) : :1:7: unexpected '!'
1: maps::!", when running the real data.frames where I have about 6470 rows, another error appears ("Error in exists(dbname) : variable names are limited to 10000 bytes"). When running @FJCC code, it runs but shows a wrong answer as it can be seen in the example. Then apparently none of the codes work yet.
♀
segments<- data.frame(stringsAsFactors=FALSE,
Seg_1 = c("!DHqI_*PUT", "*WHA_RBV", "ARTAT_XILTI", "!BXhV_!BXhW",
"ARTAT_XILTI", "SIN_SUNUM", "LIVDO_ULKEM", "ARTAT_XILTI",
"ARTAT_XILTI"),
Seg_2 = c("!BznO_*PUT", "!DJxO_*WHA", "PIREK_ZWN", "!CiJD_!CiJE",
"RITAG_XILTI", "!CNit_ATNOP", "LIVDO_ULKEM",
"PODAN_XIMBA", "POLON_XIMBA")
)
conflicts<- data.frame(stringsAsFactors=FALSE,
Conf_1 = c("!DHqI_*PUT", "*WHA_RBV", "ARTAT_XILTI", "!BXhV_!BXhW",
"ARTAT_XILTI", "SIN_SUNUM", "LIVDO_ULKEM"),
Conf_2 = c("!BznO_*PUT", "!DJxO_*WHA", "PIREK_ZWN", "!CiJD_!CiJE",
"RITAG_XILTI", "!CNit_ATNOP", "LIVDO_ULKEM")
)

conflicts %>%
mutate(Categorization = map(paste0(Conf_1, Conf_2, "|", Conf_2, Conf_1),
~ if_else(any(str_detect(paste0(segments$Seg_1, segments$Seg_2), pattern = .x)),
true = "FOUND",
false = "NOT FOUND")))

Found <- vector("numeric", length = nrow(segments))
for (i in 1:nrow(segments)){
Result <- which((segments[2, 1] == conflicts[,1] & segments[i, 2] == conflicts[,2]) |
(segments[i, 1] == conflicts[,2] & segments[i, 2] == conflicts[,1]))
if (length(Result) > 0) Found[i] <- Result
}
Found <- Found[Found > 0]
conflicts$CATEGORIZATION <- "NOT FOUND"
conflicts[Found, "CATEGORIZATION"] <- "FOUND"
data.frame(stringsAsFactors=FALSE,
Conf_1 = c("!DHqI_*PUT", "*WHA_RBV", "ARTAT_XILTI", "!BXhV_!BXhW",
"ARTAT_XILTI", "SIN_SUNUM", "LIVDO_ULKEM"),
Conf_2 = c("!BznO_*PUT", "!DJxO_*WHA", "PIREK_ZWN", "!CiJD_!CiJE",
"RITAG_XILTI", "!CNit_ATNOP", "LIVDO_ULKEM"),
CATEGORIZATION = c("NOT FOUND", "FOUND", "NOT FOUND", "NOT FOUND",
"NOT FOUND", "NOT FOUND", "FOUND")
)

I would really appreciate if someone could help me with this. Thanks in advance

FJCC · December 13, 2019, 4:15pm

There was a typo in my code. Does this give you what you expect?

segments<- data.frame(stringsAsFactors=FALSE,
                      Seg_1 = c("!DHqI_*PUT", "*WHA_RBV", "ARTAT_XILTI", "!BXhV_!BXhW",
                                "ARTAT_XILTI", "SIN_SUNUM", "LIVDO_ULKEM", "ARTAT_XILTI",
                                "ARTAT_XILTI"),
                      Seg_2 = c("!BznO_*PUT", "!DJxO_*WHA", "PIREK_ZWN", "!CiJD_!CiJE",
                                "RITAG_XILTI", "!CNit_ATNOP", "LIVDO_ULKEM",
                                "PODAN_XIMBA", "POLON_XIMBA")
)
conflicts<- data.frame(stringsAsFactors=FALSE,
                       Conf_1 = c("!DHqI_*PUT", "*WHA_RBV", "ARTAT_XILTI", "!BXhV_!BXhW",
                                  "ARTAT_XILTI", "SIN_SUNUM", "LIVDO_ULKEM"),
                       Conf_2 = c("!BznO_*PUT", "!DJxO_*WHA", "PIREK_ZWN", "!CiJD_!CiJE",
                                  "RITAG_XILTI", "!CNit_ATNOP", "LIVDO_ULKEM")
)
Found <- vector("numeric", length = nrow(segments))
for (i in 1:nrow(segments)){
  Result <- which((segments[i, 1] == conflicts[,1] & segments[i, 2] == conflicts[,2]) |
                    (segments[i, 1] == conflicts[,2] & segments[i, 2] == conflicts[,1]))
  if (length(Result) > 0) Found[i] <- Result
}
Found <- Found[Found > 0]
conflicts$CATEGORIZATION <- "NOT FOUND"
conflicts[Found, "CATEGORIZATION"] <- "FOUND"
conflicts
#>        Conf_1      Conf_2 CATEGORIZATION
#> 1  !DHqI_*PUT  !BznO_*PUT          FOUND
#> 2    *WHA_RBV  !DJxO_*WHA          FOUND
#> 3 ARTAT_XILTI   PIREK_ZWN          FOUND
#> 4 !BXhV_!BXhW !CiJD_!CiJE          FOUND
#> 5 ARTAT_XILTI RITAG_XILTI          FOUND
#> 6   SIN_SUNUM !CNit_ATNOP          FOUND
#> 7 LIVDO_ULKEM LIVDO_ULKEM          FOUND

^{Created on 2019-12-13 by the reprex package (v0.3.0.9000)}

andresrcs · December 14, 2019, 2:58am

Ok, my code isn't working because you have a special character in your data (i.e.*), and that interferes with the regular expressions, but there is a way around this issue:

library(tidyverse)

segments<- data.frame(stringsAsFactors=FALSE,
                      Seg_1 = c("!DHqI_*PUT", "*WHA_RBV", "ARTAT_XILTI", "!BXhV_!BXhW",
                                "ARTAT_XILTI", "SIN_SUNUM", "LIVDO_ULKEM", "ARTAT_XILTI",
                                "ARTAT_XILTI"),
                      Seg_2 = c("!BznO_*PUT", "!DJxO_*WHA", "PIREK_ZWN", "!CiJD_!CiJE",
                                "RITAG_XILTI", "!CNit_ATNOP", "LIVDO_ULKEM",
                                "PODAN_XIMBA", "POLON_XIMBA")
)

conflicts<- data.frame(stringsAsFactors=FALSE,
                       Conf_1 = c("!DHqI_*PUT", "*WHA_RBV", "ARTAT_XILTI", "!BXhV_!BXhW",
                                  "ARTAT_XILTI", "SIN_SUNUM", "LIVDO_ULKEM"),
                       Conf_2 = c("!BznO_*PUT", "!DJxO_*WHA", "PIREK_ZWN", "!CiJD_!CiJE",
                                  "RITAG_XILTI", "!CNit_ATNOP", "LIVDO_ULKEM")
)

conflicts %>%
    mutate_at(vars(starts_with("Conf")), str_replace_all, pattern = "\\*", replacement = "\\\\*") %>% 
    mutate(Categorization = map(paste0(Conf_1, Conf_2, "|", Conf_2, Conf_1),
                                ~ if_else(any(str_detect(paste0(segments$Seg_1, segments$Seg_2), pattern = .x)),
                                          true = "FOUND",
                                          false = "NOT FOUND")))
#>         Conf_1       Conf_2 Categorization
#> 1 !DHqI_\\*PUT !BznO_\\*PUT          FOUND
#> 2   \\*WHA_RBV !DJxO_\\*WHA          FOUND
#> 3  ARTAT_XILTI    PIREK_ZWN          FOUND
#> 4  !BXhV_!BXhW  !CiJD_!CiJE          FOUND
#> 5  ARTAT_XILTI  RITAG_XILTI          FOUND
#> 6    SIN_SUNUM  !CNit_ATNOP          FOUND
#> 7  LIVDO_ULKEM  LIVDO_ULKEM          FOUND

PalomaLlorente · December 14, 2019, 11:58am

@andresrcs, I have just try your code and unfortunately another error message appeared :"Error in UseMethod("tbl_vars") : no applicable method for 'tbl_vars' applied to an object of class "quosures"
I don´t know what it is about, but in order to prevent other errors, I have split each column in separate data.frames to anylse them; that is to say:

Seg_1 <- data.frame(stringsAsFactors=FALSE,
Seg_1 = c("!DHqI_*PUT", "*WHA_RBV", "ARTAT_XILTI", "!BXhV_!BXhW",
"ARTAT_XILTI", "SIN_SUNUM", "LIVDO_ULKEM", "ARTAT_XILTI",
"ARTAT_XILTI"))
Seg_2 <- data.frame(stringsAsFactors=FALSE,
Seg_2 = c("!BznO_*PUT", "!DJxO_*WHA", "PIREK_ZWN", "!CiJD_!CiJE",
"RITAG_XILTI", "!CNit_ATNOP", "LIVDO_ULKEM",
"PODAN_XIMBA", "POLON_XIMBA"))
Conf_1 <- data.frame(stringsAsFactors=FALSE,
Conf_1 = c("!DHqI_*PUT", "*WHA_RBV", "ARTAT_XILTI", "!BXhV_!BXhW",
"ARTAT_XILTI", "SIN_SUNUM", "LIVDO_ULKEM"))
Conf_2 <- data.frame(stringsAsFactors=FALSE,
Conf_2 = c("!BznO_*PUT", "!DJxO_*WHA", "PIREK_ZWN", "!CiJD_!CiJE",
"RITAG_XILTI", "!CNit_ATNOP", "LIVDO_ULKEM"))

andresrcs · December 14, 2019, 2:14pm

It is weird that you are getting these error messages. If you run my code on a clean R session you should be getting the exact same result (that is the purpose of reprex package). Regardless of my proposed solution, it would be better if we could work on a similar environment so, could you post your session info after updating your packages? please follow these steps.

Restart your R session Ctrl+Shift+F10
Update your packages

update.packages(ask=FALSE, checkBuilt=TRUE)

Load tidyverse package and get session info so we can compare versions.

library(tidyverse)
sessionInfo()
#> R version 3.6.1 (2019-07-05)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 18.04.3 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
#> 
#> locale:
#>  [1] LC_CTYPE=es_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=es_US.UTF-8        LC_COLLATE=es_US.UTF-8    
#>  [5] LC_MONETARY=es_US.UTF-8    LC_MESSAGES=es_US.UTF-8   
#>  [7] LC_PAPER=es_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=es_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] forcats_0.4.0   stringr_1.4.0   dplyr_0.8.3     purrr_0.3.3    
#> [5] readr_1.3.1     tidyr_1.0.0     tibble_2.1.3    ggplot2_3.2.1  
#> [9] tidyverse_1.3.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] tidyselect_0.2.5  xfun_0.11         haven_2.2.0       lattice_0.20-38  
#>  [5] colorspace_1.4-1  vctrs_0.2.0       generics_0.0.2    htmltools_0.4.0  
#>  [9] yaml_2.2.0        rlang_0.4.2       pillar_1.4.2      withr_2.1.2      
#> [13] glue_1.3.1        DBI_1.0.0         dbplyr_1.4.2      modelr_0.1.5     
#> [17] readxl_1.3.1      lifecycle_0.1.0   munsell_0.5.0     gtable_0.3.0     
#> [21] cellranger_1.1.0  rvest_0.3.5       evaluate_0.14     knitr_1.26       
#> [25] fansi_0.4.0       highr_0.8         broom_0.5.2       Rcpp_1.0.3       
#> [29] backports_1.1.5   scales_1.1.0      jsonlite_1.6      fs_1.3.1         
#> [33] hms_0.5.2         digest_0.6.23     stringi_1.4.3     grid_3.6.1       
#> [37] cli_2.0.0         tools_3.6.1       magrittr_1.5      lazyeval_0.2.2   
#> [41] crayon_1.3.4      pkgconfig_2.0.3   zeallot_0.1.0     xml2_1.2.2       
#> [45] reprex_0.3.0.9000 lubridate_1.7.4   assertthat_0.2.1  rmarkdown_2.0    
#> [49] httr_1.4.1        R6_2.4.1          nlme_3.1-143      compiler_3.6.1

system · January 4, 2020, 2:14pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.