Intersection of common words across four df columns?

mckay.todd · May 6, 2021, 4:38pm

Hi all,

I am trying to create a master word (i.e., vocabulary) list from 4 separate lists. Words on the 4 separate lists are derived from 4 different corpora of medical school texts. Within each list, words have been ranked according to their relative frequencies. I would like to create a master list that is a subset or intersection of only words that appear on all 4 lists.

Right now, my data look something like this, but each list is in its own tibble. How do I create a subset of only the common words without using one of the lists as a "reference" list?

Thank you!

# Attach the tidyverse; tibble() below comes from it.
library(tidyverse)

# Toy stand-in for the real data: one tibble with four columns, each filled by
# its own call to OpenRepGrid::randomWords(3000).
# NOTE: per the post, the real lists each live in their own tibble.
df <- tibble(
List1 = OpenRepGrid::randomWords(3000),
List2 = OpenRepGrid::randomWords(3000),
List3 = OpenRepGrid::randomWords(3000),
List4 = OpenRepGrid::randomWords(3000)
)

# Print the tibble to inspect it.
df

technocrat · May 6, 2021, 10:22pm

This is a problem in the natural language processing (NLP) domain, which has tools for analysis of text. Among the tools available is the {tidytext} package. The specific problem can also be addressed with more general tools.

Without something approaching a reprex, it is not feasible to provide directed advice. (See the FAQ: How to do a minimal reproducible example (reprex) for beginners.) In particular, I don't know if the lists have been filtered to remove stopwords, such as "the", "a", "it", "is", …, which are likely to be at the top of each of the lists and provide no useful information on which to distinguish the lists.

Here's one approach

# Attach the packages without printing their startup messages.
suppressPackageStartupMessages({
  library(dplyr)
  library(rcorpora)
  library(tidytext)
})

# One-column data frame of diagnosis descriptions taken from the
# "medicine/diagnoses" corpus in {rcorpora}.
corp <- data.frame(diagnoses = corpora("medicine/diagnoses")$codes$desc)

# Tokenise the descriptions: one row per token, stored in column `word`.
raw_words <- corp %>%
  unnest_tokens(word, diagnoses)

# Every token occurrence that is also in tidytext's `stop_words` lexicon.
# filter() + select() always yields a one-column data frame, instead of relying
# on `[rows, ]` silently dropping a one-column data frame to a vector.
my_stops <- raw_words %>%
  filter(word %in% stop_words$word) %>%
  select(word)

# Stopword frequencies, most frequent first.
my_stops %>%
  count(word, sort = TRUE)
#>         word   n
#> 1         of 180
#> 2        and  56
#> 3      other  51
#> 4       with  35
#> 5         in  29
#> 6      right  27
#> 7    without  17
#> 8         at  16
#> 9         or  11
#> 10        to  10
#> 11       the   9
#> 12 elsewhere   7
#> 13       end   6
#> 14 specified   6
#> 15      back   5
#> 16     first   5
#> 17      into   4
#> 18     three   4
#> 19        as   3
#> 20     cause   3
#> 21       not   3
#> 22        on   3
#> 23         s   3
#> 24        by   2
#> 25    during   2
#> 26 following   2
#> 27       for   2
#> 28         i   2
#> 29    little   2
#> 30     parts   2
#> 31    second   2
#> 32     small   2
#> 33         t   2
#> 34       two   2
#> 35         a   1
#> 36       all   1
#> 37      area   1
#> 38     being   1
#> 39         c   1
#> 40    except   1
#> 41      face   1
#> 42     large   1
#> 43     later   1
#> 44       non   1
#> 45       off   1
#> 46      open   1
#> 47   outside   1
#> 48      part   1
#> 49     place   1
#> 50     state   1
#> 51     third   1
#> 52        up   1
#> 53       use   1
#> 54      used   1
#> 55   whether   1
# Remove every token that appears in `my_stops`. The join key is named
# explicitly, so dplyr neither has to guess the shared column nor prints a
# "Joining, by = ..." message.
keeps <- anti_join(raw_words, my_stops, by = "word")

# Count the remaining words, most frequent first, and keep only the words that
# occur more than once.
common <- keeps %>%
  count(word, sort = TRUE) %>%
  filter(n > 1)
common
#>                    word  n
#> 1           unspecified 86
#> 2                  left 38
#> 3                   eye 16
#> 4                  hand 15
#> 5                muscle 15
#> 6              fracture 14
#> 7                 level 14
#> 8                 upper 14
#> 9                injury 13
#> 10              injured 12
#> 11           laceration 12
#> 12             shoulder 12
#> 13             accident 10
#> 14                 foot 10
#> 15                  arm  9
#> 16               degree  9
#> 17                  leg  9
#> 18                lower  9
#> 19               tendon  9
#> 20                acute  8
#> 21                 body  8
#> 22            disorders  8
#> 23               fascia  8
#> 24               finger  8
#> 25                  hip  8
#> 26              vehicle  8
#> 27                wrist  8
#> 28              chronic  7
#> 29           classified  7
#> 30            collision  7
#> 31             neoplasm  7
#> 32                ankle  6
#> 33              disease  6
#> 34                  ear  6
#> 35              forearm  6
#> 36              foreign  6
#> 37            involving  6
#> 38                motor  6
#> 39           rheumatoid  6
#> 40                 type  6
#> 41                 wall  6
#> 42           amputation  5
#> 43             anterior  5
#> 44                 bone  5
#> 45                 burn  5
#> 46              contact  5
#> 47               effect  5
#> 48            malignant  5
#> 49           nontraffic  5
#> 50          superficial  5
#> 51               thorax  5
#> 52               tissue  5
#> 53              traffic  5
#> 54            traumatic  5
#> 55            arthritis  4
#> 56                 cell  4
#> 57        complications  4
#> 58              corneal  4
#> 59            corrosion  4
#> 60             diseases  4
#> 61                  due  4
#> 62             extensor  4
#> 63               harris  4
#> 64              humerus  4
#> 65         nondisplaced  4
#> 66            passenger  4
#> 67              physeal  4
#> 68               salter  4
#> 69                sites  4
#> 70          subluxation  4
#> 71                thigh  4
#> 72                ulcer  4
#> 73              wheeled  4
#> 74               artery  3
#> 75                blood  3
#> 76               cavity  3
#> 77             civilian  3
#> 78            contusion  3
#> 79                 cyst  3
#> 80               damage  3
#> 81             disorder  3
#> 82                elbow  3
#> 83               eyelid  3
#> 84           hemorrhage  3
#> 85              induced  3
#> 86                joint  3
#> 87             juvenile  3
#> 88                 limb  3
#> 89  metacarpophalangeal  3
#> 90               middle  3
#> 91             military  3
#> 92             multiple  3
#> 93             myositis  3
#> 94                 nail  3
#> 95                nerve  3
#> 96             occupant  3
#> 97           operations  3
#> 98                organ  3
#> 99         ossification  3
#> 100       osteomyelitis  3
#> 101       osteonecrosis  3
#> 102               pedal  3
#> 103         penetration  3
#> 104              radius  3
#> 105              region  3
#> 106                site  3
#> 107                soft  3
#> 108             tendons  3
#> 109            thoracic  3
#> 110               toxic  3
#> 111           transport  3
#> 112           trimester  3
#> 113            abnormal  2
#> 114               abuse  2
#> 115         antibiotics  2
#> 116             assault  2
#> 117              benign  2
#> 118           bilateral  2
#> 119              breast  2
#> 120            bursitis  2
#> 121                 bus  2
#> 122       calcification  2
#> 123        complication  2
#> 124          connective  2
#> 125          continuity  2
#> 126         contracture  2
#> 127               cycle  2
#> 128              cystic  2
#> 129            delivery  2
#> 130            dementia  2
#> 131             diffuse  2
#> 132              driver  2
#> 133               edema  2
#> 134           encounter  2
#> 135            exposure  2
#> 136            external  2
#> 137          extranodal  2
#> 138             femoral  2
#> 139               femur  2
#> 140              fibula  2
#> 141               fixed  2
#> 142           formation  2
#> 143               front  2
#> 144             genital  2
#> 145               heavy  2
#> 146           hepatitis  2
#> 147                 iii  2
#> 148           including  2
#> 149               index  2
#> 150        inflammatory  2
#> 151           influenza  2
#> 152            inhalant  2
#> 153        interstitial  2
#> 154        intervention  2
#> 155      intervertebral  2
#> 156           intestine  2
#> 157                knee  2
#> 158               labor  2
#> 159               legal  2
#> 160            lymphoma  2
#> 161            meniscus  2
#> 162          metacarpal  2
#> 163             muscles  2
#> 164          myelopathy  2
#> 165         myocarditis  2
#> 166               nasal  2
#> 167                  nk  2
#> 168                nose  2
#> 169              object  2
#> 170               orbit  2
#> 171              organs  2
#> 172          ossificans  2
#> 173          osteophyte  2
#> 174           paralytic  2
#> 175        pathological  2
#> 176          petrositis  2
#> 177             powered  2
#> 178            pressure  2
#> 179          prosthetic  2
#> 180           psychotic  2
#> 181             rupture  2
#> 182           secondary  2
#> 183               sinus  2
#> 184         spondylosis  2
#> 185          stationary  2
#> 186              status  2
#> 187           stimulant  2
#> 188            subacute  2
#> 189             suspect  2
#> 190          syphilitic  2
#> 191              system  2
#> 192               thumb  2
#> 193               tibia  2
#> 194                 toe  2
#> 195                toes  2
#> 196          traumatica  2
#> 197                ulna  2
#> 198               wound  2

# Fix the RNG state so the simulated lists are reproducible.
set.seed(42)

# Simulate one source list: draw 455 tokens without replacement and keep the
# distinct ones that also occur in `common`.
draw_list <- function() {
  drawn <- sample(raw_words$word, size = 455, replace = FALSE)
  intersect(drawn, common$word)
}
list1 <- draw_list()
list2 <- draw_list()

# Words present in every list. Reduce() folds intersect() over the lists, so
# further lists (e.g. four of them) can be added to list() with no other change
# and without treating any one list as the "reference".
Reduce(intersect, list(list1, list2))
#>  [1] "ankle"        "finger"       "leg"          "muscle"       "bus"         
#>  [6] "left"         "nontraffic"   "level"        "neoplasm"     "eye"         
#> [11] "unspecified"  "foreign"      "upper"        "rheumatoid"   "foot"        
#> [16] "benign"       "degree"       "index"        "humerus"      "chronic"     
#> [21] "hip"          "shoulder"     "powered"      "traffic"      "harris"      
#> [26] "amputation"   "fracture"     "type"         "accident"     "sites"       
#> [31] "hemorrhage"   "nondisplaced" "pressure"     "wheeled"      "disease"     
#> [36] "motor"        "elbow"        "thigh"        "corrosion"    "diseases"    
#> [41] "lower"        "diffuse"      "trimester"    "meniscus"     "psychotic"   
#> [46] "rupture"      "disorders"    "ear"          "radius"       "physeal"     
#> [51] "fascia"       "femur"        "nail"         "toe"          "injury"      
#> [56] "knee"         "legal"        "toes"         "contusion"    "eyelid"      
#> [61] "arthritis"    "tibia"        "bone"         "corneal"      "genital"     
#> [66] "traumatic"    "classified"   "disorder"     "arm"          "effect"      
#> [71] "cystic"       "injured"      "transport"    "anterior"     "superficial" 
#> [76] "abuse"        "pathological" "organ"        "penetration"  "hand"        
#> [81] "ossification" "laceration"

system · May 27, 2021, 10:23pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.