Unfortunately the original post has been removed.

However, this is an excellent opportunity for those new to RStudio to get a glimpse as to what kind of work is needed to make sense of data with poor architecture, specifically in the collection of data.

This specific instance illustrates the steps that I would take to reshape data with the desire to know how many [Codes] were listed in the table provided.

Let me know what you think.


Data

library(data.table)

dt <- data.table(
  Source_1 = LETTERS[1:7],
  Code_1 = c(
    "100;101","", "101;102;103", "103;104",
    "100;102", "101;102", "102;103;104"
  ),
  Source_2 = LETTERS[1:7],
  Code_2 = c(
    "100;101", "100;102", "101", "102;103;104",
    "", "101;102;103;104", "102;103;105"
  )
)
   Source_1      Code_1 Source_2          Code_2
1:        A     100;101        A         100;101
2:        B                    B         100;102
3:        C 101;102;103        C             101
4:        D     103;104        D     102;103;104
5:        E     100;102        E                
6:        F     101;102        F 101;102;103;104
7:        G 102;103;104        G     102;103;105

Transforming

Extract, Define and Append

dt_1 <-
  rbind(dt[, 1:2][, .(Source = Source_1,
                      Code = Code_1,
                      Table = 1)],
        dt[, 3:4][, .(Source = Source_2,
                      Code = Code_2,
                      Table = 2)])
    Source            Code Table
 1:      A         100;101     1
 2:      B                     1
 3:      C     101;102;103     1
 4:      D         103;104     1
 5:      E         100;102     1
 6:      F         101;102     1
 7:      G     102;103;104     1
 8:      A         100;101     2
 9:      B         100;102     2
10:      C             101     2
11:      D     102;103;104     2
12:      E                     2
13:      F 101;102;103;104     2
14:      G     102;103;105     2

Split [Code]

dt_1[, letters[1:4] := tstrsplit(Code, ";", fixed = FALSE)][, Code := NULL]
    Source Table    a    b    c    d
 1:      A     1  100  101 <NA> <NA>
 2:      B     1 <NA> <NA> <NA> <NA>
 3:      C     1  101  102  103 <NA>
 4:      D     1  103  104 <NA> <NA>
 5:      E     1  100  102 <NA> <NA>
 6:      F     1  101  102 <NA> <NA>
 7:      G     1  102  103  104 <NA>
 8:      A     2  100  101 <NA> <NA>
 9:      B     2  100  102 <NA> <NA>
10:      C     2  101 <NA> <NA> <NA>
11:      D     2  102  103  104 <NA>
12:      E     2 <NA> <NA> <NA> <NA>
13:      F     2  101  102  103  104
14:      G     2  102  103  105 <NA>

Reshape and cut

dt_2 <- unique(melt(
  dt_1,
  id.vars = c("Source", "Table"),
  value.name = "Code"
))[!is.na(Code)][, variable := NULL]
    Source Table Code
 1:      A     1  100
 2:      C     1  101
 3:      D     1  103
 4:      E     1  100
 5:      F     1  101
 6:      G     1  102
 7:      A     2  100
 8:      B     2  100
 9:      C     2  101
10:      D     2  102
11:      F     2  101
12:      G     2  102
13:      A     1  101
14:      C     1  102
15:      D     1  104
16:      E     1  102
17:      F     1  102
18:      G     1  103
19:      A     2  101
20:      B     2  102
21:      D     2  103
22:      F     2  102
23:      G     2  103
24:      C     1  103
25:      G     1  104
26:      D     2  104
27:      F     2  103
28:      G     2  105
29:      F     2  104
    Source Table Code

Create variable [Count] and group by [Code]

dt_3 <- dt_2[, .(Count = .N), .(Code)]

Desired Result

   Code Count
1:  100     4
2:  101     6
3:  103     6
4:  102     8
5:  104     4
6:  105     1

Use data.table::rbindlist() in place of rbind().

1 Like

That's an awesome suggestion.

It had not occurred to me that data.table had a faster function for rbind().

They should probably update their cheat sheet if they haven't done so already.