# Looking for faster solution

Hello everybody, I'm looking for a faster solution to the problem below. The dataset contains more than 7M rows, and I have no idea how to make it faster. Thank you!

``````library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.1.2
df<- tribble(~A,~B,~C,~D,
"I123","I121","I1908","I129",
"I128","I123","I124","I109",
"I126","I1855","I129","I183",
"I121","I163","F121","I8773",
"I123","I129","I1563","I121",
"I129","I1665","I128", "F843",
"X","Y","Z","ZZ")

df <- df |> mutate(Row=row_number())
Long <- df |> pivot_longer(cols = A:D,names_to = "name")
Long <-  Long |> group_by(Row) |>
summarize(C1 = any(str_detect(value,"I123|I128")),
C2 = any(str_detect(value,"I121")),
C3 = any(str_detect(value,"I129"))) |>
rowwise() |>
mutate(WHICH=which(c_across(C1:C3)))
FINAL <- inner_join(df,Long,by="Row")
FINAL``````

For larger datasets, I'd recommend the `data.table` package.

I'm currently looking at how you shaped the data.

@pavel , Does the [WHICH] column function as intended?

Here's my results based on my current assumption of [WHICH].

## Code

``````# df -- Generate large dataset in data.table; create [Row] variable
dt_a <- data.table(
A = rep(c("I123", "I128", "I126", "I121", "I123", "I129", "X"), 10^6),
B = rep(c("I121", "I123", "I1855", "I163", "I129", "I1665", "Y"), 10^6),
C = rep(c("I1908", "I124", "I129", "F121", "I1536", "I128", "Z"), 10^6),
D = rep(c("I129", "I109", "I183", "I8773", "I121", "F843", "ZZ"), 10^6)
)[, Row := 1:.N]

# Long -- Reshape to long format; create [WHICH] variable based on conditionals
dt_b <-
melt(
dt_a,
id.vars = "Row",
variable.name = "name",
value.name = "value"
)[value == "I123" |
value == "I128" |
value == "I121" |
value == "I129", WHICH := 1
][,
.(
C1 = as.logical(sum(value == "I123") + sum(value == "I128")),
C2 = as.logical(sum(value == "I121")),
C3 = as.logical(sum(value == "I129")),
WHICH = sum(WHICH, na.rm = TRUE)
), Row]

# FINAL -- Join tables; output
dt_c <- dt_a[dt_b, on = .(Row = Row)]
dt_c
``````
``````            A     B     C     D     Row    C1    C2    C3 WHICH
1: I123  I121 I1908  I129       1  TRUE  TRUE  TRUE     3
2: I128  I123  I124  I109       2  TRUE FALSE FALSE     2
3: I126 I1855  I129  I183       3 FALSE FALSE  TRUE     1
4: I121  I163  F121 I8773       4 FALSE  TRUE FALSE     1
5: I123  I129 I1536  I121       5  TRUE  TRUE  TRUE     3
---
6999996: I126 I1855  I129  I183 6999996 FALSE FALSE  TRUE     1
6999997: I121  I163  F121 I8773 6999997 FALSE  TRUE FALSE     1
6999998: I123  I129 I1536  I121 6999998  TRUE  TRUE  TRUE     3
6999999: I129 I1665  I128  F843 6999999  TRUE FALSE  TRUE     2
7000000:    X     Y     Z    ZZ 7000000 FALSE FALSE FALSE     0
``````

## Time Report

``````# df
rt_a <- system.time(data.table(
A = rep(c("I123", "I128", "I126", "I121", "I123", "I129", "X"), 10 ^ 6),
B = rep(c("I121", "I123", "I1855", "I163", "I129", "I1665", "Y"), 10 ^ 6),
C = rep(c("I1908", "I124", "I129", "F121", "I1536", "I128", "Z"), 10 ^ 6),
D = rep(c("I129", "I109", "I183", "I8773", "I121", "F843", "ZZ"), 10 ^ 6)
)[, Row := 1:.N])
rt_a
``````
``````   user  system elapsed
0.13    0.00    0.12
``````
``````# Long -- This process uses the most memory
rt_b <- system.time(
melt(dt_a, id.vars = "Row", variable.name = "name", value.name = "value")[
value == "I123" | value == "I128" | value == "I121" | value == "I129",
WHICH := 1
][,
.(
C1 = as.logical(sum(value == "I123") + sum(value == "I128")),
C2 = as.logical(sum(value == "I121")),
C3 = as.logical(sum(value == "I129")),
WHICH = sum(WHICH, na.rm = TRUE)
), Row])
rt_b
``````
``````   user  system elapsed
16.88    0.36   16.84
``````
``````# FINAL
rt_c <- system.time(dt_a[dt_b, on = .(Row = Row)])
rt_c
``````
``````   user  system elapsed
0.42    0.00    0.28
``````

Forgive the multiple edits; doing dumb things while trying to optimize the already boiler-plate code.

1 Like

I would do something along these lines:

1. make the table long
2. join with a table that maps your values to your "c number"
3. get the minimal "c number" per row
``````library(tidyverse)
df<- tribble(~A,~B,~C,~D,
"I123","I121","I1908","I129",
"I128","I123","I124","I109",
"I126","I1855","I129","I183",
"I121","I163","F121","I8773",
"I123","I129","I1563","I121",
"I129","I1665","I128", "F843",
"X","Y","Z","ZZ")

df <- df |> mutate(row = row_number())
long <- df |> pivot_longer(cols = A:D, names_to = "name")

mapping_tbl <- tribble(
~ C_number, ~ value,
1,  "I123",
1,  "I128",
2,  "I121",
3,  "I129",
)

row_results <- long %>%
inner_join(mapping_tbl, by = "value") %>%
group_by(row) %>%
summarise(which = min(C_number))
``````

Created on 2022-07-21 by the reprex package (v2.0.1)

1 Like

@Ven Thank you very much for your reply, sorry, I haven't had a chance to test the solution, I'm busy with my regular job. Thank you again.

@mgirlich Thank you! Again, I will test it as soon as possible.

1 Like

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.