Given ranges per group, assign values to the groups if within those ranges using dplyr

I have these two given dataframes:

df_r_ranges = data.frame(
  stringsAsFactors = FALSE,
          GROUP_BY = c(NA, NA, NA, NA, NA),
               R_Q = c("4", "1", "3", "2", "5"),
           R_RANGE = c("591-1161","1882-2303",
                       "1162-1527","1528-1878","0-590"),
             R_MIN = c(591, 1882, 1162, 1528, 0),
             R_MAX = c(1161, 2303, 1527, 1878, 590)
)

df_r_values = data.frame(
  stringsAsFactors = FALSE,
                ID = c("1234567","2345678","3456789",
                       "4567890","7654321","8765432","9875643","0987654",
                       "1029384","2938475"),
          GROUP_BY = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
           R_VALUE = c(9L, 28L, 17L, 605L, 154L, 0L, 596L, 549L, 3L, 0L)
)

(Please ignore these ID values :))

If the R_VALUE is 600, then that ID should be assigned R_Q = 4.
And these ranges are different across different GROUP_BY groups (but consistent within each GROUP_BY groups), so they need to work with different GROUP_BY groups as well.

So the end result I desire would be something along the lines of

data.frame(
  stringsAsFactors = FALSE,
                ID = c("1234567","2345678","3456789",
                       "4567890","7654321","8765432","9875643","0987654",
                       "1029384","2938475"),
           R_VALUE = c(981L,931L,1074L,925L,981L,
                       94L,92L,84L,1001L,1074L),
               R_Q = c(1, 1, 1, 1, 1, 4, 4, 4, 1, 1),
          GROUP_BY = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)
)

First thing that came to my mind was to do a join, then filter. However, when doing a join, it would inflate the number of rows, which is not scalable. I'm unsure if there are other options.

Hi @pathos
You can try using case_when() but its a bit clunky:

suppressPackageStartupMessages(library(tidyverse))

df_r_ranges = data.frame(
  stringsAsFactors = FALSE,
          GROUP_BY = c(NA, NA, NA, NA, NA),
               R_Q = c("4", "1", "3", "2", "5"),
           R_RANGE = c("591-1161","1882-2303",
                       "1162-1527","1528-1878","0-590"),
             R_MIN = c(591, 1882, 1162, 1528, 0),
             R_MAX = c(1161, 2303, 1527, 1878, 590)
)

# Sort the ranges for simple indexing
df_r_ranges %>% 
  arrange(R_Q) -> df_r_ranges

df_r_ranges
#>   GROUP_BY R_Q   R_RANGE R_MIN R_MAX
#> 1       NA   1 1882-2303  1882  2303
#> 2       NA   2 1528-1878  1528  1878
#> 3       NA   3 1162-1527  1162  1527
#> 4       NA   4  591-1161   591  1161
#> 5       NA   5     0-590     0   590


df_r_values = data.frame(
  stringsAsFactors = FALSE,
                ID = c("1234567","2345678","3456789",
                       "4567890","7654321","8765432","9875643","0987654",
                       "1029384","2938475"),
          GROUP_BY = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
           R_VALUE = c(9L, 28L, 17L, 605L, 154L, 0L, 596L, 549L, 3L, 0L)
)

df_r_values %>% 
  mutate(R_Q = case_when(
    R_VALUE <= df_r_ranges$R_MAX[1] & R_VALUE >= df_r_ranges$R_MIN[1] ~ 1,
    R_VALUE <= df_r_ranges$R_MAX[2] & R_VALUE >= df_r_ranges$R_MIN[2] ~ 2,
    R_VALUE <= df_r_ranges$R_MAX[3] & R_VALUE >= df_r_ranges$R_MIN[3] ~ 3,
    R_VALUE <= df_r_ranges$R_MAX[4] & R_VALUE >= df_r_ranges$R_MIN[4] ~ 4,
    R_VALUE <= df_r_ranges$R_MAX[5] & R_VALUE >= df_r_ranges$R_MIN[5] ~ 5,
    .default = 99
  ))
#>         ID GROUP_BY R_VALUE R_Q
#> 1  1234567       NA       9   5
#> 2  2345678       NA      28   5
#> 3  3456789       NA      17   5
#> 4  4567890       NA     605   4
#> 5  7654321       NA     154   5
#> 6  8765432       NA       0   5
#> 7  9875643       NA     596   4
#> 8  0987654       NA     549   5
#> 9  1029384       NA       3   5
#> 10 2938475       NA       0   5

Created on 2023-06-13 with reprex v2.0.2

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.