Four stages of recoding in one (including regex)

Hi, I have following sample data frame with information about car service including car mileage, Months gap to previous visit and service description.

data.source <- data.frame(
     stringsAsFactors = FALSE,
                  URN = c("VE1197209","VE1201008",
                          "VE1198331","VE1201587","VE1204103","VE1197865",
                          "VE1198285","VE1196959","VE1204735","VE1198353",
                          "VE1201307","VE1198384","VE1203567","VE1196943"),
              Mileage = c(8041,15526,25597,80628,
                          26860,57279,85315,9164,127021,148178,4736,
                          23677,27391,16921),
            MonthsGap = c("11.76195",
                          "22.57","35.186818","118.20916",
                          "22.1444","63.835721",
                          "93.9","11.20346","59.8608563",
                          "191.9671554","11.79451","71.88405",
                          "71.8549","36.59"),
    SERVICE_DESCRIPTION = c(NA,
                          ",18000 MILES SERVICE OR 2 YEARS","87.5K Service","Annual Service",
                          " 27000 mile (3rd year)",
                          " 54000 mile (6th year)"," 84000 MILE SERVICE",
                          "Carry out   12 ,500 mile",
                          NA,
                          "first Service  activate AA","MOT",
                          "Carry out 45.000 Mile ",
                          "EIGHT YEAR","7th Annual service")
)

data.source

To make it right I have to run rules in following order:

1. If SERVICE_DESCRIPTION contains valid description, use that:

 library(dplyr)

final.data <- data.source %>% 
  mutate(
    First.Service = if_else(str_detect(SERVICE_DESCRIPTION, regex("1st|9k|9000|9,000|9//smonths|9//smonth|12k|12000|12,000|12//smonths|12//smonth|12.5k|12500|12,500|12.500|one//syear|first|12//s,500", ignore_case = TRUE, multiline = TRUE)), 1,0),
    Second.Service = if_else(str_detect(SERVICE_DESCRIPTION, regex("2nd|18k|18000|18.000|18,000|18//smonths|18//smonths|24k|24000|24,000|24//smonths|24//smonth|25k|25000|25,000|25.000|two//syears|second|two//syear|25//s,000", ignore_case = TRUE, multiline = TRUE)), 1,0),
    Seventh.Service = if_else(str_detect(SERVICE_DESCRIPTION, regex("5th|45k|45000|45.000|45,000|45//smonths|45//smonths|60k|60000|60,000|60//smonths|60//smonth|62.5k|62500|62,500|62.500|five//syears|fifth|five//syear|62//s,500|
                                                                  6th|54k|54000|54.000|54,000|54//smonths|54//smonths|72k|72000|72,000|72//smonths|72//smonth|75k|75000|75,000|75.000|six//syears|sixth|six//syear|75//s,000|
                                                                  7th|63k|63000|63.000|63,000|63//smonths|63//smonths|84k|84000|84,000|84//smonths|84//smonth|87.5k|87500|87,500|87.500|seven//syears|seventh|seven//syear|
                                                                  8th|72k|72000|72.000|72,000|72//smonths|72//smonths|96k|96000|96,000|96//smonths|96//smonth|100k|100000|100,000|100.000|eight//syears|eighth|eight//syear", ignore_case = TRUE, multiline = TRUE)), 1,0)) 

final.data <- final.data %>% 
  mutate(
    GapBucketsTM = case_when(
      First.Service == 1 ~ "First Service",
      Second.Service == 1 ~ "Second Service",
      Seventh.Service == 1 ~ "Seventh+ Service",
      TRUE ~ "Other"
    ))

my first problem here is the fact that:
a) "Carry out 12 ,500 mile servi" from the first str_detect code (First.Service) and
b) last two from the trird str_detect code (Seventh.Service containing "EIGHT YEAR" and "7th")
are not picked up.

I think the problema are:
a) is a weird way or writing a number "12 ,500". Is any easier way of picking up all versions of 12500 (12.500, 12,500, 12, 500, 12.5k)?
b) multiline? the code is quite long

2. If the first criterium not met use the second taking into account a combination of Months gap and Mileage

final.data <- final.data %>% 
  mutate(
    GapBuckets1 = case_when(
      GapBucketsTM == "Other" & MonthsGap > 0 & MonthsGap < 18 & Mileage <= 13000 ~ "First Service",
      GapBucketsTM == "Other" & MonthsGap >= 18 & MonthsGap < 30 & Mileage <= 26000 ~ "Second Service",
      TRUE ~ "Other"
    ))

3. If the second criterium is not met, take into account MonthsGap:

final.data <- final.data %>% 
  mutate(
    GapBuckets2 = case_when(
      GapBucketsTM == "Other" & GapBuckets1 == "Other" & MonthsGap > 0 & MonthsGap <= 15 ~ "First Service",
      GapBucketsTM == "Other" & GapBuckets1 == "Other" & MonthsGap > 21 & MonthsGap <= 27 ~ "Second Service",
      GapBucketsTM == "Other" & GapBuckets1 == "Other" & MonthsGap > 77 ~ "Seventh+ Service",
      TRUE ~ "Other"
    ))

4. If the above criterium is not met, use the final one taking into account only Mileage

final.data <- final.data %>% 
  mutate(
    GapBuckets3 = case_when(
      GapBucketsTM == "Other" & GapBuckets1 == "Other" & GapBuckets2 == "Other" & Mileage > 0 & Mileage <= 12500 ~ "First Service",
      GapBucketsTM == "Other" & GapBuckets1 == "Other" & GapBuckets2 == "Other" & Mileage > 12500 & Mileage <= 25000 ~ "Second Service",
      GapBucketsTM == "Other" & GapBuckets1 == "Other" & GapBuckets2 == "Other" & Mileage > 75000 ~ "Seventh+ Service",
      TRUE ~ "Other"
    ))

Finally, create service categories based on all the above:

final.data <- final.data %>% 
  mutate(
    GapBuckets = case_when(
      GapBucketsTM == "First Service" | GapBuckets1 == "First Service" | GapBuckets2 == "First Service" | GapBuckets3 == "First Service"  ~ "First Service",
      GapBucketsTM == "Second Service" | GapBuckets1 == "Second Service" | GapBuckets2 == "Second Service" | GapBuckets3 == "Second Service" ~ "Second Service",
      GapBucketsTM == "Seventh+ Service" | GapBuckets1 == "Seventh+ Service" | GapBuckets2 == "Seventh+ Service" | GapBuckets3 == "Seventh+ Service" ~ "Seventh+ Service",
      TRUE ~ "Other"
    ))

Do I really need to take so many stages? Can I do everything above in less (or maybe one coding)?

Hi,

May I suggest to first clean up the input data first?

As you already use the stringr package, you may find str_trim(), or in your case even better, str_squish() useful.

data.source %>% 
  mutate(SERVICE_DESCRIPTION = SERVICE_DESCRIPTION %>% 
           str_squish() %>% 
           str_to_lower()
         )

You can also opt-in to put all descriptions in lowercase so that the str_detect() can become a bit simpler:

First.Service = if_else(str_detect(SERVICE_DESCRIPTION, "1st|9k|9000|9,000|9//smonths|9//smonth|12k|12000|12,000|12//smonths|12//smonth|12.5k|12500|12,500|12.500|one//syear|first|12//s,500"),
                        1,
                        0
                        ),

The firs option is more useful than the second solution I think but I wonder why
ignore_case = TRUE, multiline = TRUE in my code do not work.

Anyway, this useful solution did not help in my case as two last values in SERVICE_DESCRIPTION are still not recoded ( eight year and 7th annual service)...

Hi Slavek,

Well, you're regex looks quite complex and looking at the Seventh.Service one, it is apparently coming to a different conclusion.. When I move the 7th| to the start of the pattern, then it does match with the URN VE1196943.

I'm not a die-hard expert on constructing a robust regular espression, but maybe the RegExplain addin can be helpful?

You are confirming my suspicion about multiline = TRUE issue...

I'm not sure if that is causing the issue.

I simplified your code a little bit - including removing the regex() part so that only the search pattern remains - and as you can see, moving the search part 7th to the start of the pattern is giving a different result.

library(tidyverse)

data.source <- data.frame(
  stringsAsFactors = FALSE,
  URN = c("VE1197209","VE1201008",
          "VE1198331","VE1201587","VE1204103","VE1197865",
          "VE1198285","VE1196959","VE1204735","VE1198353",
          "VE1201307","VE1198384","VE1203567","VE1196943"),
  Mileage = c(8041,15526,25597,80628,
              26860,57279,85315,9164,127021,148178,4736,
              23677,27391,16921),
  MonthsGap = c("11.76195",
                "22.57","35.186818","118.20916",
                "22.1444","63.835721",
                "93.9","11.20346","59.8608563",
                "191.9671554","11.79451","71.88405",
                "71.8549","36.59"),
  SERVICE_DESCRIPTION = c(NA,
                          ",18000 MILES SERVICE OR 2 YEARS","87.5K Service","Annual Service",
                          " 27000 mile (3rd year)",
                          " 54000 mile (6th year)"," 84000 MILE SERVICE",
                          "Carry out   12 ,500 mile",
                          NA,
                          "first Service  activate AA","MOT",
                          "Carry out 45.000 Mile ",
                          "EIGHT YEAR","7th Annual service")
  ) %>% 
  ## tidy the input data
  mutate(SERVICE_DESCRIPTION = SERVICE_DESCRIPTION %>% 
           str_squish() %>% 
           str_to_lower()
         )

## URN VE1196943 is missing
data.source %>% 
  mutate(
    Seventh.Service = if_else(str_detect(SERVICE_DESCRIPTION, 
                                         "5th|45k|45000|45.000|45,000|45//smonths|45//smonths|60k|60000|60,000|60//smonths|60//smonth|62.5k|62500|62,500|62.500|five//syears|fifth|five//syear|62//s,500|
                                         6th|54k|54000|54.000|54,000|54//smonths|54//smonths|72k|72000|72,000|72//smonths|72//smonth|75k|75000|75,000|75.000|six//syears|sixth|six//syear|75//s,000|
                                         7th|63k|63000|63.000|63,000|63//smonths|63//smonths|84k|84000|84,000|84//smonths|84//smonth|87.5k|87500|87,500|87.500|seven//syears|seventh|seven//syear|
                                          8th|72k|72000|72.000|72,000|72//smonths|72//smonths|96k|96000|96,000|96//smonths|96//smonth|100k|100000|100,000|100.000|eight//syears|eighth|eight//syear"
                                         ), 
                              1,
                              0
                              )
    ) %>% 
  filter(Seventh.Service == 1)
#>         URN Mileage MonthsGap   SERVICE_DESCRIPTION Seventh.Service
#> 1 VE1198331   25597 35.186818         87.5k service               1
#> 2 VE1197865   57279 63.835721 54000 mile (6th year)               1
#> 3 VE1198285   85315      93.9    84000 mile service               1
#> 4 VE1198384   23677  71.88405 carry out 45.000 mile               1

## URN VE1196943 is flagged
data.source %>% 
  mutate(
    Seventh.Service = if_else(str_detect(SERVICE_DESCRIPTION, 
                                         "7th|5th|45k|45000|45.000|45,000|45//smonths|45//smonths|60k|60000|60,000|60//smonths|60//smonth|62.5k|62500|62,500|62.500|five//syears|fifth|five//syear|62//s,500|
                                         6th|54k|54000|54.000|54,000|54//smonths|54//smonths|72k|72000|72,000|72//smonths|72//smonth|75k|75000|75,000|75.000|six//syears|sixth|six//syear|75//s,000|
                                         63k|63000|63.000|63,000|63//smonths|63//smonths|84k|84000|84,000|84//smonths|84//smonth|87.5k|87500|87,500|87.500|seven//syears|seventh|seven//syear|
                                         8th|72k|72000|72.000|72,000|72//smonths|72//smonths|96k|96000|96,000|96//smonths|96//smonth|100k|100000|100,000|100.000|eight//syears|eighth|eight//syear"
                                         ), 
                              1,
                              0
    )
  ) %>% 
  filter(Seventh.Service == 1)
#>         URN Mileage MonthsGap   SERVICE_DESCRIPTION Seventh.Service
#> 1 VE1198331   25597 35.186818         87.5k service               1
#> 2 VE1197865   57279 63.835721 54000 mile (6th year)               1
#> 3 VE1198285   85315      93.9    84000 mile service               1
#> 4 VE1198384   23677  71.88405 carry out 45.000 mile               1
#> 5 VE1196943   16921     36.59    7th annual service               1

Created on 2021-02-18 by the reprex package (v1.0.0)

BTW, I notice that you use the pattern //s.
Did you mean to the use the "whitespace character"? If so, then it should be like so: \s

I used that expression as 12, 500 has a space between "," and "500"...