Four stages of recoding in one (including regex)

Slavek · February 18, 2021, 9:29am

Hi, I have following sample data frame with information about car service including car mileage, Months gap to previous visit and service description.

data.source <- data.frame(
     stringsAsFactors = FALSE,
                  URN = c("VE1197209","VE1201008",
                          "VE1198331","VE1201587","VE1204103","VE1197865",
                          "VE1198285","VE1196959","VE1204735","VE1198353",
                          "VE1201307","VE1198384","VE1203567","VE1196943"),
              Mileage = c(8041,15526,25597,80628,
                          26860,57279,85315,9164,127021,148178,4736,
                          23677,27391,16921),
            MonthsGap = c("11.76195",
                          "22.57","35.186818","118.20916",
                          "22.1444","63.835721",
                          "93.9","11.20346","59.8608563",
                          "191.9671554","11.79451","71.88405",
                          "71.8549","36.59"),
    SERVICE_DESCRIPTION = c(NA,
                          ",18000 MILES SERVICE OR 2 YEARS","87.5K Service","Annual Service",
                          " 27000 mile (3rd year)",
                          " 54000 mile (6th year)"," 84000 MILE SERVICE",
                          "Carry out   12 ,500 mile",
                          NA,
                          "first Service  activate AA","MOT",
                          "Carry out 45.000 Mile ",
                          "EIGHT YEAR","7th Annual service")
)

data.source

To make it right I have to run rules in following order:

1. If SERVICE_DESCRIPTION contains valid description, use that:

 library(dplyr)

final.data <- data.source %>% 
  mutate(
    First.Service = if_else(str_detect(SERVICE_DESCRIPTION, regex("1st|9k|9000|9,000|9//smonths|9//smonth|12k|12000|12,000|12//smonths|12//smonth|12.5k|12500|12,500|12.500|one//syear|first|12//s,500", ignore_case = TRUE, multiline = TRUE)), 1,0),
    Second.Service = if_else(str_detect(SERVICE_DESCRIPTION, regex("2nd|18k|18000|18.000|18,000|18//smonths|18//smonths|24k|24000|24,000|24//smonths|24//smonth|25k|25000|25,000|25.000|two//syears|second|two//syear|25//s,000", ignore_case = TRUE, multiline = TRUE)), 1,0),
    Seventh.Service = if_else(str_detect(SERVICE_DESCRIPTION, regex("5th|45k|45000|45.000|45,000|45//smonths|45//smonths|60k|60000|60,000|60//smonths|60//smonth|62.5k|62500|62,500|62.500|five//syears|fifth|five//syear|62//s,500|
                                                                  6th|54k|54000|54.000|54,000|54//smonths|54//smonths|72k|72000|72,000|72//smonths|72//smonth|75k|75000|75,000|75.000|six//syears|sixth|six//syear|75//s,000|
                                                                  7th|63k|63000|63.000|63,000|63//smonths|63//smonths|84k|84000|84,000|84//smonths|84//smonth|87.5k|87500|87,500|87.500|seven//syears|seventh|seven//syear|
                                                                  8th|72k|72000|72.000|72,000|72//smonths|72//smonths|96k|96000|96,000|96//smonths|96//smonth|100k|100000|100,000|100.000|eight//syears|eighth|eight//syear", ignore_case = TRUE, multiline = TRUE)), 1,0)) 

final.data <- final.data %>% 
  mutate(
    GapBucketsTM = case_when(
      First.Service == 1 ~ "First Service",
      Second.Service == 1 ~ "Second Service",
      Seventh.Service == 1 ~ "Seventh+ Service",
      TRUE ~ "Other"
    ))

my first problem here is the fact that:
a) "Carry out 12 ,500 mile servi" from the first str_detect code (First.Service) and
b) last two from the trird str_detect code (Seventh.Service containing "EIGHT YEAR" and "7th")
are not picked up.

I think the problema are:
a) is a weird way or writing a number "12 ,500". Is any easier way of picking up all versions of 12500 (12.500, 12,500, 12, 500, 12.5k)?
b) multiline? the code is quite long

2. If the first criterium not met use the second taking into account a combination of Months gap and Mileage

final.data <- final.data %>% 
  mutate(
    GapBuckets1 = case_when(
      GapBucketsTM == "Other" & MonthsGap > 0 & MonthsGap < 18 & Mileage <= 13000 ~ "First Service",
      GapBucketsTM == "Other" & MonthsGap >= 18 & MonthsGap < 30 & Mileage <= 26000 ~ "Second Service",
      TRUE ~ "Other"
    ))

3. If the second criterium is not met, take into account MonthsGap:

final.data <- final.data %>% 
  mutate(
    GapBuckets2 = case_when(
      GapBucketsTM == "Other" & GapBuckets1 == "Other" & MonthsGap > 0 & MonthsGap <= 15 ~ "First Service",
      GapBucketsTM == "Other" & GapBuckets1 == "Other" & MonthsGap > 21 & MonthsGap <= 27 ~ "Second Service",
      GapBucketsTM == "Other" & GapBuckets1 == "Other" & MonthsGap > 77 ~ "Seventh+ Service",
      TRUE ~ "Other"
    ))

4. If the above criterium is not met, use the final one taking into account only Mileage

final.data <- final.data %>% 
  mutate(
    GapBuckets3 = case_when(
      GapBucketsTM == "Other" & GapBuckets1 == "Other" & GapBuckets2 == "Other" & Mileage > 0 & Mileage <= 12500 ~ "First Service",
      GapBucketsTM == "Other" & GapBuckets1 == "Other" & GapBuckets2 == "Other" & Mileage > 12500 & Mileage <= 25000 ~ "Second Service",
      GapBucketsTM == "Other" & GapBuckets1 == "Other" & GapBuckets2 == "Other" & Mileage > 75000 ~ "Seventh+ Service",
      TRUE ~ "Other"
    ))

Finally, create service categories based on all the above:

final.data <- final.data %>% 
  mutate(
    GapBuckets = case_when(
      GapBucketsTM == "First Service" | GapBuckets1 == "First Service" | GapBuckets2 == "First Service" | GapBuckets3 == "First Service"  ~ "First Service",
      GapBucketsTM == "Second Service" | GapBuckets1 == "Second Service" | GapBuckets2 == "Second Service" | GapBuckets3 == "Second Service" ~ "Second Service",
      GapBucketsTM == "Seventh+ Service" | GapBuckets1 == "Seventh+ Service" | GapBuckets2 == "Seventh+ Service" | GapBuckets3 == "Seventh+ Service" ~ "Seventh+ Service",
      TRUE ~ "Other"
    ))

Do I really need to take so many stages? Can I do everything above in less (or maybe one coding)?

lars · February 18, 2021, 10:41am

Hi,

May I suggest to first clean up the input data first?

As you already use the stringr package, you may find str_trim(), or in your case even better, str_squish() useful.

data.source %>% 
  mutate(SERVICE_DESCRIPTION = SERVICE_DESCRIPTION %>% 
           str_squish() %>% 
           str_to_lower()
         )

You can also opt-in to put all descriptions in lowercase so that the str_detect() can become a bit simpler:

First.Service = if_else(str_detect(SERVICE_DESCRIPTION, "1st|9k|9000|9,000|9//smonths|9//smonth|12k|12000|12,000|12//smonths|12//smonth|12.5k|12500|12,500|12.500|one//syear|first|12//s,500"),
                        1,
                        0
                        ),

Slavek · February 18, 2021, 10:56am

The firs option is more useful than the second solution I think but I wonder why
ignore_case = TRUE, multiline = TRUE in my code do not work.

Anyway, this useful solution did not help in my case as two last values in SERVICE_DESCRIPTION are still not recoded ( eight year and 7th annual service)...

lars · February 18, 2021, 11:28am

Hi Slavek,

Well, you're regex looks quite complex and looking at the Seventh.Service one, it is apparently coming to a different conclusion.. When I move the 7th| to the start of the pattern, then it does match with the URN VE1196943.

I'm not a die-hard expert on constructing a robust regular espression, but maybe the RegExplain addin can be helpful?

Slavek · February 18, 2021, 11:39am

You are confirming my suspicion about multiline = TRUE issue...

lars · February 18, 2021, 12:13pm

I'm not sure if that is causing the issue.

I simplified your code a little bit - including removing the regex() part so that only the search pattern remains - and as you can see, moving the search part 7th to the start of the pattern is giving a different result.

library(tidyverse)

data.source <- data.frame(
  stringsAsFactors = FALSE,
  URN = c("VE1197209","VE1201008",
          "VE1198331","VE1201587","VE1204103","VE1197865",
          "VE1198285","VE1196959","VE1204735","VE1198353",
          "VE1201307","VE1198384","VE1203567","VE1196943"),
  Mileage = c(8041,15526,25597,80628,
              26860,57279,85315,9164,127021,148178,4736,
              23677,27391,16921),
  MonthsGap = c("11.76195",
                "22.57","35.186818","118.20916",
                "22.1444","63.835721",
                "93.9","11.20346","59.8608563",
                "191.9671554","11.79451","71.88405",
                "71.8549","36.59"),
  SERVICE_DESCRIPTION = c(NA,
                          ",18000 MILES SERVICE OR 2 YEARS","87.5K Service","Annual Service",
                          " 27000 mile (3rd year)",
                          " 54000 mile (6th year)"," 84000 MILE SERVICE",
                          "Carry out   12 ,500 mile",
                          NA,
                          "first Service  activate AA","MOT",
                          "Carry out 45.000 Mile ",
                          "EIGHT YEAR","7th Annual service")
  ) %>% 
  ## tidy the input data
  mutate(SERVICE_DESCRIPTION = SERVICE_DESCRIPTION %>% 
           str_squish() %>% 
           str_to_lower()
         )

## URN VE1196943 is missing
data.source %>% 
  mutate(
    Seventh.Service = if_else(str_detect(SERVICE_DESCRIPTION, 
                                         "5th|45k|45000|45.000|45,000|45//smonths|45//smonths|60k|60000|60,000|60//smonths|60//smonth|62.5k|62500|62,500|62.500|five//syears|fifth|five//syear|62//s,500|
                                         6th|54k|54000|54.000|54,000|54//smonths|54//smonths|72k|72000|72,000|72//smonths|72//smonth|75k|75000|75,000|75.000|six//syears|sixth|six//syear|75//s,000|
                                         7th|63k|63000|63.000|63,000|63//smonths|63//smonths|84k|84000|84,000|84//smonths|84//smonth|87.5k|87500|87,500|87.500|seven//syears|seventh|seven//syear|
                                          8th|72k|72000|72.000|72,000|72//smonths|72//smonths|96k|96000|96,000|96//smonths|96//smonth|100k|100000|100,000|100.000|eight//syears|eighth|eight//syear"
                                         ), 
                              1,
                              0
                              )
    ) %>% 
  filter(Seventh.Service == 1)
#>         URN Mileage MonthsGap   SERVICE_DESCRIPTION Seventh.Service
#> 1 VE1198331   25597 35.186818         87.5k service               1
#> 2 VE1197865   57279 63.835721 54000 mile (6th year)               1
#> 3 VE1198285   85315      93.9    84000 mile service               1
#> 4 VE1198384   23677  71.88405 carry out 45.000 mile               1

## URN VE1196943 is flagged
data.source %>% 
  mutate(
    Seventh.Service = if_else(str_detect(SERVICE_DESCRIPTION, 
                                         "7th|5th|45k|45000|45.000|45,000|45//smonths|45//smonths|60k|60000|60,000|60//smonths|60//smonth|62.5k|62500|62,500|62.500|five//syears|fifth|five//syear|62//s,500|
                                         6th|54k|54000|54.000|54,000|54//smonths|54//smonths|72k|72000|72,000|72//smonths|72//smonth|75k|75000|75,000|75.000|six//syears|sixth|six//syear|75//s,000|
                                         63k|63000|63.000|63,000|63//smonths|63//smonths|84k|84000|84,000|84//smonths|84//smonth|87.5k|87500|87,500|87.500|seven//syears|seventh|seven//syear|
                                         8th|72k|72000|72.000|72,000|72//smonths|72//smonths|96k|96000|96,000|96//smonths|96//smonth|100k|100000|100,000|100.000|eight//syears|eighth|eight//syear"
                                         ), 
                              1,
                              0
    )
  ) %>% 
  filter(Seventh.Service == 1)
#>         URN Mileage MonthsGap   SERVICE_DESCRIPTION Seventh.Service
#> 1 VE1198331   25597 35.186818         87.5k service               1
#> 2 VE1197865   57279 63.835721 54000 mile (6th year)               1
#> 3 VE1198285   85315      93.9    84000 mile service               1
#> 4 VE1198384   23677  71.88405 carry out 45.000 mile               1
#> 5 VE1196943   16921     36.59    7th annual service               1

^{Created on 2021-02-18 by the reprex package (v1.0.0)}

lars · February 18, 2021, 12:21pm

BTW, I notice that you use the pattern //s.
Did you mean to the use the "whitespace character"? If so, then it should be like so: \s

Slavek · February 18, 2021, 12:33pm

I used that expression as 12, 500 has a space between "," and "500"...

system · March 11, 2021, 12:33pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.