Wrong pattern in grepl - regular expressions

Hi,
I have a list of car models:

# Example data: 17 rows with a URN identifier and the raw FamilyName label
# that the recoding below is applied to.
# NOTE(review): `source` is also the name of base R's source() function;
# a more specific name would avoid confusion.
source <- data.frame(
  stringsAsFactors = FALSE,
               URN = c("GB0421600680972799",
                       "GB0414160870904854","GB0411001930874092","GB0421589830971661",
                       "GB0411001960875261","GB0409850670845457",
                       "GB0418417380942180","GB0414194530911963","GB0417357570936378",
                       "GB0217347470934167","GB0415237340919038",
                       "GB0419459860949630","GB0420557080966254","GB0408770490827808",
                       "GB0216303060926766","GB0418417380941723","GB0210937840862352"),
        FamilyName = c("2008 (A94)","e2008",
                       "2008 (A94)","2008 V2 (P24)","2008 V2 (P24)","e208",
                       "208 (A9)","208 (A9)","208 V2 (P21)","208 V2 (P21)",
                       "BERLINGO (K9 EUROPE)","Corsa 15",
                       "BERLINGO V2 (M59)","Corsa 07","New 2008",
                       "BERLINGO V3 (B9)","CORSA (P2JO)")
)

where I am trying to recode names:

library(dplyr)
# Recode FamilyName into FamilyNameCat.
# case_when() assigns the value of the first condition that is TRUE for a
# row, so the more specific patterns ('e2008', '2008\\sV2', 'e208',
# '208\\sV2', 'Berlingo\\sV2', ...) are listed before the broader ones
# ('2008', '208', 'Berlingo') that would also match the same strings.
result <- source %>% 
  mutate(FamilyNameCat = case_when(
    # Any of the listed "new 2008" spellings, matched case-insensitively.
    grepl(x = FamilyName, pattern = 'Nuovo\\s2008|NEUER\\s2008|New\\s2008|Nuevo\\s2008|Nuova\\s2008|NV\\s2008|2008\\sNeu|2008\\sNlle', ignore.case = TRUE) ~ 'Peugeot 2008 New',
    grepl(x = FamilyName, pattern = 'e2008', ignore.case = TRUE) ~ 'Other',
    grepl(x = FamilyName, pattern = '2008\\sV2', ignore.case = TRUE) ~ 'Peugeot 2008 V2',
    grepl(x = FamilyName, pattern = '2008', ignore.case = TRUE) ~ 'Peugeot 2008',
    grepl(x = FamilyName, pattern = 'e208', ignore.case = TRUE) ~ 'Other',
    grepl(x = FamilyName, pattern = '208\\sV2', ignore.case = TRUE) ~ 'Peugeot 208 V2',
    grepl(x = FamilyName, pattern = '208', ignore.case = TRUE) ~ 'Peugeot 208',
    # Berlingo V2 / V3 are sent to 'Other' before the general Berlingo rule.
    grepl(x = FamilyName, pattern = 'Berlingo\\sV2', ignore.case = TRUE) ~ 'Other',
    grepl(x = FamilyName, pattern = 'Berlingo\\sV3', ignore.case = TRUE) ~ 'Other',
    grepl(x = FamilyName, pattern = 'Berlingo', ignore.case = TRUE) ~ 'Citroen Berlingo K9',
    grepl(x = FamilyName, pattern = 'Corsa\\s15', ignore.case = TRUE) ~ 'Vauxhall Corsa 15',
    # NOTE(review): '&' is a literal ampersand in a regular expression, not a
    # logical AND. This pattern only matches text containing "Corsa&P2", so
    # "CORSA (P2JO)" does not match and falls through to "Other".
    grepl(x = FamilyName, pattern = 'Corsa&P2', ignore.case = TRUE) ~ 'Vauxhall Corsa P2',
    # Fallback for every row not matched above.
    TRUE ~ "Other"
  ))

I don't know why the same pattern works for 2008 but does not work for 208.
Also, I don't know how to fix the Berlingo recoding or how to code Corsa (P2JO).
Can you help?

I have also noticed something odd, which probably affects my grepl calls.
When I use these:

library(tidyverse)
# Strip parentheses from FamilyName. "[)]" and "[(]" are one-character
# regex classes, so each matches a literal parenthesis without escaping.
source$FamilyName <-str_replace_all(source$FamilyName, "[)]", "")
source$FamilyName <-str_replace_all(source$FamilyName, "[(]", "")

# Remove spaces from FamilyName.
# NOTE(review): the two calls below look identical as shown; presumably one
# of them originally targeted a different whitespace character (e.g. a
# non-breaking space) -- confirm against the original post.
source$FamilyName <-str_replace_all(source$FamilyName, " ", "")
source$FamilyName <-str_replace_all(source$FamilyName, " ", "")

Spaces in some names are not removed! This could be the reason why 2008 works but 208 doesn't. How can I fix it?

What do you mean by "e208 doesn't work"?

library(tidyverse)

# Same example data as in the question: 17 rows with a URN identifier and
# the raw FamilyName label to be recoded.
source <- data.frame(
  stringsAsFactors = FALSE,
  URN = c("GB0421600680972799",
          "GB0414160870904854","GB0411001930874092","GB0421589830971661",
          "GB0411001960875261","GB0409850670845457",
          "GB0418417380942180","GB0414194530911963","GB0417357570936378",
          "GB0217347470934167","GB0415237340919038",
          "GB0419459860949630","GB0420557080966254","GB0408770490827808",
          "GB0216303060926766","GB0418417380941723","GB0210937840862352"),
  FamilyName = c("2008 (A94)","e2008",
                 "2008 (A94)","2008 V2 (P24)","2008 V2 (P24)","e208",
                 "208 (A9)","208 (A9)","208 V2 (P21)","208 V2 (P21)",
                 "BERLINGO (K9 EUROPE)","Corsa 15",
                 "BERLINGO V2 (M59)","Corsa 07","New 2008",
                 "BERLINGO V3 (B9)","CORSA (P2JO)")
)

# Recode FamilyName into FamilyNameCat. case_when() uses the first matching
# condition per row, so specific patterns come before the broader ones.
result <- source %>% 
  mutate(FamilyNameCat = case_when(
    # Any of the listed "new 2008" spellings, matched case-insensitively.
    grepl(x = FamilyName, pattern = 'Nuovo\\s2008|NEUER\\s2008|New\\s2008|Nuevo\\s2008|Nuova\\s2008|NV\\s2008|2008\\sNeu|2008\\sNlle', ignore.case = TRUE) ~ 'Peugeot 2008 New',
    # 'e2008' must be tested before the plain '2008' rule, which also matches it.
    grepl(x = FamilyName, pattern = 'e2008', ignore.case = TRUE) ~ 'Other',
    grepl(x = FamilyName, pattern = '2008\\sV2', ignore.case = TRUE) ~ 'Peugeot 2008 V2',
    grepl(x = FamilyName, pattern = '2008', ignore.case = TRUE) ~ 'Peugeot 2008',
    # Same ordering for the 208 family: 'e208', then 'V2', then plain '208'.
    grepl(x = FamilyName, pattern = 'e208', ignore.case = TRUE) ~ 'Other',
    grepl(x = FamilyName, pattern = '208\\sV2', ignore.case = TRUE) ~ 'Peugeot 208 V2',
    grepl(x = FamilyName, pattern = '208', ignore.case = TRUE) ~ 'Peugeot 208',
    # Berlingo V2 / V3 go to 'Other'; any remaining Berlingo is labelled K9.
    grepl(x = FamilyName, pattern = 'Berlingo\\sV2', ignore.case = TRUE) ~ 'Other',
    grepl(x = FamilyName, pattern = 'Berlingo\\sV3', ignore.case = TRUE) ~ 'Other',
    grepl(x = FamilyName, pattern = 'Berlingo', ignore.case = TRUE) ~ 'Citroen Berlingo K9',
    grepl(x = FamilyName, pattern = 'Corsa\\s15', ignore.case = TRUE) ~ 'Vauxhall Corsa 15',
    # NOTE(review): '&' is a literal character in a regex, so this rule only
    # matches text containing "Corsa&P2"; "CORSA (P2JO)" falls to "Other".
    grepl(x = FamilyName, pattern = 'Corsa&P2', ignore.case = TRUE) ~ 'Vauxhall Corsa P2',
    # Fallback for every row not matched above.
    TRUE ~ "Other"
  ))


result
#>                   URN           FamilyName       FamilyNameCat
#> 1  GB0421600680972799           2008 (A94)        Peugeot 2008
#> 2  GB0414160870904854                e2008               Other
#> 3  GB0411001930874092           2008 (A94)        Peugeot 2008
#> 4  GB0421589830971661        2008 V2 (P24)     Peugeot 2008 V2
#> 5  GB0411001960875261        2008 V2 (P24)     Peugeot 2008 V2
#> 6  GB0409850670845457                 e208               Other
#> 7  GB0418417380942180             208 (A9)         Peugeot 208
#> 8  GB0414194530911963             208 (A9)         Peugeot 208
#> 9  GB0417357570936378         208 V2 (P21)      Peugeot 208 V2
#> 10 GB0217347470934167         208 V2 (P21)      Peugeot 208 V2
#> 11 GB0415237340919038 BERLINGO (K9 EUROPE) Citroen Berlingo K9
#> 12 GB0419459860949630             Corsa 15   Vauxhall Corsa 15
#> 13 GB0420557080966254    BERLINGO V2 (M59)               Other
#> 14 GB0408770490827808             Corsa 07               Other
#> 15 GB0216303060926766             New 2008    Peugeot 2008 New
#> 16 GB0418417380941723     BERLINGO V3 (B9)               Other
#> 17 GB0210937840862352         CORSA (P2JO)               Other




# Strip parentheses from FamilyName. "[)]" and "[(]" are one-character
# regex classes, so each matches a literal parenthesis without escaping.
source$FamilyName <-str_replace_all(source$FamilyName, "[)]", "")
source$FamilyName <-str_replace_all(source$FamilyName, "[(]", "")

# Remove every ordinary space character from FamilyName.
source$FamilyName <-str_replace_all(source$FamilyName, " ", "")

source
#>                   URN       FamilyName
#> 1  GB0421600680972799          2008A94
#> 2  GB0414160870904854            e2008
#> 3  GB0411001930874092          2008A94
#> 4  GB0421589830971661        2008V2P24
#> 5  GB0411001960875261        2008V2P24
#> 6  GB0409850670845457             e208
#> 7  GB0418417380942180            208A9
#> 8  GB0414194530911963            208A9
#> 9  GB0417357570936378         208V2P21
#> 10 GB0217347470934167         208V2P21
#> 11 GB0415237340919038 BERLINGOK9EUROPE
#> 12 GB0419459860949630          Corsa15
#> 13 GB0420557080966254    BERLINGOV2M59
#> 14 GB0408770490827808          Corsa07
#> 15 GB0216303060926766          New2008
#> 16 GB0418417380941723     BERLINGOV3B9
#> 17 GB0210937840862352        CORSAP2JO
source$FamilyName
#>  [1] "2008A94"          "e2008"            "2008A94"          "2008V2P24"       
#>  [5] "2008V2P24"        "e208"             "208A9"            "208A9"           
#>  [9] "208V2P21"         "208V2P21"         "BERLINGOK9EUROPE" "Corsa15"         
#> [13] "BERLINGOV2M59"    "Corsa07"          "New2008"          "BERLINGOV3B9"    
#> [17] "CORSAP2JO"

Created on 2022-07-08 by the reprex package (v2.0.1)

This looks like the results I would expect based on your regexes. So, what did you expect but didn't get?

Could you share the result you get on your computer? It could be a locale issue: when I run this exact code, I don't see any unremoved spaces.

It could also be a copy-paste difference: e.g. if the names contain NBSPs (non-breaking spaces), they look like ordinary spaces but won't be removed by str_replace_all() with a plain " " pattern.

Thank you. Restarting my computer helped.
How can I fix my Corsa (P2JO) code?

Again, I'm not sure what's wrong with it in the first place. If it's the parentheses that are a problem, you can simply escape them:

# case_when() arm: '\\(' and '\\)' escape the parentheses so they match
# literally, and '\\s' matches the whitespace between "Corsa" and "(P2JO)".
grepl(x = FamilyName, pattern = 'Corsa\\s\\(P2JO\\)', ignore.case = TRUE) ~ 'Vauxhall Corsa P2JO',

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.