I’ll also give it a shot. First, are the strings in L1_loc
longer than what you show us here? If not there are other solutions possible, which do not require regex at all (either your case_when
without the str_detect
like this L1_loc == "RAPZpm" ~ "RAP"
or with a lookup table and a left_join
).
Anyway, here are two alternatives, depending on whether the second but last case is indeed a typo or not.
library(stringr)
suppressPackageStartupMessages(library(dplyr))
xdf <- tibble(
L1_loc = c(
"RAPZpm",
"RAPZpl",
"RAPZa",
"RATZa",
"RAPTZp",
"LAPZp",
"LATZp",
"LAPZa",
"LAPTZa",
"RAPZa"
)
)
This one assumes that your second but last case_when
is not a typo. It makes use of the different string lengths, i.e. that strings with 6 word characters from the beginning always end with a “p”. "^(\\w{2})\\w{2}(\\w)" = "\\1\\2"
means capture the first two word characters (^(\\w{2})
) and the fifth character ((\\w)
) and put them in the replacement string (\\1
is the first and \\2
is the second capture group).
xdf %>%
mutate(mr_reg_l1 =
str_replace_all(
L1_loc,
c(
"^(\\w{2})\\w{4}" = "\\1P",
"^(\\w{2})\\w{2}(\\w)" = "\\1\\2"
)
) %>% str_to_upper())
#> # A tibble: 10 x 2
#> L1_loc mr_reg_l1
#> <chr> <chr>
#> 1 RAPZpm RAP
#> 2 RAPZpl RAP
#> 3 RAPZa RAA
#> 4 RATZa RAA
#> 5 RAPTZp RAP
#> 6 LAPZp LAP
#> 7 LATZp LAP
#> 8 LAPZa LAA
#> 9 LAPTZa LAP
#> 10 RAPZa RAA
This one is similar to @Stroehli’s solution in that it makes use of the upper/lower case letters, but does not assume that the strings ends after the lower case letters.
xdf %>%
mutate(mr_reg_l1 =
str_replace_all(L1_loc,
c("^(\\w{2})[:upper:]{2,3}(\\w{1}).*" = "\\1\\2")) %>%
str_to_upper()
)
#> # A tibble: 10 x 2
#> L1_loc mr_reg_l1
#> <chr> <chr>
#> 1 RAPZpm RAP
#> 2 RAPZpl RAP
#> 3 RAPZa RAA
#> 4 RATZa RAA
#> 5 RAPTZp RAP
#> 6 LAPZp LAP
#> 7 LATZp LAP
#> 8 LAPZa LAA
#> 9 LAPTZa LAA
#> 10 RAPZa RAA
Now the case_when
without a reprex:
xdf %>%
mutate(
mr_reg_l1 = case_when(
L1_loc == "RAPZpm" ~ "RAP",
L1_loc == "RAPZpl" ~ "RAP",
L1_loc == "RAPZa" ~ "RAA",
L1_loc == "RATZa" ~ "RAA",
L1_loc == "RAPTZp" ~ "RAP",
L1_loc == "LAPZp" ~ "LAP",
L1_loc == "LATZp" ~ "LAP",
L1_loc == "LAPZa" ~ "LAA",
L1_loc == "LAPTZa" ~ "LAP",
L1_loc == "RAPZa" ~ "RAA",
TRUE ~ L1_loc
)
)
#> # A tibble: 10 x 2
#> L1_loc mr_reg_l1
#> <chr> <chr>
#> 1 RAPZpm RAP
#> 2 RAPZpl RAP
#> 3 RAPZa RAA
#> 4 RATZa RAA
#> 5 RAPTZp RAP
#> 6 LAPZp LAP
#> 7 LATZp LAP
#> 8 LAPZa LAA
#> 9 LAPTZa LAP
#> 10 RAPZa RAA
And finally the lookup table - left_join
lookup_df <- tribble(
~L1_loc, ~mr_reg_l1,
"RAPZpm", "RAP",
"RAPZpl", "RAP",
"RAPZa", "RAA",
"RATZa", "RAA",
"RAPTZp", "RAP",
"LAPZp", "LAP",
"LATZp", "LAP",
"LAPZa", "LAA",
"LAPTZa", "LAP",
"RAPZa", "RAA"
)
left_join(xdf, lookup_df, by = "L1_loc")
#> # A tibble: 12 x 2
#> L1_loc mr_reg_l1
#> <chr> <chr>
#> 1 RAPZpm RAP
#> 2 RAPZpl RAP
#> 3 RAPZa RAA
#> 4 RAPZa RAA
#> 5 RATZa RAA
#> 6 RAPTZp RAP
#> 7 LAPZp LAP
#> 8 LATZp LAP
#> 9 LAPZa LAA
#> 10 LAPTZa LAP
#> 11 RAPZa RAA
#> 12 RAPZa RAA