Data Manipulation with tidyverse

Hello I have a sample dataset as follows.

# Load the tidyverse package
library(tidyverse)

# Create the dataset
id <- 1:6
model <- c("0RB3211", NA, "0RB4191",
           NA, "0RB4033", NA)
UPC <- c("805289119081", "DK_0RB3447CP_RBCP  50", "8053672006360",
         "Green_Classic_G-15_Polar_1.67_PREM_SV", "805289044604",
         "DK_0RB2132CP_RBCP  55")
df <- tibble(id, model, UPC)

For the missing values in 'model' column, if its corresponding UPC starts with DK, I need to extract 7-digit number and letter after first underscore and then put it into the column 'model'. For example, for the second row, I need to put "0RB3447" into the 'model' column, for the fourth row, I need to delete the whole row and for the last row, I need to put "0RB2132" into the 'model' column.

Here is my expected result:

# Manipulate the dataset
df_cleaned <- df %>%
  rowwise() %>%
  mutate(model = ifelse(is.na(model) & str_detect(UPC, "^DK"),
                        str_extract(UPC, "\\d{2}RB\\d{4}"),
                        model)) %>%
  ungroup() %>%
  filter(!(is.na(model) & str_detect(UPC, "[^0-9]")))

# Display the cleaned dataset
print(df_cleaned)

However, it only returns this wrong result.
image

How can I modify my previous code?
Really appreciate it.

I would filter first and then use ifelse() without using rowwise()

library(tidyverse)

# Create the dataset
id <- 1:6
model <- c("0RB3211", NA, "0RB4191",
           NA, "0RB4033", NA)
UPC <- c("805289119081", "DK_0RB3447CP_RBCP  50", "8053672006360",
         "Green_Classic_G-15_Polar_1.67_PREM_SV", "805289044604",
         "DK_0RB2132CP_RBCP  55")
df <- tibble(id, model, UPC)
df <- df |> filter(!is.na(model) | str_sub(UPC,1,2) == "DK") |> 
  mutate(model = ifelse(is.na(model), str_sub(UPC, 4, 10), model))

df
#> # A tibble: 5 × 3
#>      id model   UPC                  
#>   <int> <chr>   <chr>                
#> 1     1 0RB3211 805289119081         
#> 2     2 0RB3447 DK_0RB3447CP_RBCP  50
#> 3     3 0RB4191 8053672006360        
#> 4     5 0RB4033 805289044604         
#> 5     6 0RB2132 DK_0RB2132CP_RBCP  55

Created on 2023-04-04 with reprex v2.0.2

1 Like

Here is a base version, which also retains the fifth row

(d <- data.frame(id = 1:6,
  model = c("0RB3211", NA, "0RB4191",
            NA, "0RB4033", NA),
  UPC = c("805289119081", "DK_0RB3447CP_RBCP  50", "8053672006360",
          "Green_Classic_G-15_Polar_1.67_PREM_SV", "805289044604",
          "DK_0RB2132CP_RBCP  55")))
#>   id   model                                   UPC
#> 1  1 0RB3211                          805289119081
#> 2  2    <NA>                 DK_0RB3447CP_RBCP  50
#> 3  3 0RB4191                         8053672006360
#> 4  4    <NA> Green_Classic_G-15_Polar_1.67_PREM_SV
#> 5  5 0RB4033                          805289044604
#> 6  6    <NA>                 DK_0RB2132CP_RBCP  55

(missing <- which(is.na(d$model)))
#> [1] 2 4 6
(is_dk <- grep("^DK",d$UPC))
#> [1] 2 6
(to_pick <- intersect(missing,is_dk))
#> [1] 2 6
(pat <- "(^DK_)([A-Za-z0-9]{7})(.*$)")
#> [1] "(^DK_)([A-Za-z0-9]{7})(.*$)"
d[to_pick,2] <- sub(pat,"\\2",d[to_pick,3])
d
#>   id   model                                   UPC
#> 1  1 0RB3211                          805289119081
#> 2  2 0RB3447                 DK_0RB3447CP_RBCP  50
#> 3  3 0RB4191                         8053672006360
#> 4  4    <NA> Green_Classic_G-15_Polar_1.67_PREM_SV
#> 5  5 0RB4033                          805289044604
#> 6  6 0RB2132                 DK_0RB2132CP_RBCP  55

Created on 2023-04-04 with reprex v2.0.2

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.