Extract the First ID

Hi!

I could use help with cleaning some dirty string data. I want to extract the first ID value). The first 9 rows are precisely what I need. However, rows 10-21 are the problem.

I've tried using functions like str_extract and str_remove from the stringr package, but I can't figure out a pattern to remove the unwanted strings.

Could someone help me with the Stringr and/or Regex formula that will help me achieve this?

Thanks in advance,

James

Reprex

structure(list(con_number = c("CON16552", "CON15607", "CON15607", 
"CON014592", "CON012146", "CON014085", "SP00012088", "SP00012088", 
"SP00012088", "CON016107/CON017440", "CON016107/CON017440", "CON016107/CON017440", 
"CON015304 (primary CON#)", "CON014838 (previous CON)", "CON015304 (primary CON#)", 
"CON012407                               ( Amendment:  CON017074)", 
"CON012407                               ( Amendment:  CON017074)", 
"CON012407                               ( Amendment:  CON017074)", 
"CON015103                         [CON012429 (CON number for this award - this is a supplement)]", 
"CON015103                         [CON012429 (CON number for this award - this is a supplement)]", 
"CON015103                         [CON012429 (CON number for this award - this is a supplement)]"
)), row.names = c(NA, -21L), class = c("tbl_df", "tbl", "data.frame"
), na.action = structure(22:24, names = c("22", "23", "24"), class = "omit"))

Does this do what you want?

DF <- structure(list(con_number = c("CON16552", "CON15607", "CON15607", 
                              "CON014592", "CON012146", "CON014085", "SP00012088", "SP00012088", 
                              "SP00012088", "CON016107/CON017440", "CON016107/CON017440", "CON016107/CON017440", 
                              "CON015304 (primary CON#)", "CON014838 (previous CON)", "CON015304 (primary CON#)", 
                              "CON012407                               ( Amendment:  CON017074)", 
                              "CON012407                               ( Amendment:  CON017074)", 
                              "CON012407                               ( Amendment:  CON017074)", 
                              "CON015103                         [CON012429 (CON number for this award - this is a supplement)]", 
                              "CON015103                         [CON012429 (CON number for this award - this is a supplement)]", 
                              "CON015103                         [CON012429 (CON number for this award - this is a supplement)]"
)), row.names = c(NA, -21L), class = c("tbl_df", "tbl", "data.frame"
), na.action = structure(22:24, names = c("22", "23", "24"), class = "omit"))
library(tidyverse)
DF <- DF |> mutate(New_con_number = str_extract(con_number, "^[:alnum:]+"))
DF[10:21,"New_con_number"] #Show problem rows
#> # A tibble: 12 × 1
#>    New_con_number
#>    <chr>         
#>  1 CON016107     
#>  2 CON016107     
#>  3 CON016107     
#>  4 CON015304     
#>  5 CON014838     
#>  6 CON015304     
#>  7 CON012407     
#>  8 CON012407     
#>  9 CON012407     
#> 10 CON015103     
#> 11 CON015103     
#> 12 CON015103

Created on 2023-06-07 with reprex v2.0.2

I'm unsure of your intent for rows 10-12

# more convenient to work with vectors in dealing
# with a single column data frame
v <- c(
  "CON16552", "CON15607", "CON15607",
  "CON014592", "CON012146", "CON014085", "SP00012088", "SP00012088",
  "SP00012088", "CON016107/CON017440", "CON016107/CON017440", "CON016107/CON017440",
  "CON015304 (primary CON#)", "CON014838 (previous CON)", "CON015304 (primary CON#)",
  "CON012407                               ( Amendment:  CON017074)",
  "CON012407                               ( Amendment:  CON017074)",
  "CON012407                               ( Amendment:  CON017074)",
  "CON015103                         [CON012429 (CON number for this award - this is a supplement)]",
  "CON015103                         [CON012429 (CON number for this award - this is a supplement)]",
  "CON015103                         [CON012429 (CON number for this award - this is a supplement)]"
)

# match everything from the first blank through
# the end of the string
omit <- " .*$"
# substitute the target string with nothing
# create a new data frame with the column
# name con_number
d <- data.frame(con_number = gsub(omit,"",v))
d
#>             con_number
#> 1             CON16552
#> 2             CON15607
#> 3             CON15607
#> 4            CON014592
#> 5            CON012146
#> 6            CON014085
#> 7           SP00012088
#> 8           SP00012088
#> 9           SP00012088
#> 10 CON016107/CON017440
#> 11 CON016107/CON017440
#> 12 CON016107/CON017440
#> 13           CON015304
#> 14           CON014838
#> 15           CON015304
#> 16           CON012407
#> 17           CON012407
#> 18           CON012407
#> 19           CON015103
#> 20           CON015103
#> 21           CON015103

FJCC's solution is perfect! Thanks to both of you!

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.