Here's a method that seems 75x faster; you could also add in the line I commented out to have the number part be a numeric type.
EDIT : I added a third which is a more direct vectorisation of your original code; its twice as fast but not as fast as the tidyverse, split and conquer approach
library(tidyverse)
datasmall <- data.frame(stringsAsFactors = FALSE,
Sepal.Length = c(5.1, 4.9, 4.7, 4.6, 5),
Sepal.Width = c("3.5 s", "3", "3.2 s", "3.1 t", "3.6")
)
data <- purrr::map_dfr(1:1000,\(x)datasmall)
first_f <- function(data){
data$code<-NA
for (i in 1:nrow(data)) {
if (any(str_detect(data[i,ncol(data)-1],letters))) {
data$code[i]<-str_sub(data[i,ncol(data)-1],start = -1)
data[i,ncol(data)-1]<-str_sub(data[i,ncol(data)-1],end = -3)
}else{data$code[i]<-NA}
}
data
}
second_f <- function(data){
grpd <- data |>
mutate(rn=row_number()) |>
split(~str_detect(Sepal.Width,"[a-z]"))
grpd$`FALSE` <- grpd$`FALSE` |> mutate(code = NA_character_)
grpd$`TRUE` <- grpd$`TRUE` |> separate_wider_delim(cols=Sepal.Width,
delim=" ",
names=c("Sepal.Width","code"))
bind_rows(grpd) |>
# mutate(Sepal.Width=readr::parse_number(Sepal.Width)) |>
arrange(rn) |> select(-rn)
}
third_f <- function(data){
data$code<-NA
testvec <- sapply(data[,ncol(data)-1],\(x)any(str_detect(x,letters)))
data$code[testvec] <-str_sub(data[testvec,ncol(data)-1],start = -1)
data[testvec,ncol(data)-1]<-str_sub(data[testvec,ncol(data)-1],end = -3)
data
}
library(bench)
mark(
first_f(data),
second_f(data),
third_f(data)
)
# A tibble: 3 × 13
expression min median `itr/sec` mem_alloc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt>
1 first_f(data) 875.4ms 875.4ms 1.14 231.98MB
2 second_f(data) 11.7ms 12.2ms 80.2 1.17MB
3 third_f(data) 378ms 382.6ms 2.61 1.78MB