The setup is presented in a very abstract fashion for my taste, so, at least for me as a programmer, it's hard to reason about. It also uses data.table syntax, which I haven't invested time in learning to read, so some things seem very strange to me (for example nm3, where `i.` is prepended to the column names); I'm not sure what that is doing or why.
But I tried a naive optimisation to process less data through the function, and for the example data, the results matched and it ran in half the time.
library(tidyverse)
library(lubridate)
library(data.table)
library(bench)
# Example data: 900 rows with identifier columns (Id, date1, date2, Category,
# Week), a random DR1 column drawn from 200:250, and 365 random columns named
# DRM001 ... DRM0365. Argument names are spelled out in full so nothing
# depends on partial argument matching.
df1 <- data.frame(
  Id = rep(1:5, length.out = 900),
  date1 = as.Date("2021-12-01"),
  date2 = rep(seq(as.Date("2021-01-01"), length.out = 450, by = 1), each = 2),
  Category = rep(c("ABC", "EFG"), length.out = 900),
  Week = rep(
    c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
      "Saturday", "Sunday"),
    length.out = 900
  ),
  DR1 = sample(200:250, 900, replace = TRUE),
  # An unnamed list argument is expanded into one column per list element.
  setNames(
    replicate(365, sample(0:900, 900), simplify = FALSE),
    paste0("DRM0", formatC(1:365, width = 2, format = "d", flag = "0"))
  )
)
# Look up the adjusted DRM value for one (Id, date, Category) combination.
#
# Steps:
#   1. For every DRM0* column compute DR1 - DRM and take the median of that
#      difference within each (Id, Category, Week) group.
#   2. Join the medians back onto the rows and add them to the original DRM
#      values, giving one "<column>_PV" column per DRM column.
#   3. Keep the rows matching `idd`, `dmda` and `CategoryChosse` and return the
#      column at position (days between `dmda` and min(date1)) + 6.
#
# Arguments:
#   df1             data frame with Id, date1, date2, Week, Category, DR1 and
#                   DRM0<n> columns. It is not modified.
#   idd             Id value to match.
#   dmda            date to match against date2 (parsed with ymd()).
#   CategoryChosse  Category value to match.
#
# Returns the values of the selected column for the matching rows.
return_values <- function(df1, idd, dmda, CategoryChosse) {
  # First idea: median, per group, of DR1 minus each DRM column.
  # as.data.table() returns a copy, so the `:=` below does not touch df1.
  dt1 <- as.data.table(df1)
  cols <- grep("^DRM0", colnames(dt1), value = TRUE)
  pv_cols <- paste0(cols, "_PV")
  med <- dt1[, (pv_cols) := DR1 - .SD, .SDcols = cols][
    , lapply(.SD, median), by = .(Id, Category, Week), .SDcols = pv_cols
  ]

  # Second idea: add the group medians to the original DRM values.
  f2 <- function(nm, pat) grep(pat, nm, value = TRUE)
  nm1 <- f2(names(df1), "^DRM0\\d+$")
  nm2 <- f2(names(med), "_PV")
  # In a data.table join, the "i." prefix refers to columns of the table given
  # in `i` (here: med).
  nm3 <- paste0("i.", nm2)
  # Join into a private copy. setDT(df1) would convert the argument by
  # reference and could therefore alter the caller's object.
  joined <- as.data.table(df1)
  joined[med, (nm2) := Map(`+`, mget(nm1), mget(nm3)), on = .(Id, Category, Week)]
  SPV <- joined[, c("Id", "date1", "date2", "Week", "Category", nm2), with = FALSE]

  # Third idea: pick the column by position. The five identifier columns come
  # first, hence the +6 offset.
  target_date <- ymd(dmda)
  col_pos <- as.numeric(target_date - ymd(min(df1$date1))) + 6
  coef <- SPV %>%
    filter(Id == idd, date2 == target_date, Category == CategoryChosse) %>%
    pull(col_pos)
  return(coef)
}
# Faster variant of return_values(): only the first `num_to_pull` DRM columns
# are processed, because the value returned at the end is taken from column
# position `num_to_pull` of the result table.
#
# Arguments:
#   df1             data frame with Id, date1, date2, Week, Category, DR1 and
#                   DRM0<n> columns. It is not modified.
#   idd             Id value to match.
#   dmda            date to match against date2 (parsed with ymd()).
#   CategoryChosse  Category value to match.
#
# Returns the values of the selected column for the matching rows.
# Stops with an error when the computed column position is below 1.
return_valuesX <- function(df1, idd, dmda, CategoryChosse) {
  # as.data.table() returns a copy, so the `:=` below does not touch df1.
  dt1 <- as.data.table(df1)
  target_date <- ymd(dmda)
  num_to_pull <- as.numeric(target_date - ymd(min(df1$date1))) + 6
  if (is.na(num_to_pull) || num_to_pull < 1) {
    stop("`dmda` does not map to a valid column position.", call. = FALSE)
  }

  # First idea: median, per group, of DR1 minus each retained DRM column.
  # head() never pads with NA when fewer than `num_to_pull` columns exist,
  # unlike `[1:num_to_pull]`.
  cols <- head(grep("^DRM0", colnames(dt1), value = TRUE), num_to_pull)
  pv_cols <- paste0(cols, "_PV")
  med <- dt1[, (pv_cols) := DR1 - .SD, .SDcols = cols][
    , lapply(.SD, median), by = .(Id, Category, Week), .SDcols = pv_cols
  ]

  # Second idea: add the group medians to the original DRM values.
  f2 <- function(nm, pat) grep(pat, nm, value = TRUE)
  nm1 <- head(f2(names(df1), "^DRM0\\d+$"), num_to_pull)
  # med only holds the retained "_PV" columns, so no further trimming is needed.
  nm2 <- f2(names(med), "_PV")
  # In a data.table join, the "i." prefix refers to columns of the table given
  # in `i` (here: med).
  nm3 <- paste0("i.", nm2)
  # Join into a private copy. setDT(df1) would convert the argument by
  # reference and could therefore alter the caller's object.
  joined <- as.data.table(df1)
  joined[med, (nm2) := Map(`+`, mget(nm1), mget(nm3)), on = .(Id, Category, Week)]
  SPV <- joined[, c("Id", "date1", "date2", "Week", "Category", nm2), with = FALSE]

  # Third idea: pick the column by position (five identifier columns first).
  coef <- SPV %>%
    filter(Id == idd, date2 == target_date, Category == CategoryChosse) %>%
    pull(num_to_pull)
  return(coef)
}
# Only rows whose date2 lies after date1 are evaluated.
subset_df1 <- subset(df1, date2 > date1)

# Time the original (a) against the reduced-column variant (b). Each one is
# called once per row of subset_df1, always against the full df1.
bench::mark(
  a = subset_df1 %>%
    rowwise() %>%
    mutate(result = return_values(df1, Id, date2, Category)) %>%
    select(-c(Week, starts_with("DR"))),
  b = subset_df1 %>%
    rowwise() %>%
    mutate(result = return_valuesX(df1, Id, date2, Category)) %>%
    select(-c(Week, starts_with("DR"))),
  iterations = 10
)
Timing Results
# A tibble: 2 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch> <bch:> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 a 32.6s 34.4s 0.0291 8.74GB 1.18 10 406 5.72m <rowwise_df> <Rprofmem> <bench_tm> <tibble>
2 b 14.9s 15.5s 0.0640 2.22GB 0.832 10 130 2.6m <rowwise_df> <Rprofmem> <bench_tm> <tibble>