dplyr/vectorised approach to computing on lagged value

ok, here is my solution,
I made adjustment to the original 'slow solution' that I felt were required for the algorithm to be correct and relatively efficient in its way of working. I then used Rcpp to make a custom C++ function which is like cumulative sum except throttled to positive number results only.
When I benchmark the two approaches (and ignore the Rcpp function compilation time (which might be discountable if you store it and load it)) then the new approach is about 500 times faster...

library(tidyverse)

roll_fun_long <- function(., ...) {
  for (i in 2:nrow(.)) {
    .[i, "vol_c"] <- max(0, .[[i - 1, "vol_c"]] - 500 + .[[i - 1, "v_1"]])

    # .[i, "vol_c"] <- ifelse((.[i-1, "vol_c"] - 500)  < 0,
    #                         0 + .[i, "vol_c"],
    #                         .[i-1, "vol_c"] - 500+ .[i, "v_1"])
  }
  return(.)
}

sample_data0 <- data.frame(
  stringsAsFactors = FALSE,
  id = c("A", "A", "A", "A", "B", "B", "B", "B", "C", "C"),
  date = c(
    "2010-01-01",
    "2010-01-05",
    "2010-02-01", "2010-05-01", "2010-01-01", "2010-03-15",
    "2010-04-02", "2010-04-08", "2010-04-03", "2010-05-18"
  ),
  v_1 = c(5000, 1000, 9800, 3000, 0, 7000, 8000, 9000, 4000, 9000)
) %>% mutate(vol_c = 0)



sample_data <- mutate(sample_data0,
  date = as.Date(date)
) %>%
  group_by(id) %>%
  complete(id,
    date = seq.Date(min(date),
      max(date),
      by = "day"
    ),
    fill = list(
      v_1 = 0,
      vol_c = 0
    )
  )




library("Rcpp")
cppFunction("
NumericVector cumsum_only_pos(NumericVector x){
 	// initialize an accumulator variable
 	double acc = 0;
 	
 	// initialize the result vector
 	NumericVector res(x.size());
 	
 	for(int i = 0; i < x.size(); i++){
 		acc += x[i];
 		acc = std::max(acc,0.0);
 		res[i] = acc;
 	}
 	return res;
 }")
# cumsum_only_pos(c(25,0,10,1)-10)
# cumsum(c(25,0,10,1) -10)


microbenchmark::microbenchmark(
  orig_way = orig_result <- sample_data %>% split(.$id) %>% map_df(roll_fun_long),
  new_way = new_result <- sample_data %>% mutate(vol_c = cumsum_only_pos(replace_na(lag(v_1), 0) - 500)),
  times = 8L
)

all_equal(new_result, orig_result)
1 Like