Hi there
I have a scenario where I have to do FIFO (First in, First out) parceling on investment transactions. Basically just purchases and sales. I need to determine how long it took for each purchase to be sold out.
I have code that works and gives me the correct answers (I think), but the problem is that it takes very long to run if I do it on all the data that I need it for. A position is a Portfolio and Instrument combination, and I start off with about 60,000 positions. This I then reduce to about 13,000 based on if the position had any holdings on or after a certain date. Not really relevant to the code I'm trying to improve.
The 13,000 positions that I'm left with has around 760,000 transactions, and it takes my code more than 2 hours to process. The data included in the reprex does not really test the speed of the code, so if anybody is interested in a much bigger sample set, please let me know how I can share the file.
I hope my explanation and code makes sense.
I'm really looking forward to see what the experts come up with.
Regards
Martin
Simple example:
P1 - Purchase - 100 units - day 1
S1 - Sale - 20 units - day 10
P2 - Purchase - 10 - day 15
S2 - Sale - 90 units - day 25
Results
P1 - S1 - 20 units - day 1 - day 10 - 9 days
P1 - S2 - 80 units - day 1 - day 25 - 24 days
P2 - S2 - 10 units - day 15 - day 25 - 10 days
library(tidyverse)
#trades_taxtrn <- readr::read_csv(here::here("data/trades_samp.csv"))
#trades_taxtrn <- trades_taxtrn %>% filter(portfolio_code == "BRHMFD", instrument_code %in% c("B01Y8L7")) %>% arrange(instrument_code, contract)
trades_taxtrn <- structure(list(portfolio_code = c("BRHMFD", "BRHMFD", "BRHMFD"
), instrument_code = c("B01Y8L7", "B01Y8L7", "B01Y8L7"), id_number = c(3034992,
3135096, 3171681), trade = c("P", "S", "S"), trans = c("03",
"04", "04"), short_name = c("OS-PUR", "OS-SAL", "OS-SAL"), contract = structure(c(17903,
18207, 18295), class = "Date"), units = c(604679.71, 371951,
232728.71), units_signed = c(604679.71, -371951, -232728.71),
units_remain = c(604679.71, -371951, -232728.71)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -3L), spec = structure(list(
cols = list(portfolio_code = structure(list(), class = c("collector_character",
"collector")), instrument_code = structure(list(), class = c("collector_character",
"collector")), id_number = structure(list(), class = c("collector_double",
"collector")), trade = structure(list(), class = c("collector_character",
"collector")), trans = structure(list(), class = c("collector_character",
"collector")), short_name = structure(list(), class = c("collector_character",
"collector")), contract = structure(list(format = ""), class = c("collector_date",
"collector")), units = structure(list(), class = c("collector_double",
"collector")), units_signed = structure(list(), class = c("collector_double",
"collector")), units_remain = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
positions <- trades_taxtrn %>%
group_by(
portfolio_code, instrument_code
) %>%
nest()
.holding_date <- as.Date("2019-01-01")
check_holdings <- function(trades, .holding_date){
ret <- FALSE
units_pre <- sum(trades[trades$contract <= .holding_date, ] $units_signed, na.rm = TRUE)
pur_post <- nrow(trades[trades$contract > .holding_date, ])
if(abs(units_pre) > 1 | abs(pur_post) > 0){
ret <- TRUE
}
return(ret)
}
tictoc::tic()
positions <- positions %>%
mutate(
have_holdings = purrr::map_lgl(.x = data, .f = check_holdings, .holding_date)
)
positions <- positions %>% filter(have_holdings)
tictoc::toc()
#> 0.01 sec elapsed
# parcelling --------------------------------------------------------------
process_parcels <- function(trades, .pid){
purchases <- trades %>%
filter(units_signed > 0) %>%
select(contract, purchase_id = id_number, purchase_date = contract, purchase_units = units) %>%
arrange(purchase_date, purchase_id)
if(nrow(purchases) == 0){
return(NULL)
}
sales <- trades %>%
filter(units_signed < 0) %>%
select(contract, sale_id = id_number, sale_date = contract, sale_units = units, units_remain) %>%
arrange(sale_date, sale_id)
if(nrow(sales) == 0){
return(NULL)
}
purchase_sales <- tibble::tribble(
~purchase_id, ~purchase_date, ~purchase_units_cum, ~sale_id, ~sale_date, ~sale_units, ~sale_units_org, ~purchase_units_left,
1, as.Date(Sys.Date()), 1, 1, as.Date(Sys.Date()), 1, 1, 1
)
purchase_sales <- purchase_sales %>% filter(purchase_id != 1)
process_sale <- function(df, .sale_num, .sale_units){
df <- df %>%
arrange(purchase_date, purchase_id) %>%
mutate(
sale_units_org = .sale_units,
sale_x = -1*(purchase_units - (purchase_units_cum - .sale_units))
)
df_x <- df %>%
filter(
sale_x < 0
)
df_x <- df_x %>%
mutate(
sale_units = case_when(
.sale_units > purchase_units_cum ~ -1*purchase_units,
sale_x < 0 ~ sale_x,
TRUE ~ as.double(0)
),
purchase_units_left = purchase_units + sale_units
)
return(df_x)
}
xx <- purchases
i <- 1
for(i in 1:length(sales$sale_id)){
xx <- xx %>% mutate(
purchase_units_cum = cumsum(purchase_units),
rid = row_number()
)
#print(i)
.sale_units <- sales$sale_units[[i]]
.sale_id <- sales$sale_id[[i]]
.sale_num <- i
rids <- xx %>% filter(purchase_units_cum < .sale_units) %>% pull(rid)
if(length(rids) == 0){
row_i <- 1
} else {
row_i <- max(rids) + 1
}
qualified_purchases <- xx %>% filter(rid <= row_i)
df_p <- process_sale(df = qualified_purchases, .sale_num, .sale_units)
df_p$sale_id <- .sale_id
df_p$sale_date <- sales$sale_date[[i]]
df_p_s <- df_p %>% select(purchase_id, purchase_date, purchase_units_cum, sale_id, sale_date, sale_units, sale_units_org, purchase_units_left)
purchase_sales <- bind_rows(purchase_sales, df_p_s)
row_x <- max(df_p %>% pull(rid))
xx <- xx %>% filter(rid >= row_x)
df_p <- df_p %>% filter(purchase_units_left > 0)
if(nrow(df_p) == 0){
xx$purchase_units[[1]] <- 0
xx <- xx %>% filter(purchase_units > 0)
} else {
xx$purchase_units[[1]] <- df_p$purchase_units_left
}
}
return(purchase_sales)
}
# run process parcesl -----------------------------------------------------
tictoc::tic()
positions <- positions %>%
ungroup() %>%
mutate(
pid = row_number()
) %>%
mutate(
parcels = purrr::map2(.x = data, .y = pid, .f = process_parcels)
)
tictoc::toc()
#> 0.27 sec elapsed
positions %>% select(portfolio_code, instrument_code, parcels) %>% unnest(cols = parcels)
#> # A tibble: 2 x 10
#> portfolio_code instrument_code purchase_id purchase_date purchase_units_~
#> <chr> <chr> <dbl> <date> <dbl>
#> 1 BRHMFD B01Y8L7 3034992 2019-01-07 604680.
#> 2 BRHMFD B01Y8L7 3034992 2019-01-07 232729.
#> # ... with 5 more variables: sale_id <dbl>, sale_date <date>, sale_units <dbl>,
#> # sale_units_org <dbl>, purchase_units_left <dbl>
Created on 2020-09-15 by the reprex package (v0.3.0)