I think this worked, and I think it should scale nicely once I expand to the full database of ~1M rows.
(BTW, is there a better way to calculate capped_count_chg in one step?)
[Code edited to fix earlier error in capped_count.]
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
# Small sample
memberships <- tibble(
memberID = c("A", "A", "A", "B", "C"),
membershipID = 1:5 %>% as.factor,
start_date = ymd(c(20100101, 20101120, 20120415, 20110605, 20100210)),
end_date = ymd(c(20101231, 20111231, 20130430, 20120531, 20110228)),
mo_dur = interval(start_date, end_date) %>%
as.duration() / dyears() * 12)
# # Large randomized sample
# n <- 100
# memberships <- tibble(
# memberID = sample(LETTERS[1:26], n, replace=TRUE),
# membershipID = 1:n %>% as.factor,
# start_date = sample(seq(ymd(20100101), ymd(20171231), by="day"), n, replace=TRUE),
# end_date = start_date + runif(min = 350, max = 380, n=n),
# mo_dur = interval(start_date, end_date) %>%
# as.duration() / dyears() * 12)
memberships_capped <- memberships %>%
group_by(memberID) %>%
gather(status, date, start_date, end_date) %>%
mutate(count_chg = if_else(status == "start_date", 1, -1)) %>%
arrange(date) %>%
mutate(count = cumsum(count_chg),
capped_count = if_else(count > 1, 1, count),
capped_count_chg = capped_count - lag(capped_count),
capped_count_chg = if_else(is.na(capped_count_chg), 1, capped_count_chg)) %>%
ungroup()
memberships_timeline <- memberships_capped %>%
arrange(date) %>%
group_by(date) %>%
summarize(count_chg = sum(count_chg),
capped_count_chg = sum(capped_count_chg)) %>%
mutate(count = cumsum(count_chg),
capped_count = cumsum(capped_count_chg))
ggplot(memberships) +
geom_segment(aes(x = start_date, xend = end_date,
y = memberID, yend = memberID ),
arrow = arrow(length = unit(0.03, "npc"))) +
geom_point(aes(start_date, memberID)) +
geom_text(vjust = -0.5, hjust=0, size = 3,
aes(x = start_date, y = memberID,
label = paste(round(mo_dur, 2), "months")))

ggplot(memberships_timeline, aes(date)) +
geom_step(aes(y=count), lty="dashed") +
geom_step(aes(y=capped_count)) +
labs(subtitle="Membership count: dashed line\nUnique member count: solid line", y="", x="")
