Hi NargizB ,
The following is not completely fulfilling your requirements, but I created the following approach so that you can see step-by-step how the result is achieved. Hopefully this brings you forward by tweaking the code such that it completely satisfies your requirements.
library(tidyverse)
df <- tibble(client = c(1, 1, 1, 1, 1, 2, 2, 3, 3),
year_from = c(1993, 1998, 2000,2001, 2008, 2007, 2007, 2006, 2020),
year_to = c(1995, 2020, 2005, 2021, 2010, 2010, 2014, 2017, 2020),
)
df
#> # A tibble: 9 x 3
#> client year_from year_to
#> <dbl> <dbl> <dbl>
#> 1 1 1993 1995
#> 2 1 1998 2020
#> 3 1 2000 2005
#> 4 1 2001 2021
#> 5 1 2008 2010
#> 6 2 2007 2010
#> 7 2 2007 2014
#> 8 3 2006 2017
#> 9 3 2020 2020
df %>%
## ensure that data is sorted correctly
arrange(client, year_from, year_to) %>%
group_by(client) %>%
mutate(## ensure that NA values are replaced with `FALSE`
y_from_overlapping = if_else(lag(year_from) <= year_from & year_from <= lag(year_to), TRUE, FALSE, missing = FALSE),
y_to_overlapping = if_else(lag(year_from) <= year_to & year_to <= lag(year_to), TRUE, FALSE, missing = FALSE),
## notice also the explicit use of `ifelse()` to bypass the error thrown when using `if_else()`
period_start = ifelse(y_from_overlapping == TRUE, NA, year_from),
period_end = ifelse(y_to_overlapping == TRUE, NA, year_to),
) %>%
## replace NA values with values from previous row
fill(period_start, period_end) %>%
## group per client the different periods
group_by(client, period_start) %>%
## select the first and last year for a specific period
mutate(period_min = min(period_start),
period_max = max(period_end),
) %>%
ungroup() %>%
select(client, year_from = period_min, year_to = period_max) %>%
## remove duplicate rows
distinct()
#> # A tibble: 5 x 3
#> client year_from year_to
#> <dbl> <dbl> <dbl>
#> 1 1 1993 1995
#> 2 1 1998 2021
#> 3 2 2007 2014
#> 4 3 2006 2017
#> 5 3 2020 2020
Created on 2021-02-19 by the reprex package (v1.0.0)
HTH