I was also experimenting with joins, the problem is that on the cases where the periods overlap (one ends and the other begins) the join will duplicate rows. In this case, df_2_update has 24 rows (1994 duplicates) and the loop approach preserves row number.
library(microbenchmark)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
# data
df_1 <- tibble(
start = rep(seq(1990, 1994, 4), each = 2),
end = start + 4,
countryname = rep(c("SWE", "NOR"), 2),
group_id = rep(seq(1:2), each = 2)
)
df_2 <- tibble(
year = rep(seq(1989, 1999, 1), each = 2),
countryname = rep(c("SWE", "NOR"), 11),
value = rep(seq(100, 110, 1), each = 2),
group_id = NA_real_
)
# original approach
for (i in (1:nrow(df_1))) {
for (j in (1:nrow(df_2))) {
if (df_1[i, "countryname"] == df_2[j, "countryname"]) {
if (df_2[j, "year"] >= df_1[i, "start"] & df_2[j, "year"] <= df_1[i, "end"]) {
df_2[j, "group_id"] <- df_1[i, "group_id"]
}
}
}
}
# proposed alternative
df_1_long <- df_1 %>%
rowwise() %>%
mutate(year=list(start:end)) %>%
ungroup() %>%
unnest(year) %>%
select(-start, -end)
df_2_update <- df_2 %>%
select(-group_id) %>%
left_join(df_1_long, by=c("year", "countryname"))
# results
df_2 %>%
group_by(year) %>%
summarise(n())
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 11 x 2
#> year `n()`
#> <dbl> <int>
#> 1 1989 2
#> 2 1990 2
#> 3 1991 2
#> 4 1992 2
#> 5 1993 2
#> 6 1994 2
#> 7 1995 2
#> 8 1996 2
#> 9 1997 2
#> 10 1998 2
#> 11 1999 2
df_2_update %>%
group_by(year) %>%
summarise(n())
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 11 x 2
#> year `n()`
#> <dbl> <int>
#> 1 1989 2
#> 2 1990 2
#> 3 1991 2
#> 4 1992 2
#> 5 1993 2
#> 6 1994 4
#> 7 1995 2
#> 8 1996 2
#> 9 1997 2
#> 10 1998 2
#> 11 1999 2
Created on 2021-01-12 by the reprex package (v0.3.0)