Maximum Lag-Value of 8 quarters

Hi there,

I want to calculate the maximum lag value of the eight quarters before. Since my solution is long and wrong, I wanted to ask you people for some ideas.

library(dplyr)

So here is a alternated dataset of one id. As you can see I do not have information for every quarter for every id. In this case 1993/4 is missing.

df <- tibble::tibble(
  id = c(rep(1,14)),
  year = c(rep(1990,4),rep(1991,4),rep(1992,3),rep(1993,3)),
  quarter = c(rep(seq(1:4),2),rep(seq(1:3),2)),
  var1 = c(rep(1,14)),
  var2 = c(rep(0,2),rep(1,12)),
  var3 = c(rep(0,14)),
  var4 = c(rep(0,14)),
  var5 = c(rep(0,10),1,0,0,1),
  quarter_num = c(1:11,13:15)
)

I am interested if var1:var5 (will build the loop later) occurred in the past 8 quarters, but I can´t get my head around how to handle the missing quarter correctly:

df2 <- df %>% group_by(id) %>% 
  mutate(lag8 = ifelse(quarter_num-lag(quarter_num,8)-8 <= 0 | is.na(quarter_num-lag(quarter_num,8)-8) ,lag(var2,8),NA),
         lag7 = ifelse(quarter_num-lag(quarter_num,7)-7 <= 0 | is.na(quarter_num-lag(quarter_num,7)-7) ,lag(var2,7),NA),
         lag6 = ifelse(quarter_num-lag(quarter_num,6)-6 <= 0 | is.na(quarter_num-lag(quarter_num,6)-6) ,lag(var2,6),NA),
         lag5 = ifelse(quarter_num-lag(quarter_num,5)-5 <= 0 | is.na(quarter_num-lag(quarter_num,5)-5) ,lag(var2,5),NA),
         lag4 = ifelse(quarter_num-lag(quarter_num,4)-4 <= 0 | is.na(quarter_num-lag(quarter_num,4)-4) ,lag(var2,4),NA),
         lag3 = ifelse(quarter_num-lag(quarter_num,3)-3 <= 0 | is.na(quarter_num-lag(quarter_num,3)-3) ,lag(var2,3),NA),
         lag2 = ifelse(quarter_num-lag(quarter_num,2)-2 <= 0 | is.na(quarter_num-lag(quarter_num,2)-2) ,lag(var2,2),NA),
         lag1 = ifelse(quarter_num-lag(quarter_num,1)-1 <= 0 | is.na(quarter_num-lag(quarter_num,1)-1) ,lag(var2,1),NA),
         lag_var2 = pmax(lag1,lag2,lag3,lag4,lag5,lag6,lag7,lag8, na.rm = TRUE))

Any ideas or dplyr magic?
Thank you!

Hello,

Although I do not fully understand what you mean by "lag-value of 8 quarters", based off what I could understand, I've come up with this for now:

library(tidyverse)

df <- tibble::tibble(
  id = c(rep(1,14)),
  year = c(rep(1990,4),rep(1991,4),rep(1992,3),rep(1993,3)),
  quarter = c(rep(seq(1:4),2),rep(seq(1:3),2)),
  var1 = c(rep(1,14)),
  var2 = c(rep(0,2),rep(1,12)),
  var3 = c(rep(0,14)),
  var4 = c(rep(0,14)),
  var5 = c(rep(0,10),1,0,0,1),
  quarter_num = c(1:11,13:15)
)

#Create a complete dataset for year and quarter
completeData = data.frame(
  year = min(df$year):max(df$year),
  quarter = rep(1:4, each = length(min(df$year):max(df$year)))
) %>% arrange(year, quarter)

completeData
#>    year quarter
#> 1  1990       1
#> 2  1990       2
#> 3  1990       3
#> 4  1990       4
#> 5  1991       1
#> 6  1991       2
#> 7  1991       3
#> 8  1991       4
#> 9  1992       1
#> 10 1992       2
#> 11 1992       3
#> 12 1992       4
#> 13 1993       1
#> 14 1993       2
#> 15 1993       3
#> 16 1993       4

#Copy-paste for each ID
completeData = map_df(unique(df$id), function(myId){
  completeData %>% mutate(id = myId)
})

completeData
#>    year quarter id
#> 1  1990       1  1
#> 2  1990       2  1
#> 3  1990       3  1
#> 4  1990       4  1
#> 5  1991       1  1
#> 6  1991       2  1
#> 7  1991       3  1
#> 8  1991       4  1
#> 9  1992       1  1
#> 10 1992       2  1
#> 11 1992       3  1
#> 12 1992       4  1
#> 13 1993       1  1
#> 14 1993       2  1
#> 15 1993       3  1
#> 16 1993       4  1

#Join the rest of data (but crop missing quarters at start and end)
completeData = completeData %>% 
  left_join(df, by = c("year", "quarter", "id")) %>% 
  filter(between(row_number(), min(quarter_num, na.rm = T), 
                 max(quarter_num, na.rm = T)))

completeData
#>    year quarter id var1 var2 var3 var4 var5 quarter_num
#> 1  1990       1  1    1    0    0    0    0           1
#> 2  1990       2  1    1    0    0    0    0           2
#> 3  1990       3  1    1    1    0    0    0           3
#> 4  1990       4  1    1    1    0    0    0           4
#> 5  1991       1  1    1    1    0    0    0           5
#> 6  1991       2  1    1    1    0    0    0           6
#> 7  1991       3  1    1    1    0    0    0           7
#> 8  1991       4  1    1    1    0    0    0           8
#> 9  1992       1  1    1    1    0    0    0           9
#> 10 1992       2  1    1    1    0    0    0          10
#> 11 1992       3  1    1    1    0    0    1          11
#> 12 1992       4  1   NA   NA   NA   NA   NA          NA
#> 13 1993       1  1    1    1    0    0    0          13
#> 14 1993       2  1    1    1    0    0    0          14
#> 15 1993       3  1    1    1    0    0    1          15

#Look at presence of vars in last 8 quarters
completeData %>% group_by(id) %>% slice((n() - 7):n()) %>% 
  summarise(across(c(var1:var5), function(x){
    any(x == 1, na.rm = T)
  }))
#> # A tibble: 1 x 6
#>      id var1  var2  var3  var4  var5 
#>   <dbl> <lgl> <lgl> <lgl> <lgl> <lgl>
#> 1     1 TRUE  TRUE  FALSE FALSE TRUE

Created on 2021-08-25 by the reprex package (v2.0.1)

Hope this helps,
PJ

Thank you!
Except from the last part (where I have to clarify something) it worked for me. I only added one groub_by()-line for the cropping part:

completeData = completeData %>% 
  left_join(df, by = c("year", "quarter", "id")) %>%  
  group_by(id) %>%
  filter(between(row_number(), min(quarter_num, na.rm = T), 
                 max(quarter_num, na.rm = T)))

For the last part there is the question what I meant with "lag-value of 8 quarters". My intention was to store (mutate or whatever) what your code printed for the last line, but for each line.

completeData %>% group_by(id) %>% slice((n() - 8):n()-1) %>% 
  summarise(across(c(var1:var5), function(x){
    any(x == 1, na.rm = T)
  }))

     id var1  var2  var3  var4  var5 
  <int> <lgl> <lgl> <lgl> <lgl> <lgl>
1     9 TRUE  TRUE  FALSE FALSE TRUE 

Something like this:
I think it would be a mutate_at code, but don´t know how to include the sclicing part. It´s meant like has id:1 eaten fish in the 8 quarters before the current.

     id year  quarter var1  var2  var3  var4  var5 
  <int> <int> <int>   <lgl> <lgl> <lgl> <lgl> <lgl>
1     9 1990        1 NA    NA    NA    NA    NA
2     9 1990        2 TRUE  FALSE FALSE FALSE FALSE
2     9 1990        3 TRUE  FALSE FALSE FALSE FALSE
2     9 1990        2 TRUE  TRUE  FALSE FALSE FALSE
...
15    9 1993        3 TRUE  TRUE  FALSE FALSE TRUE

maybe something like this (doesn´t work):

completeData %>% group_by(id) %>%
    mutate_at(vars(c(var1:var5)), function(x){
         any(c(lag(x,8),lag(x,7),lag(x,6),lag(x,5),lag(x,4),lag(x,3),lag(x,2),lag(x,1)) == 1, na.rm = T)
  })

or

completeData %>% group_by(id) %>%
  mutate_at(vars(var1:var5), function(x){
    max(lag(x,8),lag(x,7),lag(x,6),lag(x,5),lag(x,4),lag(x,3),lag(x,2),lag(x,1), na.rm = T)
  })

Thank you, again!

Hi again,

I feel my code might not be the most optimal, but I do get the correct solution (I think) and it's running fast for the very small example, but will take a bit for a large dataset I think.

library(tidyverse)

df <- tibble::tibble(
  id = c(rep(1,14)),
  year = c(rep(1990,4),rep(1991,4),rep(1992,3),rep(1993,3)),
  quarter = c(rep(seq(1:4),2),rep(seq(1:3),2)),
  var1 = c(rep(1,14)),
  var2 = c(rep(0,2),rep(1,12)),
  var3 = c(rep(0,14)),
  var4 = c(rep(0,14)),
  var5 = c(rep(0,10),1,0,0,1),
  quarter_num = c(1:11,13:15)
)

#Create a complete dataset for year and quarter
completeData = data.frame(
  year = min(df$year):max(df$year),
  quarter = rep(1:4, each = length(min(df$year):max(df$year)))
) %>% arrange(year, quarter)

completeData
#>    year quarter
#> 1  1990       1
#> 2  1990       2
#> 3  1990       3
#> 4  1990       4
#> 5  1991       1
#> 6  1991       2
#> 7  1991       3
#> 8  1991       4
#> 9  1992       1
#> 10 1992       2
#> 11 1992       3
#> 12 1992       4
#> 13 1993       1
#> 14 1993       2
#> 15 1993       3
#> 16 1993       4

#Copy-paste for each ID
completeData = map_df(unique(df$id), function(myId){
  completeData %>% mutate(id = myId)
})

completeData
#>    year quarter id
#> 1  1990       1  1
#> 2  1990       2  1
#> 3  1990       3  1
#> 4  1990       4  1
#> 5  1991       1  1
#> 6  1991       2  1
#> 7  1991       3  1
#> 8  1991       4  1
#> 9  1992       1  1
#> 10 1992       2  1
#> 11 1992       3  1
#> 12 1992       4  1
#> 13 1993       1  1
#> 14 1993       2  1
#> 15 1993       3  1
#> 16 1993       4  1

#Join the rest of data (but crop missing quarters at start and end)
completeData = completeData %>% 
  left_join(df, by = c("year", "quarter", "id")) %>%
  group_by(id) %>% 
  filter(between(row_number(), min(quarter_num, na.rm = T), 
                 max(quarter_num, na.rm = T))) %>% 
  ungroup()

completeData
#> # A tibble: 15 x 9
#>     year quarter    id  var1  var2  var3  var4  var5 quarter_num
#>    <dbl>   <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>       <int>
#>  1  1990       1     1     1     0     0     0     0           1
#>  2  1990       2     1     1     0     0     0     0           2
#>  3  1990       3     1     1     1     0     0     0           3
#>  4  1990       4     1     1     1     0     0     0           4
#>  5  1991       1     1     1     1     0     0     0           5
#>  6  1991       2     1     1     1     0     0     0           6
#>  7  1991       3     1     1     1     0     0     0           7
#>  8  1991       4     1     1     1     0     0     0           8
#>  9  1992       1     1     1     1     0     0     0           9
#> 10  1992       2     1     1     1     0     0     0          10
#> 11  1992       3     1     1     1     0     0     1          11
#> 12  1992       4     1    NA    NA    NA    NA    NA          NA
#> 13  1993       1     1     1     1     0     0     0          13
#> 14  1993       2     1     1     1     0     0     0          14
#> 15  1993       3     1     1     1     0     0     1          15

#Look at presence of vars in last 8 quarters
yearQuarter = completeData %>% select(year, quarter) %>% distinct()

#For each quarter, look per id for the 8 previous ones (if present)
result = map_df(1:nrow(yearQuarter), function(i){
  
  completeData %>% 
    filter(year <= yearQuarter$year[i], 
           quarter <= yearQuarter$quarter[i] | year < yearQuarter$year[i]) %>% 
    group_by(id) %>% slice(max(n() - 7, 1):n()) %>% 
    summarise(
      year = year[n()],
      quarter = quarter[n()],
      quarter_num = quarter_num[n()],
      if(is.na(var1[n()])){
        across(c(var1:var5), ~ NA)
      } else {
        across(c(var1:var5), function(x){
          any(x == 1, na.rm = T)
        })
      }      ,
      .groups = "drop")
  
}) %>% arrange(id, year, quarter)

result
#> # A tibble: 15 x 9
#>       id  year quarter quarter_num var1  var2  var3  var4  var5 
#>    <dbl> <dbl>   <int>       <int> <lgl> <lgl> <lgl> <lgl> <lgl>
#>  1     1  1990       1           1 TRUE  FALSE FALSE FALSE FALSE
#>  2     1  1990       2           2 TRUE  FALSE FALSE FALSE FALSE
#>  3     1  1990       3           3 TRUE  TRUE  FALSE FALSE FALSE
#>  4     1  1990       4           4 TRUE  TRUE  FALSE FALSE FALSE
#>  5     1  1991       1           5 TRUE  TRUE  FALSE FALSE FALSE
#>  6     1  1991       2           6 TRUE  TRUE  FALSE FALSE FALSE
#>  7     1  1991       3           7 TRUE  TRUE  FALSE FALSE FALSE
#>  8     1  1991       4           8 TRUE  TRUE  FALSE FALSE FALSE
#>  9     1  1992       1           9 TRUE  TRUE  FALSE FALSE FALSE
#> 10     1  1992       2          10 TRUE  TRUE  FALSE FALSE FALSE
#> 11     1  1992       3          11 TRUE  TRUE  FALSE FALSE TRUE 
#> 12     1  1992       4          NA NA    NA    NA    NA    NA   
#> 13     1  1993       1          13 TRUE  TRUE  FALSE FALSE TRUE 
#> 14     1  1993       2          14 TRUE  TRUE  FALSE FALSE TRUE 
#> 15     1  1993       3          15 TRUE  TRUE  FALSE FALSE TRUE

Created on 2021-08-25 by the reprex package (v2.0.1)

I tested it for datasets with multiple ids (and potentially different missing data points) and it seems to hold up

Hope this helps,
PJ

This is great, thanks for your help! I really appreciate it!
Also, after a year with R, I finally get a glimpse in how functions work. I will definitely look into this topic.

I have made some minor changes, though.
I altered the if-statement. Now it gets NA for the first row because we do not have previous data and the rows in between get values because we do have previous data. Also I changed x in the any-statement to any(x[1:length(x)-1] to only include previous data.

#For each quarter, look per id for the 8 previous ones (if present)
result = map_df(1:nrow(yearQuarter), function(i){
  
  completeData3 %>% 
    filter(year <= yearQuarter$year[i], 
           quarter <= yearQuarter$quarter[i] | year < yearQuarter$year[i]) %>% 
    group_by(id) %>% slice(max(n() - 8, 1):n()) %>% 
    summarise(
      year = year[n()],
      quarter = quarter[n()],
      quarter_num = quarter_num[n()],
      if(n()==1){ 
          across(c(diagn_1:diagn_5), ~ NA)
      } else {
        across(c(diagn_1:diagn_5), function(x){
          any(x[1:length(x)-1] == 1, na.rm = T)
        })
      } ,
      .groups = "drop")
  
}) %>% arrange(id, year, quarter)

The test-run looked good, but as you said, for my dataset I guess it will take the night.
Thanks for all!
Rapha

1 Like

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.