Retrieve subsequent item from group using dplyr

noobie · November 2, 2020, 7:46pm

I have data which looks like

library(dplyr)
library(lubridate)
Date_Construct= c("10/03/2018 00:00", "10/03/2018 00:00","01/01/2016 00:00","21/03/2015 01:25", "21/03/2015 01:25",  "17/04/2016 00:00","17/04/2016 00:00", "20/02/2012 00:00","20/02/2020 00:00")
Date_first_use = c("02/08/2018 00:00","02/08/2018 00:00", "01/04/2016 00:00","NA", "NA", "NA", "NA","13/08/2012 00:00","20/04/2020 00:00")
Date_fail = c("02/08/2019 00:00","02/08/2019 00:00", "21/06/2018 06:42","NA" , "NA" , "17/04/2016 00:00", "17/04/2016 00:00","13/08/2014 07:45","NA")
P_ID = c("0001", "0001" ,"0001" ,"0001", "0001","34000","34000","34000", "00425")
Comp_date= c("16/05/2019 00:00", "10/04/2018 12:55","25/06/2017 00:00","22/04/2015 00:00","08/05/2015 00:00" ,"04/05/2017 00:00" ,"15/07/2016 00:00","01/03/2014 00:00", "20/03/2020 00:00")
Type =  c("a","a","b","c","c","b","b","a","c")
Date_Construct= dmy_hm(Date_Construct)

dfq= data.frame(`P_ID`, `Type`, `Date_Construct`, `Date_first_use`,`Date_fail`, `Comp_date`)%>%
  arrange(P_ID, desc(Date_Construct))%>%
  group_by( P_ID,  Date_Construct, Type)%>%
  mutate(A_ID= cur_group_id())%>%
  select(P_ID,A_ID,Type, Date_Construct, Date_first_use, Date_fail, Comp_date)%>%
  mutate(across(contains("Date", ignore.case = TRUE), dmy_hm))  

View(dfq)

It is a data frame of different items (A_ID) of type a/b/c, created for different clients (P_ID), with date of construction, date of first use and date of failure. Each P_ID may have multiple A_ID, and each A_ID may have multiple Comp_date.

I need to supply a date for where Date_fail is NA, which is the Date_construct of the next constructed A_ID for the same P_ID.

i.e. Date_fail for P_ID 0001, A_ID 1 should be 2016-01-01 00:00:00.

For A_ID which there are no subsequent A_ID (as is the case for P_ID 00425, A_ID 4), the Date_fail should remain NA .
So result should look like:

  P_ID  A_ID Type      Date_Construct      Date_first_use Date_fail            Comp_date                   
1 0001	1	   c	   2015-03-21 01:25:00	NA	           2016-01-01 00:00:00	2015-04-22 00:00:00	 	      
2 0001	1	   c	   2015-03-21 01:25:00	NA	           2016-01-01 00:00:00	2015-05-08 00:00:00	 	      
3 0001	2	   b	   2016-01-01 00:00:00	2016-04-01	   2018-06-21 06:42:00	2017-06-25 00:00:00	       
4 0001	3	   a	   2018-03-10 00:00:00	2018-08-02	   2019-08-02 00:00:00	2019-05-16 00:00:00	 	      
5 0001	3	   a	   2018-03-10 00:00:00	2018-08-02	   2019-08-02 00:00:00	2018-04-10 12:55:00	      	
6 00425	4      c	   2020-02-20 00:00:00	2020-04-20	   NA                   2020-03-20 00:00:00	 	      
7 34000	5	   a	   2012-02-20 00:00:00	2012-08-13	   2014-08-13 07:45:00	2014-03-01 00:00:00	 	      
8 34000	6	   b	   2016-04-17 00:00:00	NA	           2016-04-17 00:00:00	2017-05-04 00:00:00	 	      
9 34000	6	   b	   2016-04-17 00:00:00	NA	           2016-04-17 00:00:00	2016-07-15 00:00:00

I tried this, which I thought worked, but it is just given me the Date_Construct of the next row in the group, which isn't correct as some A_ID have multiple entries:

  arrange(P_ID, Date_Construct)%>%
  group_by(P_ID,) %>%
  mutate(Date_fail2 = sort(Date_Construct, decreasing = FALSE)[row_number(Date_Construct) + 1])%>%
  mutate(Date_fail = if_else( is.na(Date_fail), paste(Date_fail2), paste(Date_fail)))

I'm ideally looking for a dplyr/tidyverse solution as I find them easier to understand and reproduce.

technocrat · November 2, 2020, 11:14pm

I had a difficult time following the logic. The snippet below represents the object that I think you would benefit working from

suppressPackageStartupMessages({
  library(dplyr)
  library(lubridate)
  })
Date_construct <- dmy_hm(c("10/03/2018 00:00", "10/03/2018 00:00", "01/01/2016 00:00", "21/03/2015 01:25", "21/03/2015 01:25", "17/04/2016 00:00", "17/04/2016 00:00", "20/02/2012 00:00", "20/02/2020 00:00"))
Date_first_use <- dmy_hm(c("02/08/2018 00:00", "02/08/2018 00:00", "01/04/2016 00:00", NA, NA, NA, NA, "13/08/2012 00:00", "20/04/2020 00:00"))
Date_fail <- dmy_hm(c("02/08/2019 00:00", "02/08/2019 00:00", "21/06/2018 06:42", NA, NA, "17/04/2016 00:00", "17/04/2016 00:00", "13/08/2014 07:45", NA))
P_ID <- c("0001", "0001", "0001", "0001", "0001", "34000", "34000", "34000", "00425")
Date_comp <- dmy_hm(c("16/05/2019 00:00", "10/04/2018 12:55", "25/06/2017 00:00", "22/04/2015 00:00", "08/05/2015 00:00", "04/05/2017 00:00", "15/07/2016 00:00", "01/03/2014 00:00", "20/03/2020 00:00"))
Type <- c("a", "a", "b", "c", "c", "b", "b", "a", "c")

dfq <- data.frame(P_ID, Type, Date_construct, Date_first_use, Date_fail, Date_comp)

dfq %>% 
  group_by(P_ID) %>%
  mutate(A_ID = cur_group_id()) %>%
  select(P_ID, A_ID, everything(), -Date_comp)
#> # A tibble: 9 x 6
#> # Groups:   P_ID [3]
#>   P_ID   A_ID Type  Date_construct      Date_first_use      Date_fail          
#>   <chr> <int> <chr> <dttm>              <dttm>              <dttm>             
#> 1 0001      1 a     2018-03-10 00:00:00 2018-08-02 00:00:00 2019-08-02 00:00:00
#> 2 0001      1 a     2018-03-10 00:00:00 2018-08-02 00:00:00 2019-08-02 00:00:00
#> 3 0001      1 b     2016-01-01 00:00:00 2016-04-01 00:00:00 2018-06-21 06:42:00
#> 4 0001      1 c     2015-03-21 01:25:00 NA                  NA                 
#> 5 0001      1 c     2015-03-21 01:25:00 NA                  NA                 
#> 6 34000     3 b     2016-04-17 00:00:00 NA                  2016-04-17 00:00:00
#> 7 34000     3 b     2016-04-17 00:00:00 NA                  2016-04-17 00:00:00
#> 8 34000     3 a     2012-02-20 00:00:00 2012-08-13 00:00:00 2014-08-13 07:45:00
#> 9 00425     2 c     2020-02-20 00:00:00 2020-04-20 00:00:00 NA

^{Created on 2020-11-02 by the reprex package (v0.3.0.9001)}

AlexisW · November 3, 2020, 10:21pm

First, an assumption that you seem to make is that all {P_ID,A_ID} combinations have the same Date_fail. If that's not the case, you'll have to add a few steps.

Then, your first solution is a good idea (at least it was also my first idea), but as technocrat pointed out there is something that doesn't work if I just copy/paste your code. Also, you can use the function dplyr::lead() to simplify your selection of the next row:

Date_Construct <- dmy_hm(c("10/03/2018 00:00", "10/03/2018 00:00", "01/01/2016 00:00", "21/03/2015 01:25", "21/03/2015 01:25", "17/04/2016 00:00", "17/04/2016 00:00", "20/02/2012 00:00", "20/02/2020 00:00"))
Date_fail <- dmy_hm(c("02/08/2019 00:00", "02/08/2019 00:00", "21/06/2018 06:42", NA, NA, "17/04/2016 00:00", "17/04/2016 00:00", "13/08/2014 07:45", NA))
P_ID <- c("0001", "0001", "0001", "0001", "0001", "34000", "34000", "34000", "00425")
Type <- c("a", "a", "b", "c", "c", "b", "b", "a", "c")

dfq <- data.frame(P_ID, Type, Date_Construct,Date_fail) %>%
  arrange(P_ID, desc(Date_Construct)) %>%
  group_by( P_ID,  Date_Construct, Type)%>%
  mutate(A_ID= cur_group_id()) %>%
  select(P_ID,A_ID,Type, Date_Construct, Date_fail)

dfq %>%
  group_by(P_ID) %>%
  arrange(P_ID,A_ID) %>%
  mutate(Date_fail2 = if_else(is.na(Date_fail), lead(Date_fail), Date_fail))
# A tibble: 9 x 6
# Groups:   P_ID [3]
#  P_ID   A_ID Type  Date_Construct      Date_fail           Date_fail2         
#   <chr> <int> <chr> <dttm>              <dttm>              <dttm>             
# 1 0001      1 c     2015-03-21 01:25:00 NA                  NA                 
# 2 0001      1 c     2015-03-21 01:25:00 NA                  2018-06-21 06:42:00
# 3 0001      2 b     2016-01-01 00:00:00 2018-06-21 06:42:00 2018-06-21 06:42:00
# 4 0001      3 a     2018-03-10 00:00:00 2019-08-02 00:00:00 2019-08-02 00:00:00
# 5 0001      3 a     2018-03-10 00:00:00 2019-08-02 00:00:00 2019-08-02 00:00:00
# 6 00425     4 c     2020-02-20 00:00:00 NA                  NA                 
# 7 34000     5 a     2012-02-20 00:00:00 2014-08-13 07:45:00 2014-08-13 07:45:00
# 8 34000     6 b     2016-04-17 00:00:00 2016-04-17 00:00:00 2016-04-17 00:00:00
# 9 34000     6 b     2016-04-17 00:00:00 2016-04-17 00:00:00 2016-04-17 00:00:00

Now, to solve that problem, let's make the data.frame more compact: let's make a data.frame that has a row for each combination of {P_ID,A_ID}, that way we can simply take the next line:

dfq %>%
  ungroup() %>%
  select(P_ID, A_ID, Date_fail) %>%
  distinct() %>%
  group_by(P_ID) %>%
  arrange(P_ID, A_ID) %>%
  mutate(Date_fail2 = if_else(is.na(Date_fail), lead(Date_fail), Date_fail))
# A tibble: 6 x 4
# Groups:   P_ID [3]
#   P_ID   A_ID Date_fail           Date_fail2         
#   <chr> <int> <dttm>              <dttm>             
# 1 0001      1 NA                  2018-06-21 06:42:00
# 2 0001      2 2018-06-21 06:42:00 2018-06-21 06:42:00
# 3 0001      3 2019-08-02 00:00:00 2019-08-02 00:00:00
# 4 00425     4 NA                  NA                 
# 5 34000     5 2014-08-13 07:45:00 2014-08-13 07:45:00
# 6 34000     6 2016-04-17 00:00:00 2016-04-17 00:00:00

And now we can join it back with the original dfq, replacing the existing Date_fail column with the new, completed, one:

dfq %>%
  ungroup() %>%
  select(P_ID, A_ID, Date_fail) %>%
  distinct() %>%
  group_by(P_ID) %>%
  arrange(P_ID, A_ID) %>%
  mutate(Date_fail2 = if_else(is.na(Date_fail), lead(Date_fail), Date_fail)) %>%
  right_join(dfq, by=c("P_ID","A_ID")) %>%
  select(-Date_fail.x, -Date_fail.y, Date_fail = Date_fail2)
# A tibble: 9 x 5
# Groups:   P_ID [3]
#   P_ID   A_ID Date_fail           Type  Date_Construct     
#   <chr> <int> <dttm>              <chr> <dttm>             
# 1 0001      1 2018-06-21 06:42:00 c     2015-03-21 01:25:00
# 2 0001      1 2018-06-21 06:42:00 c     2015-03-21 01:25:00
# 3 0001      2 2018-06-21 06:42:00 b     2016-01-01 00:00:00
# 4 0001      3 2019-08-02 00:00:00 a     2018-03-10 00:00:00
# 5 0001      3 2019-08-02 00:00:00 a     2018-03-10 00:00:00
# 6 00425     4 NA                  c     2020-02-20 00:00:00
# 7 34000     5 2014-08-13 07:45:00 a     2012-02-20 00:00:00
# 8 34000     6 2016-04-17 00:00:00 b     2016-04-17 00:00:00
# 9 34000     6 2016-04-17 00:00:00 b     2016-04-17 00:00:00

(I know, I'm a monster who uses right_join(), but that's only out of laziness to not save the intermediary table, you might want to do that in a cleaner way; also make really sure that the number of rows at the end is the same as in dfq, possibly compare the content of Date_fail.x and Date_fail.y to make sure nothing went wrong)

noobie · November 4, 2020, 5:59pm

AlexisW:

dfq %>%
  ungroup() %>%
  select(P_ID, A_ID, Date_fail) %>%
  distinct() %>%
  group_by(P_ID) %>%
  arrange(P_ID, A_ID) %>%
  mutate(Date_fail2 = if_else(is.na(Date_fail), lead(Date_fail), Date_fail)) %>%
  right_join(dfq, by=c("P_ID","A_ID")) %>%
  select(-Date_fail.x, -Date_fail.y, Date_fail = Date_fail2)

Thank you. That worked. though needed to switch lead(Date_fail) to lead(Date_consruct):

dfq1 = dfq %>%
  ungroup() %>%
  select(P_ID, A_ID, Date_fail, Date_Construct) %>%
  distinct() %>%
  group_by(P_ID) %>%
  arrange(P_ID, A_ID) %>%
  mutate(Date_fail2 = if_else(is.na(Date_fail), lead(Date_Construct), Date_fail)) %>%
  left_join(dfq, by=c("P_ID","A_ID")) %>%
  select(-Date_fail.x, -Date_fail.y, Date_fail = Date_fail2, Date_Construct = Date_Construct.x, -Date_Construct.y)

I also found another solution, which is perhaps a bit neater, which nests the unused variables:

nested = dfq %>%
  ungroup() %>%
  arrange(P_ID, A_ID) %>%
  nest(extra = Comp_date)
dfq1= nested %>%
  group_by(P_ID) %>%
  mutate(Date_fail = coalesce(Date_fail, lead(Date_Construct))) %>%
  unnest(extra)

Both worked with the full data set

system · November 11, 2020, 5:59pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.