Looping and subsetting long data

cactus · December 20, 2021, 10:50pm

I have a dataframe dfsales which I have converted into a long format : dfsaleslong.

dfsales <- data.frame(
  subjectid = c("a","b","c","d","e","f","g","h","i","j","k"),
  location = c("NY","NC","WA","WA","OR","CA","AR","KS","AZ","VT","MA"),
  month1 = c(NA,NA,1,0,0,2,1,1,0,0,0),
  month2 = c(NA,NA,0,0,0,0,NA,0,0,0,NA),
  month3 = c(NA,1,0,1,0,0,0,1,NA,NA,NA),
  month4 = c(0,0,0,0,0,1,2,0,1,NA,0),
  month5 = c(NA,NA,NA,NA,NA,NA,0,1,1,2,0),
  month6 = c(NA,NA,0,0,0,NA,NA,0,0,0,0),
  month7 = c(0,0,0,0,0,0,NA,0,0,0,0),
  goods1 = c(1,2,1,2,0,0,1,2,2,1,0),
  goods2 = c(0,0,1,2,1,1,2,2,1,0,0),
  goods3 = c(0,1,2,1,1,NA,2,1,2,1,NA),
  goods4 = c(0,1,2,1,1,1,2,2,NA,NA,NA),
  goods5 = c(0,1,0,1,1,1,2,2,1,NA,NA),
  goods6 = c(0,1,2,1,1,1,2,2,0,0,0),
  goods7 = c(NA,1,1,1,1,1,2,2,2,NA,NA),
  complain1 = c(0,0,0,0,0,0,0,0,0,0,0),
  complain2 = c(0,0,0,0,0,0,0,0,0,0,0),
  complain3 = c(0,1,0,0,0,0,1,0,0,0,0),
  complain4 = c(0,0,0,0,0,0,0,0,0,1,1),
  complain5 = c(0,0,0,0,0,0,0,0,0,0,0),
  complain6 = c(0,0,0,0,0,0,1,0,0,1,0),
  complain7 = c(0,2,0,0,0,0,2,0,0,0,1))

library(tidyr)
#> Warning: package 'tidyr' was built under R version 3.6.2
dfsaleslong <- pivot_longer(data = dfsales,cols = month1:complain7,
             names_pattern = "([^\\d]+)(\\d+)",
             names_to = c(".value","month_number"))
Created on 2021-12-20 by the reprex package (v2.0.1)

the column "complain" has three values (0,1,2). I want to create a function where for any of the subject, if the value of complain is either 1 or 2 (which means they have received complain), I want to delete and ignore all the data/rows after that for that individual.

The data is such that for the subject if the value of complain = 1, then they can again be 0 in the next months. So I couldn't use the simple code of deleting rows that have the value of 1 or 2.

Could someone help me do this?

FYI this is how I want my final data to look like (created manually)

dfsaleslong2 <- dfsaleslong[-c(10:14,45:49,67:70,74:77),]

JackDavison · December 20, 2021, 11:22pm

This will do it, using some tidyverse functions to find the lowest row number where a complaint has been made for each subject, and then filtering for the rownumber of the original data to be less than that number.

There may be a less verbose way to achieve this.

library(tidyverse)

dfsales <- data.frame(
  subjectid = c("a","b","c","d","e","f","g","h","i","j","k"),
  location = c("NY","NC","WA","WA","OR","CA","AR","KS","AZ","VT","MA"),
  month1 = c(NA,NA,1,0,0,2,1,1,0,0,0),
  month2 = c(NA,NA,0,0,0,0,NA,0,0,0,NA),
  month3 = c(NA,1,0,1,0,0,0,1,NA,NA,NA),
  month4 = c(0,0,0,0,0,1,2,0,1,NA,0),
  month5 = c(NA,NA,NA,NA,NA,NA,0,1,1,2,0),
  month6 = c(NA,NA,0,0,0,NA,NA,0,0,0,0),
  month7 = c(0,0,0,0,0,0,NA,0,0,0,0),
  goods1 = c(1,2,1,2,0,0,1,2,2,1,0),
  goods2 = c(0,0,1,2,1,1,2,2,1,0,0),
  goods3 = c(0,1,2,1,1,NA,2,1,2,1,NA),
  goods4 = c(0,1,2,1,1,1,2,2,NA,NA,NA),
  goods5 = c(0,1,0,1,1,1,2,2,1,NA,NA),
  goods6 = c(0,1,2,1,1,1,2,2,0,0,0),
  goods7 = c(NA,1,1,1,1,1,2,2,2,NA,NA),
  complain1 = c(0,0,0,0,0,0,0,0,0,0,0),
  complain2 = c(0,0,0,0,0,0,0,0,0,0,0),
  complain3 = c(0,1,0,0,0,0,1,0,0,0,0),
  complain4 = c(0,0,0,0,0,0,0,0,0,1,1),
  complain5 = c(0,0,0,0,0,0,0,0,0,0,0),
  complain6 = c(0,0,0,0,0,0,1,0,0,1,0),
  complain7 = c(0,2,0,0,0,0,2,0,0,0,1))

dfsaleslong <- pivot_longer(data = dfsales,cols = month1:complain7,
                            names_pattern = "([^\\d]+)(\\d+)",
                            names_to = c(".value","month_number"))

find_first_complaint = function(df){
  
  dat = df %>% 
    group_by(subjectid) %>% 
    mutate(n = row_number())
  
  first_complaint = dat %>% 
    mutate(complaint = complain != 0) %>% 
    group_by(subjectid) %>% 
    filter(complaint) %>% 
    filter(n == min(n)) %>% 
    select(subjectid, first_complaint = n)
  
  dat %>% 
    left_join(first_complaint) %>%
    mutate(first_complaint = replace_na(first_complaint, Inf)) %>% 
    filter(n < first_complaint) %>% 
    select(-first_complaint, -n) %>% 
    ungroup()
  
}

find_first_complaint(dfsaleslong)
#> Joining, by = "subjectid"
#> # A tibble: 59 x 6
#>    subjectid location month_number month goods complain
#>    <chr>     <chr>    <chr>        <dbl> <dbl>    <dbl>
#>  1 a         NY       1               NA     1        0
#>  2 a         NY       2               NA     0        0
#>  3 a         NY       3               NA     0        0
#>  4 a         NY       4                0     0        0
#>  5 a         NY       5               NA     0        0
#>  6 a         NY       6               NA     0        0
#>  7 a         NY       7                0    NA        0
#>  8 b         NC       1               NA     2        0
#>  9 b         NC       2               NA     0        0
#> 10 c         WA       1                1     1        0
#> # ... with 49 more rows

^{Created on 2021-12-20 by the reprex package (v2.0.1)}

cactus · December 21, 2021, 2:14pm

Thank you @JackDavison
This did the trick and I got what I needed !
I will try to replicate it with other conditions

system · December 28, 2021, 2:14pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.