Drop the specific files in the filelist by conditions

I have time series data as follows:

L1_0101_0601_A L1_0101_0603_A L1_0101_0605_B L1_0101_0608_B L1_0101_0610_C L1_0101_0612_C L1_0101_0615_A L1_0101_0617_A L1_0101_0619_A L1_0101_0622_B L1_0101_0624_B L1_0101_0626_C L1_0101_0629_C L1_0101_0631_A L1_0101_0633_A L1_0101_0635_B L1_0101_0638_B L1_0101_0640_C L1_0101_0642_C L1_0101_0645_A L1_0101_0647_A L1_0101_0649_A and so on.

library(tidyverse)

# Sample of the file names; the trailing letter (A, B or C) is the group.
filelist <- c(
  "L1_0101_0601_A",
  "L1_0101_0603_A",
  "L1_0101_0605_B",
  "L1_0101_0608_B",
  "L1_0101_0610_C",
  "L1_0101_0612_C",
  "L1_0101_0615_A",
  "L1_0101_0617_A",
  "L1_0101_0619_A",
  "L1_0101_0622_B",
  "L1_0101_0624_B",
  "L1_0101_0626_C",
  "L1_0101_0629_C"
)

I want to keep files conditionally. For example, in the above filelist, I want to keep only two consecutive files ending in A, B, or C. Sometimes the filelist contains three consecutive files ending in A, B, or C. So I want to drop "L1_0101_0619_A" and "L1_0101_0649_A", because each is the third consecutive file ending in A, and keep the other two. There are several files like this that I would like to remove.

Can anyone help me?

This solution is strictly constrained by the data provided in which no more than three continuous files ending in the same letter occur.

# Sample input: file names whose trailing letter (A, B or C) marks the group.
filelist <- c("L1_0101_0601_A", "L1_0101_0603_A", "L1_0101_0605_B", 
              "L1_0101_0608_B", "L1_0101_0610_C", "L1_0101_0612_C",
              "L1_0101_0615_A", "L1_0101_0617_A", "L1_0101_0619_A",
              "L1_0101_0622_B", "L1_0101_0624_B", "L1_0101_0626_C",
              "L1_0101_0629_C")

# Prefix shared by every name; removing it leaves only the trailing letter.
pat <- "^L1_\\d{4}_\\d{4}_"

# only last character is used
(shortlist <- sub(pat, "", filelist))
#>  [1] "A" "A" "B" "B" "C" "C" "A" "A" "A" "B" "B" "C" "C"

# collapse consecutive identical letters into runs
(runs <- rle(shortlist))
#> Run Length Encoding
#>   lengths: int [1:6] 2 2 2 3 2 2
#>   values : chr [1:6] "A" "B" "C" "A" "B" "C"

# sequence() numbers each file within its own run (1, 2, 3, ...); every
# position past the second is an excess file, however long the run is
(drops <- which(sequence(runs$lengths) > 2L))
#> [1] 9

# subset out the excess occurrences using the computed positions; setdiff()
# is used instead of a negative index because an empty negative index would
# select nothing rather than everything
(pruned <- filelist[setdiff(seq_along(filelist), drops)])
#>  [1] "L1_0101_0601_A" "L1_0101_0603_A" "L1_0101_0605_B" "L1_0101_0608_B"
#>  [5] "L1_0101_0610_C" "L1_0101_0612_C" "L1_0101_0615_A" "L1_0101_0617_A"
#>  [9] "L1_0101_0622_B" "L1_0101_0624_B" "L1_0101_0626_C" "L1_0101_0629_C"

# wrap in a function

#' Build a subscript that trims every run of same-letter files to two
#'
#' The fixed `L1_dddd_dddd_` prefix is stripped from each name and the
#' remainder (the trailing group letter) is compared between neighbours.
#' Within each run of consecutive equal letters, every element after the
#' second is marked for removal.
#'
#' @param x Character vector of file names, in sequence order.
#' @return An index vector intended for `x[trim_to_two(x)]`: the negated
#'   positions of the surplus elements when there are any, otherwise
#'   `seq_along(x)` so that subsetting keeps the whole vector.
trim_to_two <- function(x) {
  runs <- rle(sub("^L1_\\d{4}_\\d{4}_", "", x))
  # Position of each element inside its own run: 1, 2, 3, ...
  pos_in_run <- sequence(runs$lengths)
  drops <- which(pos_in_run > 2L)
  if (length(drops) == 0L) {
    # A zero-length negative index selects nothing, which would discard
    # every file; return the full set of positions instead.
    return(seq_along(x))
  }
  -drops
}

# index the original list with the helper's result, then display it
pruned <- filelist[trim_to_two(filelist)]
pruned
#>  [1] "L1_0101_0601_A" "L1_0101_0603_A" "L1_0101_0605_B" "L1_0101_0608_B"
#>  [5] "L1_0101_0610_C" "L1_0101_0612_C" "L1_0101_0615_A" "L1_0101_0617_A"
#>  [9] "L1_0101_0622_B" "L1_0101_0624_B" "L1_0101_0626_C" "L1_0101_0629_C"

# second scenario: the list now opens with three "A" files in a row, so it
# holds two separate runs of three (positions 1-3 and 7-9)

filelist <- c(
  "L1_0101_0601_A", "L1_0101_0603_A", "L1_0101_0605_A",
  "L1_0101_0608_B", "L1_0101_0610_C", "L1_0101_0612_C",
  "L1_0101_0615_A", "L1_0101_0617_A", "L1_0101_0619_A",
  "L1_0101_0622_B", "L1_0101_0624_B", "L1_0101_0626_C",
  "L1_0101_0629_C"
)

pruned <- filelist[trim_to_two(filelist)]
pruned
#>  [1] "L1_0101_0601_A" "L1_0101_0603_A" "L1_0101_0608_B" "L1_0101_0610_C"
#>  [5] "L1_0101_0612_C" "L1_0101_0615_A" "L1_0101_0617_A" "L1_0101_0622_B"
#>  [9] "L1_0101_0624_B" "L1_0101_0626_C" "L1_0101_0629_C"

Created on 2023-01-16 with reprex v2.0.2

1 Like

@technocrat Thank you so much! It works perfectly.

1 Like

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.