Removing rows based on closest matched value in data frame

I have the following filtering problem using the example data below:
For every negative x, remove that row and the nearest row above it with matching y value.

d <- structure(list(
  x = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1),
  y = c(84, 28, 0, 112, 28, 112, 112, 28, 28, 112, 28, 28, 
        112, 28, 112, 112, 28, 28, 112, 112, 28, 28, 112, 112, 28, 112, 
        28, 28, 112, 28, 112, 28, 112, 112, 28)
), class = "data.frame", row.names = c(NA, -35L))

d$exclude <- FALSE
d$exclude[c(10, 13, 32:35)] <- TRUE
print(d)
#>     x   y exclude
#> 1   1  84   FALSE
#> 2   1  28   FALSE
#> 3   1   0   FALSE
#> 4   1 112   FALSE
#> 5   1  28   FALSE
#> 6   1 112   FALSE
#> 7   1 112   FALSE
#> 8   1  28   FALSE
#> 9   1  28   FALSE
#> 10  1 112    TRUE
#> 11  1  28   FALSE
#> 12  1  28   FALSE
#> 13 -1 112    TRUE
#> 14  1  28   FALSE
#> 15  1 112   FALSE
#> 16  1 112   FALSE
#> 17  1  28   FALSE
#> 18  1  28   FALSE
#> 19  1 112   FALSE
#> 20  1 112   FALSE
#> 21  1  28   FALSE
#> 22  1  28   FALSE
#> 23  1 112   FALSE
#> 24  1 112   FALSE
#> 25  1  28   FALSE
#> 26  1 112   FALSE
#> 27  1  28   FALSE
#> 28  1  28   FALSE
#> 29  1 112   FALSE
#> 30  1  28   FALSE
#> 31  1 112   FALSE
#> 32  1  28    TRUE
#> 33  1 112    TRUE
#> 34 -1 112    TRUE
#> 35 -1  28    TRUE

Created on 2020-02-14 by the reprex package (v0.3.0)

In example data d, that corresponds to removing rows 10, 13, 32, 33, 34, 35. My question is, is there a way to programatically create the exclude column? I've been looking around and trying to use combinations of helper functions such as Position(), which.min(), findInterval() to no avail. Any suggestions would be greatly appreciated!

thanks for the puzzle, I have some doubts about deleting the nearest previous "y matching " row for negative x, as I'm not sure if it should matter the x value of that candidate for deletion.. but my implementation at least 'works' when judged by the test data, and your expectation of rows it deletes.

Note that I could have done a simple for loop over each negative x that I wanted to find matches for, but I got cute and used the slider package which I was recently introduced to.

library(tidyverse)
library(slider)
 <- structure(list(
  x = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1),
  y = c(84, 28, 0, 112, 28, 112, 112, 28, 28, 112, 28, 28, 
        112, 28, 112, 112, 28, 28, 112, 112, 28, 28, 112, 112, 28, 112, 
        28, 28, 112, 28, 112, 28, 112, 112, 28)
), class = "data.frame", row.names = c(NA, -35L))

# step 1, harden up the rownumbers
d2 <- d %>% mutate(rownum = row_number())


 # step 2, extract the negative x's
 neg_x <- d2 %>% filter(x<0)

 # step 3, want search for the nearest row above with matching y
  # first lookup for each -ex the dataframe containing all rows before it that match its y value
 searches_to_perform <- slide(.x = neg_x,
       .f = ~ filter(slice(d2,1:.[["rownum"]]),y==.[["y"]]),
       )
 
 #step 4, the 2nd to last entry if it exists is what we want to mark for deletion
 # do we need an additional rule though, to insist that x == 1 for example ? in the test data x isnt not 1 , does this matter ?
 
 found_em <- map(searches_to_perform,~(tail(.,2)) %>% slice(1))
 
 #step 5 , make a list of rows to delete
 
 rows_to_del <-  c(bind_rows(found_em) %>% pull(rownum),pull(neg_x,rownum))

 d2$rows_to_del <-  d2$rownum %in% rows_to_del
 
 # do the delete
 d3 <- filter(d2, rows_to_del==FALSE) %>% select(x,y)

Hi @nirgrahamuk thanks for the suggestion, I was also fiddling with dplyr::row_number() and came up with the following solution. Do you see ways it can be improved? Yeah, the negative x can be -1, -2, etc, it's only an indicator.

d <- structure(list(
  x = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1),
  y = c(84, 28, 0, 112, 28, 112, 112, 28, 28, 112, 28, 28, 
        112, 28, 112, 112, 28, 28, 112, 112, 28, 28, 112, 112, 28, 112, 
        28, 28, 112, 28, 112, 28, 112, 112, 28)
), class = "data.frame", row.names = c(NA, -35L))

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
d2 <- d %>%
  mutate(z = row_number(y),
         exclude = case_when(z %in% c(z[x < 0], z[x < 0] - 1) ~ TRUE,
                             TRUE ~ FALSE))
print(d2)
#>     x   y  z exclude
#> 1   1  84 19   FALSE
#> 2   1  28  2   FALSE
#> 3   1   0  1   FALSE
#> 4   1 112 20   FALSE
#> 5   1  28  3   FALSE
#> 6   1 112 21   FALSE
#> 7   1 112 22   FALSE
#> 8   1  28  4   FALSE
#> 9   1  28  5   FALSE
#> 10  1 112 23    TRUE
#> 11  1  28  6   FALSE
#> 12  1  28  7   FALSE
#> 13 -1 112 24    TRUE
#> 14  1  28  8   FALSE
#> 15  1 112 25   FALSE
#> 16  1 112 26   FALSE
#> 17  1  28  9   FALSE
#> 18  1  28 10   FALSE
#> 19  1 112 27   FALSE
#> 20  1 112 28   FALSE
#> 21  1  28 11   FALSE
#> 22  1  28 12   FALSE
#> 23  1 112 29   FALSE
#> 24  1 112 30   FALSE
#> 25  1  28 13   FALSE
#> 26  1 112 31   FALSE
#> 27  1  28 14   FALSE
#> 28  1  28 15   FALSE
#> 29  1 112 32   FALSE
#> 30  1  28 16   FALSE
#> 31  1 112 33   FALSE
#> 32  1  28 17    TRUE
#> 33  1 112 34    TRUE
#> 34 -1 112 35    TRUE
#> 35 -1  28 18    TRUE
d2 %>% filter(exclude)
#>    x   y  z exclude
#> 1  1 112 23    TRUE
#> 2 -1 112 24    TRUE
#> 3  1  28 17    TRUE
#> 4  1 112 34    TRUE
#> 5 -1 112 35    TRUE
#> 6 -1  28 18    TRUE

Created on 2020-02-14 by the reprex package (v0.3.0)

wow ! your solution is impressively succint !

Another filtering criterion, what if I want to remove both the closest previous positive value and the negative value it offsets to 0 within the same vector? Example:

v <- c(4, 4, 4, 3.75, 4, 4, 3.7, 3.7, 4, 3.7, 3.7, 4, 3.6, 3.6, 3.5, 
       3.5, 3.6, -3.6, 3.5, -3.5)

Here the last 4 elements would be removed. Any ideas?

sorry, is this a standalone challenge, or does it relate to the previous dataframe?

It is related as the column y actually has additional criteria to satisfy that I did not inform initially.

ok, the x,y example we started with is 35 rows, do i need 35 rows of v to look at this ?

Take this for example, parts of d look like this:

d <- structure(list(x = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                          1, 1, 1, 1, 1, 1), y = c(4, 4, 4, 3.75, 4, 4, 3.7, 3.7, 4, 3.7, 
                                                   3.7, 4, 3.6, 3.6, 3.5, 3.5, 3.6, -3.6, 3.5, -3.5)), row.names = c(NA, 
                                                                                                                     -20L), class = "data.frame")

Need rows 17 to 20 removed

ok, I lack context for this. it sounds like you want a negative value in y to be deleted if we can find the same value (but positive) earlier in y.

Are we considering this an independent pass through the data. like the first manipulation you found a solution for, you would do first, and then you would do this after ?

Yes so row 18 is removed because it's closest previous offset is row 17, and same logic for row 20 and 19.

I think it's an independent pass through the data that can happen after the first manipulation.

library(tidyverse)
library(slider)
 
 
 d <- structure(list(x = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                           1, 1, 1, 1, 1, 1), y = c(4, 4, 4, 3.75, 4, 4, 3.7, 3.7, 4, 3.7, 
                                                    3.7, 4, 3.6, 3.6, 3.5, 3.5, 3.6, -3.6, 3.5, -3.5)), row.names = c(NA, 
                                                                                                                      -20L), class = "data.frame")
 
 # step 1, harden up the rownumbers
 d2 <- d %>% mutate(rownum = row_number())
 
#neg_y instead of neg_x
 neg_y <- d2 %>% filter(y<0)

 #use abs to match though signs expected opposite and add y>0
 searches_to_perform <- slide(.x = neg_y,
                              .f = ~ filter(slice(d2,1:.[["rownum"]]),y>0 & y==abs(.[["y"]]))
 )
 #tail changed from 1 to 2
 found_em <- map(searches_to_perform,~(tail(.,1)) %>% slice(1))
 
 
 rows_to_del <-  c(bind_rows(found_em) %>% pull(rownum),pull(neg_y,rownum))
 
 d2$rows_to_del <-  d2$rownum %in% rows_to_del
 
 # do the delete
 d3 <- filter(d2, rows_to_del==FALSE) %>% select(x,y)

I know this is a failure to be succint, but I achieved this altered 2nd pass approach with only minimal alterations to the first pass we did, so I thought its a good trade off in terms of lazyiness to result :smiley:

1 Like