filter column by count

There might be an obvious (and easy) solution to this, but here goes:
How could I filter this dataframe to only return the x-y pairs that match in column x? In other words, remove the last two rows.

library(dplyr, warn.conflicts = FALSE)

Dfab <- bind_rows(tibble(x = seq(1:5), y = rep("a", 5)), tibble(x = seq(1:7), y = rep("b", 7)))

Created on 2018-12-12 by the reprex package (v0.2.1)

I'm confused by what you mean by "match in column x." Is it that you want only values for which there is an x for each y? Like, you only want pairs for which there is both a #a and a #b (where # represents the value in x)? (Adding a reprex with the data printed for ease of browsing).

library(dplyr, warn.conflicts = FALSE)

Dfab <- bind_rows(tibble(x = seq(1:5), 
                         y = rep("a", 5)), 
                  tibble(x = seq(1:7), 
                         y = rep("b", 7)))
Dfab
#> # A tibble: 12 x 2
#>        x y    
#>    <int> <chr>
#>  1     1 a    
#>  2     2 a    
#>  3     3 a    
#>  4     4 a    
#>  5     5 a    
#>  6     1 b    
#>  7     2 b    
#>  8     3 b    
#>  9     4 b    
#> 10     5 b    
#> 11     6 b    
#> 12     7 b

Created on 2018-12-12 by the reprex package (v0.2.1.9000)

For example, in the case below, would you not want the row where x = 3 and y = a, since you don't have x = 3 and y = b?

Dfab2 <- tibble::tribble(
  ~x,  ~y,
  1L, "a",
  2L, "a",
  3L, "a",
  4L, "a",
  5L, "a",
  1L, "b",
  2L, "b",
  4L, "b",
  5L, "b",
  6L, "b",
  7L, "b"
  )

Dfab2
#> # A tibble: 11 x 2
#>        x y    
#>    <int> <chr>
#>  1     1 a    
#>  2     2 a    
#>  3     3 a    
#>  4     4 a    
#>  5     5 a    
#>  6     1 b    
#>  7     2 b    
#>  8     4 b    
#>  9     5 b    
#> 10     6 b    
#> 11     7 b

Created on 2018-12-12 by the reprex package (v0.2.1.9000)

1 Like

Me too :slight_smile:

Spot on, nor would I want 6b or 7b since there is 6a or 7a.

I'm not sure if this makes it worse or better, but I guess y could be ignored: I'd like to filter x so distinct(x) appears twice and keep only those rows.

OK, I'm positively sure this is not the most parsimonious code for this, but it works

suppressPackageStartupMessages(library(tidyverse))

Dfab2 <- tibble::tribble(
  ~x,  ~y,
  1L, "a",
  2L, "a",
  3L, "a",
  4L, "a",
  5L, "a",
  1L, "b",
  2L, "b",
  4L, "b",
  5L, "b",
  6L, "b",
  7L, "b"
  )

two_times <- Dfab2 %>%
  count(x) %>%
  filter(n == 2)

Dfab2 %>%
  filter(x %in% two_times$x)
#> # A tibble: 8 x 2
#>       x y    
#>   <int> <chr>
#> 1     1 a    
#> 2     2 a    
#> 3     4 a    
#> 4     5 a    
#> 5     1 b    
#> 6     2 b    
#> 7     4 b    
#> 8     5 b

Created on 2018-12-12 by the reprex package (v0.2.1.9000)

1 Like

Very similar approach:

library(magrittr)
Dfab2 <- tibble::tribble(
  ~x,  ~y,
  1L, "a",
  2L, "a",
  3L, "a",
  4L, "a",
  5L, "a",
  1L, "b",
  2L, "b",
  4L, "b",
  5L, "b",
  6L, "b",
  7L, "b"
) %>%
  dplyr::semi_join(dplyr::group_by(., x) %>%
                     dplyr::summarise(n = n()) %>%
                     dplyr::filter(n > 1),
                   by = "x"
  )

Dfab2
#> # A tibble: 8 x 2
#>       x y    
#>   <int> <chr>
#> 1     1 a    
#> 2     2 a    
#> 3     4 a    
#> 4     5 a    
#> 5     1 b    
#> 6     2 b    
#> 7     4 b    
#> 8     5 b

Created on 2018-12-12 by the reprex package (v0.2.1)

2 Likes

This is better, since you don't create an intermediary table! Argh— my brain really refuses to behave this morning.

Excellent, thank you both! It's waaay passed lunch here, can't even find my brain.

Got it even shorter with count():

library(dplyr)

Dfab2 <- tibble::tribble(
  ~x,  ~y,
  1L, "a",
  2L, "a",
  3L, "a",
  4L, "a",
  5L, "a",
  1L, "b",
  2L, "b",
  4L, "b",
  5L, "b",
  6L, "b",
  7L, "b"
) %>%
  semi_join(count(.,x) %>%
              filter(n > 1),
            by = "x"
  )

Dfab2
#> # A tibble: 8 x 2
#>       x y    
#>   <int> <chr>
#> 1     1 a    
#> 2     2 a    
#> 3     4 a    
#> 4     5 a    
#> 5     1 b    
#> 6     2 b    
#> 7     4 b    
#> 8     5 b
1 Like

Oh wait.

library(dplyr)

Dfab2 <- tibble::tribble(
  ~x,  ~y,
  1L, "a",
  2L, "a",
  3L, "a",
  4L, "a",
  5L, "a",
  1L, "b",
  2L, "b",
  4L, "b",
  5L, "b",
  6L, "b",
  7L, "b"
) 

Dfab2 %>% 
  add_count(x) %>% 
  filter(n > 1)
#> # A tibble: 8 x 3
#>       x y         n
#>   <int> <chr> <int>
#> 1     1 a         2
#> 2     2 a         2
#> 3     4 a         2
#> 4     5 a         2
#> 5     1 b         2
#> 6     2 b         2
#> 7     4 b         2
#> 8     5 b         2

Created on 2018-12-12 by the reprex package (v0.2.1)

3 Likes

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.