filter column by count

dplyr

#1

There might be an obvious (and easy) solution to this, but here goes:
How could I filter this dataframe to only return the x-y pairs that match in column x? In other words, remove the last two rows.

library(dplyr, warn.conflicts = FALSE)

Dfab <- bind_rows(tibble(x = seq(1:5), y = rep("a", 5)), tibble(x = seq(1:7), y = rep("b", 7)))

Created on 2018-12-12 by the reprex package (v0.2.1)


#2

I'm confused by what you mean by "match in column x." Is it that you want only values for which there is an x for each y? Like, you only want pairs for which there is both a #a and a #b (where # represents the value in x)? (Adding a reprex with the data printed for ease of browsing).

library(dplyr, warn.conflicts = FALSE)

Dfab <- bind_rows(tibble(x = seq(1:5), 
                         y = rep("a", 5)), 
                  tibble(x = seq(1:7), 
                         y = rep("b", 7)))
Dfab
#> # A tibble: 12 x 2
#>        x y    
#>    <int> <chr>
#>  1     1 a    
#>  2     2 a    
#>  3     3 a    
#>  4     4 a    
#>  5     5 a    
#>  6     1 b    
#>  7     2 b    
#>  8     3 b    
#>  9     4 b    
#> 10     5 b    
#> 11     6 b    
#> 12     7 b

Created on 2018-12-12 by the reprex package (v0.2.1.9000)

For example, in the case below, would you not want the row where x = 3 and y = a, since you don't have x = 3 and y = b?

Dfab2 <- tibble::tribble(
  ~x,  ~y,
  1L, "a",
  2L, "a",
  3L, "a",
  4L, "a",
  5L, "a",
  1L, "b",
  2L, "b",
  4L, "b",
  5L, "b",
  6L, "b",
  7L, "b"
  )

Dfab2
#> # A tibble: 11 x 2
#>        x y    
#>    <int> <chr>
#>  1     1 a    
#>  2     2 a    
#>  3     3 a    
#>  4     4 a    
#>  5     5 a    
#>  6     1 b    
#>  7     2 b    
#>  8     4 b    
#>  9     5 b    
#> 10     6 b    
#> 11     7 b

Created on 2018-12-12 by the reprex package (v0.2.1.9000)


#3

Me too :slight_smile:

Spot on, nor would I want 6b or 7b since there is 6a or 7a.


#4

I'm not sure if this makes it worse or better, but I guess y could be ignored: I'd like to filter x so distinct(x) appears twice and keep only those rows.


#5

OK, I'm positively sure this is not the most parsimonious code for this, but it works

suppressPackageStartupMessages(library(tidyverse))

Dfab2 <- tibble::tribble(
  ~x,  ~y,
  1L, "a",
  2L, "a",
  3L, "a",
  4L, "a",
  5L, "a",
  1L, "b",
  2L, "b",
  4L, "b",
  5L, "b",
  6L, "b",
  7L, "b"
  )

two_times <- Dfab2 %>%
  count(x) %>%
  filter(n == 2)

Dfab2 %>%
  filter(x %in% two_times$x)
#> # A tibble: 8 x 2
#>       x y    
#>   <int> <chr>
#> 1     1 a    
#> 2     2 a    
#> 3     4 a    
#> 4     5 a    
#> 5     1 b    
#> 6     2 b    
#> 7     4 b    
#> 8     5 b

Created on 2018-12-12 by the reprex package (v0.2.1.9000)


#6

Very similar approach:

library(magrittr)
Dfab2 <- tibble::tribble(
  ~x,  ~y,
  1L, "a",
  2L, "a",
  3L, "a",
  4L, "a",
  5L, "a",
  1L, "b",
  2L, "b",
  4L, "b",
  5L, "b",
  6L, "b",
  7L, "b"
) %>%
  dplyr::semi_join(dplyr::group_by(., x) %>%
                     dplyr::summarise(n = n()) %>%
                     dplyr::filter(n > 1),
                   by = "x"
  )

Dfab2
#> # A tibble: 8 x 2
#>       x y    
#>   <int> <chr>
#> 1     1 a    
#> 2     2 a    
#> 3     4 a    
#> 4     5 a    
#> 5     1 b    
#> 6     2 b    
#> 7     4 b    
#> 8     5 b

Created on 2018-12-12 by the reprex package (v0.2.1)


#7

This is better, since you don't create an intermediary table! Argh— my brain really refuses to behave this morning.


#8

Excellent, thank you both! It's waaay passed lunch here, can't even find my brain.


#9

Got it even shorter with count():

library(dplyr)

Dfab2 <- tibble::tribble(
  ~x,  ~y,
  1L, "a",
  2L, "a",
  3L, "a",
  4L, "a",
  5L, "a",
  1L, "b",
  2L, "b",
  4L, "b",
  5L, "b",
  6L, "b",
  7L, "b"
) %>%
  semi_join(count(.,x) %>%
              filter(n > 1),
            by = "x"
  )

Dfab2
#> # A tibble: 8 x 2
#>       x y    
#>   <int> <chr>
#> 1     1 a    
#> 2     2 a    
#> 3     4 a    
#> 4     5 a    
#> 5     1 b    
#> 6     2 b    
#> 7     4 b    
#> 8     5 b

"Missing" workflow in tidyverse?
#10

Oh wait.

library(dplyr)

Dfab2 <- tibble::tribble(
  ~x,  ~y,
  1L, "a",
  2L, "a",
  3L, "a",
  4L, "a",
  5L, "a",
  1L, "b",
  2L, "b",
  4L, "b",
  5L, "b",
  6L, "b",
  7L, "b"
) 

Dfab2 %>% 
  add_count(x) %>% 
  filter(n > 1)
#> # A tibble: 8 x 3
#>       x y         n
#>   <int> <chr> <int>
#> 1     1 a         2
#> 2     2 a         2
#> 3     4 a         2
#> 4     5 a         2
#> 5     1 b         2
#> 6     2 b         2
#> 7     4 b         2
#> 8     5 b         2

Created on 2018-12-12 by the reprex package (v0.2.1)


#11

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.