library(tidyverse)
# Build a reproducible toy data set: 200 rows holding a product code and four
# integer score columns. The fixed seed makes every random draw repeatable.
set.seed(42)
exdf <- tibble(
  # The first 100 codes are drawn from the whole alphabet; the remaining 100
  # are drawn from just five letters, so those five are expected to dominate
  # the frequency counts.
  prods = c(
    sample(letters, size = 100, replace = TRUE),
    sample(c("d", "e", "l", "y", "z"), size = 100, replace = TRUE)
  ),
  # Four columns of integers sampled from 1..100 with replacement.
  s1 = sample.int(100, size = 200, replace = TRUE),
  s2 = sample.int(100, size = 200, replace = TRUE),
  s3 = sample.int(100, size = 200, replace = TRUE),
  s4 = sample.int(100, size = 200, replace = TRUE)
)
# Display the generated data.
exdf
# Find the five most frequent products: tally the rows per product with the
# largest counts first, then keep the top five rows of that tally.
top_5_prods_df <- exdf %>%
  count(prods, sort = TRUE) %>%
  head(n = 5)
# Display the tally.
top_5_prods_df
# Return to the full data and keep only the rows whose product is one of the
# five most frequent ones; a semi-join keeps the matching rows of `exdf`
# without adding any columns from the tally.
result_df <- semi_join(exdf, top_5_prods_df, by = "prods")
# Display the filtered rows.
result_df
# Sanity check: only the five selected products should appear in the counts.
table(result_df[["prods"]])