Hi naja,
Would this work to randomly sample unique contacts so that the result has roughly 3000 rows?
library(tidyverse)
# Toy data frame: each row pairs a contact id with a payment value, and a
# contact id is repeated once for every row it owns.
df <- data.frame(
  contact = rep(c(1800, 1840, 1865, 1890), times = c(2, 4, 2, 1)),
  payment = c(21, 43, 35, 43, 42, 56, 12, 17, 29)
)
# Estimate how many contacts are needed to reach roughly `target_rows` rows:
# the target divided by the average number of rows per contact.
target_rows <- 3000
num_contacts <- df %>%
  count(contact) %>%
  summarise(num_contacts = target_rows / mean(n, na.rm = TRUE)) %>%
  # Extract the single value so the rounding and integer conversion operate on
  # a plain number instead of a one-column data frame.
  pull(num_contacts) %>%
  # Round before converting: as.integer() alone would truncate toward zero.
  round() %>%
  as.integer()
# Identify the unique contacts, then randomly sample num_contacts of them.
# Sampling is done WITHOUT replacement: the later semi_join() keeps each
# matching row of df at most once, so a contact drawn twice would add no rows
# and the result would fall short of the target.
unique_contacts <- unique(df$contact)
# Without replacement we cannot draw more contacts than exist, so cap the
# request at the number of distinct contacts.
n_sampled <- min(num_contacts, length(unique_contacts))
# Sample positions rather than values: sample(x, ...) treats a single number x
# as 1:x, which would invent contact ids when only one unique contact exists.
picked <- unique_contacts[sample.int(length(unique_contacts), size = n_sampled)]
# One-column data frame named `contact`, ready to be used as a join key.
contacts <- data.frame(contact = picked)
# Keep only the rows of df whose contact appears in the sampled `contacts`
# data frame; semi_join() filters df and adds no columns from `contacts`.
df %>% semi_join(contacts)
#> Joining, by = "contact"
#> contact payment
#> 1 1800 21
#> 2 1800 43
#> 3 1840 35
#> 4 1840 43
#> 5 1840 42
#> 6 1840 56
#> 7 1865 12
#> 8 1865 17
#> 9 1890 29
Created on 2019-11-15 by the reprex package (v0.3.0)