I was asking about the filters. You want to filter on both rows and columns, and my question was whether those two need to match or not.
Here is a very naive attempt to answer the question. I'm sure it can be improved a lot, and I look forward to seeing a better solution from you or someone else.
# data setup
# Builds a 1000-row example data frame `my_data` with an id column, five
# numeric columns and five factor columns. The statement order matters: every
# random draw below consumes the RNG stream seeded here.
set.seed(123)
# Five numeric columns: 1000 normal draws each, mean 10, standard deviations
# 1, 5, 10, 10 and 10 respectively.
num_var_1 <- rnorm(1000, 10, 1)
num_var_2 <- rnorm(1000, 10, 5)
num_var_3 <- rnorm(1000, 10, 10)
num_var_4 <- rnorm(1000, 10, 10)
num_var_5 <- rnorm(1000, 10, 10)
# Pools of level labels the factor columns are sampled from.
factor_1 <- c("A","B", "C")
factor_2 <- c("AA","BB", "CC")
factor_3 <- c("AAA","BBB", "CCC", "DDD")
factor_4 <- c("AAAA","BBBB", "CCCC", "DDDD", "EEEE")
factor_5 <- c("AAAAA","BBBBB", "CCCCC", "DDDDD", "EEEEE", "FFFFFF")
# Factor columns: 1000 draws with replacement from a label pool, weighted by
# `prob`, then converted to (unordered) factors.
factor_var_1 <- as.factor(sample(factor_1, 1000, replace=TRUE, prob=c(0.3, 0.5, 0.2)))
factor_var_2 <- as.factor(sample(factor_2, 1000, replace=TRUE, prob=c(0.5, 0.3, 0.2)))
factor_var_3 <- as.factor(sample(factor_3, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.2, 0.1)))
factor_var_4 <- as.factor(sample(factor_4, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))
# NOTE(review): factor_var_5 samples from `factor_4`, not `factor_5`, so
# `factor_5` is never used in this script. The five weights match the five
# labels in `factor_4` and sum to 0.8 (sample() documents that weights need not
# sum to one). Possibly `factor_5` with six weights was intended - confirm.
factor_var_5 <- as.factor(sample(factor_4, 1000, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))
# Sequential row identifier.
id <- 1:1000
# Assemble all columns into the example data frame used by the demos below.
my_data <- data.frame(id, num_var_1, num_var_2, num_var_3, num_var_4, num_var_5, factor_var_1, factor_var_2, factor_var_3, factor_var_4, factor_var_5)
# utility functions
# Create a closure that renders a filter condition as text.
#
# filter_threshold: the right-hand side of the condition; it is deparsed once,
#   up front, so vectors appear as `c(...)` and strings keep their quotes.
# filter_type: one of "<", ">", "<=", ">=", "==", "!=", "in" (default "<").
#   "in" is rendered with the `%in%` operator.
#
# Returns a function of `column_name` that yields the string
# "<column_name> <operator> <threshold>".
generate_function_to_get_filter_condition <- function(filter_threshold, filter_type = c("<", ">", "<=", ">=", "==", "!=", "in")) {
  filter_type <- match.arg(filter_type)
  # Every supported type is its own operator except "in".
  operator <- if (identical(filter_type, "in")) "%in%" else filter_type
  infix <- paste0(" ", operator, " ")
  right_hand_side <- deparse1(filter_threshold)
  function(column_name) {
    paste0(column_name, infix, right_hand_side)
  }
}
# Build a condition generator for an unordered factor.
#
# factor_levels: character vector of the factor's levels.
#
# Picks a random non-empty subset of the levels (random size, drawn without
# replacement) and returns a function of `column_name` that renders
# "<column_name> %in% c(...)" for that subset.
generate_nominal_factor_column_condition <- function(factor_levels) {
  subset_size <- sample.int(length(factor_levels), size = 1)
  chosen_levels <- sample(factor_levels, size = subset_size, replace = FALSE)
  generate_function_to_get_filter_condition(chosen_levels, filter_type = "in")
}
# Build a condition generator for an ordered factor.
#
# factor_levels: character vector of the factor's levels, in order. Must
#   contain at least one level.
#
# Draws a random comparison type and a matching level (or, for "in", a random
# non-empty subset of levels) and returns a function of `column_name` that
# renders the condition as text.
generate_ordinal_factor_column_condition <- function(factor_levels) {
  if (length(factor_levels) < 1L) {
    stop("'factor_levels' must contain at least one level", call. = FALSE)
  }
  condition_types <- c("<", ">", "<=", ">=", "==", "!=", "in")
  # "<" excludes the first level and ">" excludes the last one, so with a
  # single level there is nothing left to sample from and sample() would
  # error. Drop the strict comparisons in that case.
  if (length(factor_levels) < 2L) {
    condition_types <- setdiff(condition_types, c("<", ">"))
  }
  condition_type <- sample(condition_types, 1)
  condition_level <- switch(condition_type,
    # Never "< lowest level": that condition could match nothing.
    "<" = sample(tail(factor_levels, -1), 1),
    # Never "> highest level", for the same reason.
    ">" = sample(head(factor_levels, -1), 1),
    "in" = {
      number_of_levels <- sample.int(length(factor_levels), 1)
      sample(factor_levels, number_of_levels, FALSE)
    },
    # "<=", ">=", "==", "!=": any single level is acceptable.
    sample(factor_levels, 1)
  )
  generate_function_to_get_filter_condition(condition_level, condition_type)
}
# Build a condition generator for a factor column.
#
# factor_column_values: a factor (ordered or unordered).
#
# Dispatches on orderedness: ordered factors get a comparison-style condition,
# unordered factors get a membership (`%in%`) condition. Returns a function of
# `column_name` that renders the condition as text.
generate_factor_column_condition <- function(factor_column_values) {
  level_pool <- levels(factor_column_values)
  condition_builder <- if (is.ordered(factor_column_values)) {
    generate_ordinal_factor_column_condition
  } else {
    generate_nominal_factor_column_condition
  }
  condition_builder(level_pool)
}
# Build a condition generator for a numeric column.
#
# numeric_column_values: the column's values; missing values are ignored when
#   determining the range the cutoff is drawn from.
#
# Draws a random inequality operator and a uniform random cutoff between the
# column's minimum and maximum, and returns a function of `column_name` that
# renders "<column_name> <operator> <cutoff>".
generate_numeric_column_condition <- function(numeric_column_values) {
  condition_type <- sample(c("<", ">", "<=", ">="), 1)
  # na.rm = TRUE: without it a single NA makes min()/max() return NA, and
  # runif() then yields NaN, i.e. a condition that can never be satisfied.
  lower_bound <- min(numeric_column_values, na.rm = TRUE) + .Machine$double.eps
  upper_bound <- max(numeric_column_values, na.rm = TRUE) - .Machine$double.eps
  condition_cutoff <- runif(1, lower_bound, upper_bound)
  generate_function_to_get_filter_condition(condition_cutoff, condition_type)
}
# Render one random filter condition for a column as a string.
#
# column_name: name of the column, used verbatim on the left-hand side.
# column_values: the column's values. Factors get a factor condition;
#   character vectors are treated as unordered factors; everything else is
#   handled as numeric.
#
# Returns a single character string such as "x >= 3.2" or
# "g %in% c(\"A\", \"B\")".
generate_column_condition <- function(column_name, column_values) {
  # data.frame() keeps strings as character since R 4.0; sending those down
  # the numeric path fails on the arithmetic applied to min()/max(). A
  # membership test on the distinct values is the natural condition instead.
  if (is.character(column_values)) {
    column_values <- as.factor(column_values)
  }
  get_condition <- if (is.factor(column_values)) {
    generate_factor_column_condition(column_values)
  } else {
    generate_numeric_column_condition(column_values)
  }
  get_condition(column_name)
}
# main functions
# Generate one random "subset specification" for a data frame.
#
# dataset: a data frame with at least one column.
#
# Independently draws (a) a random non-empty set of columns to keep and (b) a
# random non-empty set of columns to filter on, then builds one random
# condition per filter column.
#
# Returns a named character vector with two elements:
#   subset_columns    - kept column names joined by " , "
#   subset_conditions - condition strings joined by " , "
generate_data_conditions <- function(dataset) {
  all_columns <- names(dataset)
  column_count <- length(all_columns)

  # Columns to keep.
  keep_count <- sample.int(column_count, 1)
  kept_columns <- sample(all_columns, keep_count, FALSE)

  # Columns to filter on (drawn independently of the kept columns).
  filter_count <- sample.int(column_count, 1)
  filtered_columns <- sample(all_columns, filter_count, FALSE)

  # One condition per filter column, generated in the sampled order.
  conditions <- character(length(filtered_columns))
  for (i in seq_along(filtered_columns)) {
    current_column <- filtered_columns[i]
    conditions[i] <- generate_column_condition(current_column, dataset[[current_column]])
  }

  c(
    subset_columns = paste(kept_columns, collapse = " , "),
    subset_conditions = paste(conditions, collapse = " , ")
  )
}
# Generate several random subset specifications for a data frame.
#
# dataset: the data frame to generate conditions for.
# number_of_replications: how many specifications to generate.
#
# Returns a data frame with one row per replication and the columns
# `subset_columns` and `subset_conditions`.
get_subset_details <- function(dataset, number_of_replications) {
  # replicate() yields one column per replication; transpose to one row each.
  condition_matrix <- replicate(
    n = number_of_replications,
    expr = generate_data_conditions(dataset),
    simplify = TRUE
  )
  condition_matrix |>
    t() |>
    as.data.frame()
}
# time demonstration: wall-clock time to generate 100 subset specifications
tic <- Sys.time()
my_data_result_time <- get_subset_details(my_data, 100)
toc <- Sys.time()
print(toc - tic)
# result demonstration: show two generated subset specifications
my_data_result_small <- get_subset_details(my_data, 2)
print(my_data_result_small)
Hope this helps.