This doesn't really work but it sort of illustrates the approach I outlined above.
I stress that I am complete novice here and it's quite likely that there's a cleverer approach or a sound statistical method that could be applied, that I am simply not aware of. I'm also just starting to experiment with the new tools that dplyr
1.0.0 gives us, so the syntax below with some of these might be a mess.
Anyway here's my code, which might be made to work if I knew how to make recursion work properly, It just uses your example data above.
library(tibble)
library(purrr)
words_dataset <- tibble(
word = c("ceiling", "dancer", "prince", "medicine"),
concreteness = c(606, 558, 542, 517),
length = c(7, 6, 6, 8),
log_frequency = c(8.516, 8.004, 9.31, 9.893)
)
# calculate averages of the initial, whole dataset:
# we want our final groups to have similar averages
avg <- words_dataset %>%
summarise(across(where(is.numeric), ~ mean(., na.rm = TRUE)))
select_starters <- function(df, n) { # n is number of groups desired
# choose some random rows to slice
row_numbers <- sample(1:nrow(df), n)
# create a list of a list of the word groups plus the remaining words
list(
word_groups = map(row_numbers, ~ slice(df, .)),
remainder = slice(df, -row_numbers)
)
}
grow_groups <- function(df1, df2, avgs) {
# for a group of words, calculate their current average for each variable
group_avg <- df1 %>%
summarise(across(where(is.numeric), ~ mean(., na.rm = TRUE)))
# calculate what the ideal counterweight average would be
# that would make the avg of the new group near to the initial dataset avg
target_avg <- tibble(
avgs = unlist(avgs),
group_avg = unlist(group_avg)
) %>%
transmute(target_avg = (avgs*2) - group_avg) %>%
unlist()
# calculate a score for each word in the remainder data
# this score represents how close each word is overall to the target average
df2 <- df2 %>%
mutate(
concreteness_score = abs(concreteness - target_avg[["concreteness"]]),
length_score = abs(length - target_avg[["length"]]),
log_frequency_score = abs(log_frequency - target_avg[["log_frequency"]])) %>%
rowwise() %>%
mutate(
score_total = sum(c_across(ends_with("score")))) %>%
# sort by score so that the word with the smallest score (closest to
# target average) is at the top
arrange(score_total) %>%
select(c(word, concreteness, length, log_frequency))
# add the top row of the remainder to the word group
df1 <- bind_rows(df1, head(df2, 1))
# remove the top row, leaving the remainder
if(nrow(df2) > 1) {
df2 <- tail(df2, -1)
}
else {
df2 <- NULL
}
# return the resulting list (new word group plus new remainder)
list(df1, df2)
}
# create list of random initial seed words
initial_groups <- select_starters(words_dataset, 2)
choose_your_partners <- function(df_list) {
if(!length(df_list) == 2) {
stop("List must have length 2")
}
# for each word group, add a new word
new_list <- df_list[[1]] %>%
map( ~ grow_groups(., df2 = df_list[[2]], avgs = avg))
}
# problem with not knowing how to do recursion properly
# this **doesn't work** because the new (reduced) remainder doesn't get fed back in;
# instead the previous remainder data is re-used,
# which means that a word can be added to more than one group (words ought to be
# permanently removed from the remainder set immediately after being selected)
round1 <- choose_your_partners(df_list = initial_groups)
round2 <- choose_your_partners(df_list = round1)