Split data set into to 2 'identical' groups

Hi - I am completely new in this forum, nad even to R/R Studio.

I have a "one time" very specific problem. I have a data set containing hundreds of entities. Each entity have 4 values.

I want to split all of my entities into two identical (or as identical as possible) groups. Meaning that the count of entities in Group 1 and Group 2 are as close to each other, while the sum of Value 1, 2, 3 & 4 for each group also are very similar to each other. My data set looks something like this:

I expect this would require some sort of simulation but I have no clue how to 'attack' this problem. If anyone can help me - either in solving the problem or simply directing me to the relevant sources that I need to read and catch up on that would be highly appreciated

If anyone can mentor me on this I'd be happy to pay for the help and to share the result here afterwards!

I remembered I addressed a similar post before:
https://community.rstudio.com/t/is-there-a-r-function-to-help-me-create-5-equally-balanced-lists/71508/5

1 Like

Hi! I tried replacing the iris data set with my own, however I get a lot of things that are not found.

like the object new_iris (in my case, new_data). Object 'best' etc. etc.

Any clue if I miss any libraries not mentioned directly in the source?

There are no libraries required for the solution that arent mentioned in the code. library(tidyverse) is sufficient.

I can't debug your 'version' of the code, without seeing it...
I wonder if its something as simple as you not loading your data into the rsession before using my code on it ?

1 Like

My code is, in its complete is as below - name of my dataset is 'data' and seems to be imported correctly:

Things like: "indexes_to_use" object is not found, 'worst' object is not found etc. etc. - super strange

set.seed(42)
normvar <- function(x) {
  x / sqrt(sum(x^2))
}

data_norm <- data %>%
  mutate_if(is.numeric, normvar) %>%
  mutate(original_index = factor(row_number()))
(global_means <- summarise_if(data_norm, .predicate = is.numeric, mean))
cols <- ncol(global_means)
names(global_means) <- paste0("g", names(global_means))
library(tidyverse)


rand_split_2 <- function(x) {
  group_map(
    .tbl = x %>% mutate(randindex = sample.int(nrow(x), size = nrow(x), replace = FALSE) %% 2) %>% group_by(randindex),
    .f = ~ tibble(.)
  )
}

r1 <- rand_split_2(data_norm)



summarise_a_split <- function(x) {
  group_means <- map_dfr(
    x,
    ~ summarise_if(., .predicate = is.numeric, mean)
  )
  comb <- expand_grid(group_means, global_means)
  # rmse
  column_rmse <- map_dfc(
    names(group_means),
    ~ (comb[[.]] - comb[[paste0("g", .)]])^2
  ) %>%
    summarise_all(mean) %>%
    summarise_all(sqrt)
  ## sum up the column rmse to get a single num per the config
  reduce(column_rmse, .f = `+`)
}

summarise_a_split(r1)

splits_to_do <- 5000
# do x many splits and find the most balanced
splits_x <- map(
  1:splits_to_do,
  ~ rand_split_2(data_norm)
)

# summarise them
summaries_x <- map_dbl(
  splits_x,
  ~ summarise_a_split(.)
)

# find the best (norm)
(best_fit <- which(summaries_x == min(summaries_x)))
summaries_x[(best_fit - 2):(best_fit + 2)]

best <- splits_x[[best_fit]]


# find the worst (norm)
(worst_fit <- which(summaries_x == max(summaries_x)))
summaries_x[worst_fit]

worst <- splits_x[[worst_fit]]

# (norm)
map_dfr(
  best,
  ~ summarise_if(., .predicate = is.numeric, mean)
)

map_dfr(
  worst,
  ~ summarise_if(., .predicate = is.numeric, mean)
)


walk(
  best,
  ~ print(table(pull(., X.U.FEFF.By..geografisk.)))
)




walk(
  worst,
  ~ print(table(pull(., X.U.FEFF.By..geografisk.)))
)


indexes_to_use <- map(
  best,
  ~ pull(., original_index) %>% as.integer()
) %>%
  enframe(value = "original_index", name = "group") %>%
  unnest(cols = c(original_index)) %>%
  arrange(original_index)

new_data <- bind_cols(indexes_to_use, data)
new_data_split <- group_by(new_data, group) %>% group_map(~ tibble(.))

the code to make indexes_to use is here.. if it doesnt exist later then there is probably an error message.

When it comes to addressing error messages, you want to focus on the first one in order of appearances, as following errors are likely caused by earlier errors.
If you want specific help with this, you should at the minimum provide the first error,
but better would be to provide a reprex.

1 Like

Hi! Thank you for your kindness. It took me a bit to figure out reprex, but I think this is what you asked:

set.seed(42)
normvar <- function(x) {
  x / sqrt(sum(x^2))
}

data_norm <- data %>%
  mutate_if(is.numeric, normvar) %>%
  mutate(original_index = factor(row_number()))
#> Error in data %>% mutate_if(is.numeric, normvar) %>% mutate(original_index = factor(row_number())): could not find function "%>%"
(global_means <- summarise_if(data_norm, .predicate = is.numeric, mean))
#> Error in summarise_if(data_norm, .predicate = is.numeric, mean): could not find function "summarise_if"
cols <- ncol(global_means)
#> Error in ncol(global_means): objekt 'global_means' was not found
names(global_means) <- paste0("g", names(global_means))
#> Error in paste0("g", names(global_means)): objekt 'global_means' was not found
library(tidyverse)


rand_split_2 <- function(x) {
  group_map(
    .tbl = x %>% mutate(randindex = sample.int(nrow(x), size = nrow(x), replace = FALSE) %% 2) %>% group_by(randindex),
    .f = ~ tibble(.)
  )
}

r1 <- rand_split_2(data_norm)
#> Error in eval(lhs, parent, parent): objekt 'data_norm' was not found



summarise_a_split <- function(x) {
  group_means <- map_dfr(
    x,
    ~ summarise_if(., .predicate = is.numeric, mean)
  )
  comb <- expand_grid(group_means, global_means)
  # rmse
  column_rmse <- map_dfc(
    names(group_means),
    ~ (comb[[.]] - comb[[paste0("g", .)]])^2
  ) %>%
    summarise_all(mean) %>%
    summarise_all(sqrt)
  ## sum up the column rmse to get a single num per the config
  reduce(column_rmse, .f = `+`)
}

summarise_a_split(r1)
#> Error in map(.x, .f, ...): objekt 'r1' was not found

splits_to_do <- 5000
# do x many splits and find the most balanced
splits_x <- map(
  1:splits_to_do,
  ~ rand_split_2(data_norm)
)
#> Error in eval(lhs, parent, parent): objekt 'data_norm' was not found

# summarise them
summaries_x <- map_dbl(
  splits_x,
  ~ summarise_a_split(.)
)
#> Error in map_dbl(splits_x, ~summarise_a_split(.)): objekt 'splits_x' was not found

# find the best (norm)
(best_fit <- which(summaries_x == min(summaries_x)))
#> Error in which(summaries_x == min(summaries_x)): objekt 'summaries_x' was not found
summaries_x[(best_fit - 2):(best_fit + 2)]
#> Error in eval(expr, envir, enclos): objekt 'summaries_x' was not found

best <- splits_x[[best_fit]]
#> Error in eval(expr, envir, enclos): objekt 'splits_x' was not found


# find the worst (norm)
(worst_fit <- which(summaries_x == max(summaries_x)))
#> Error in which(summaries_x == max(summaries_x)): objekt 'summaries_x' was not found
summaries_x[worst_fit]
#> Error in eval(expr, envir, enclos): objekt 'summaries_x' was not found

worst <- splits_x[[worst_fit]]
#> Error in eval(expr, envir, enclos): objekt 'splits_x' was not found

# (norm)
map_dfr(
  best,
  ~ summarise_if(., .predicate = is.numeric, mean)
)
#> Error in map(.x, .f, ...): objekt 'best' was not found

map_dfr(
  worst,
  ~ summarise_if(., .predicate = is.numeric, mean)
)
#> Error in map(.x, .f, ...): objekt 'worst' was not found


walk(
  best,
  ~ print(table(pull(., X.U.FEFF.By)))
)
#> Error in map(.x, .f, ...): objekt 'best' was not found




walk(
  worst,
  ~ print(table(pull(., X.U.FEFF.By)))
)
#> Error in map(.x, .f, ...): objekt 'worst' was not found


indexes_to_use <- map(
  best,
  ~ pull(., original_index) %>% as.integer()
) %>%
  enframe(value = "original_index", name = "group") %>%
  unnest(cols = c(original_index)) %>%
  arrange(original_index)
#> Error in map(best, ~pull(., original_index) %>% as.integer()): objekt 'best' was not found

new_data <- bind_cols(indexes_to_use, data)
#> Error in list2(...): objekt 'indexes_to_use' was not found
new_data_split <- group_by(new_data, group) %>% group_map(~ tibble(.))
#> Error in group_by(new_data, group): objekt 'new_data' was not found

I apologise for not being explicit. This forum has a strong tidyverse bias and I'm a member of the cult.
tidyverse includes librarys such as tibble,purrr,dplyr and more.
The pipe %>% is often a dead giveaway that using
library(tidyverse) will provide a good part of needed functions.
In this case it should be the only foreign library you need to load.

1 Like

Moved tidyverse as first library loaded.

Attached new reprex. Im wondering if I lack some basic setup of RStudio?

library(tidyverse)
set.seed(42)
normvar <- function(x) {
  x / sqrt(sum(x^2))
}

data_norm <- data %>%
  mutate_if(is.numeric, normvar) %>%
  mutate(original_index = factor(row_number()))
#> Error in UseMethod("tbl_vars"): no applicable method for 'tbl_vars' applied to an object of class "function"
(global_means <- summarise_if(data_norm, .predicate = is.numeric, mean))
#> Error in tbl_vars_dispatch(x): objekt 'data_norm' blev ikke fundet
cols <- ncol(global_means)
#> Error in ncol(global_means): objekt 'global_means' blev ikke fundet
names(global_means) <- paste0("g", names(global_means))
#> Error in paste0("g", names(global_means)): objekt 'global_means' blev ikke fundet



rand_split_2 <- function(x) {
  group_map(
    .tbl = x %>% mutate(randindex = sample.int(nrow(x), size = nrow(x), replace = FALSE) %% 2) %>% group_by(randindex),
    .f = ~ tibble(.)
  )
}

r1 <- rand_split_2(data_norm)
#> Error in eval(lhs, parent, parent): objekt 'data_norm' blev ikke fundet



summarise_a_split <- function(x) {
  group_means <- map_dfr(
    x,
    ~ summarise_if(., .predicate = is.numeric, mean)
  )
  comb <- expand_grid(group_means, global_means)
  # rmse
  column_rmse <- map_dfc(
    names(group_means),
    ~ (comb[[.]] - comb[[paste0("g", .)]])^2
  ) %>%
    summarise_all(mean) %>%
    summarise_all(sqrt)
  ## sum up the column rmse to get a single num per the config
  reduce(column_rmse, .f = `+`)
}

summarise_a_split(r1)
#> Error in map(.x, .f, ...): objekt 'r1' blev ikke fundet

splits_to_do <- 5000
# do x many splits and find the most balanced
splits_x <- map(
  1:splits_to_do,
  ~ rand_split_2(data_norm)
)
#> Error in eval(lhs, parent, parent): objekt 'data_norm' blev ikke fundet

# summarise them
summaries_x <- map_dbl(
  splits_x,
  ~ summarise_a_split(.)
)
#> Error in map_dbl(splits_x, ~summarise_a_split(.)): objekt 'splits_x' blev ikke fundet

# find the best (norm)
(best_fit <- which(summaries_x == min(summaries_x)))
#> Error in which(summaries_x == min(summaries_x)): objekt 'summaries_x' blev ikke fundet
summaries_x[(best_fit - 2):(best_fit + 2)]
#> Error in eval(expr, envir, enclos): objekt 'summaries_x' blev ikke fundet

best <- splits_x[[best_fit]]
#> Error in eval(expr, envir, enclos): objekt 'splits_x' blev ikke fundet


# find the worst (norm)
(worst_fit <- which(summaries_x == max(summaries_x)))
#> Error in which(summaries_x == max(summaries_x)): objekt 'summaries_x' blev ikke fundet
summaries_x[worst_fit]
#> Error in eval(expr, envir, enclos): objekt 'summaries_x' blev ikke fundet

worst <- splits_x[[worst_fit]]
#> Error in eval(expr, envir, enclos): objekt 'splits_x' blev ikke fundet

# (norm)
map_dfr(
  best,
  ~ summarise_if(., .predicate = is.numeric, mean)
)
#> Error in map(.x, .f, ...): objekt 'best' blev ikke fundet

map_dfr(
  worst,
  ~ summarise_if(., .predicate = is.numeric, mean)
)
#> Error in map(.x, .f, ...): objekt 'worst' blev ikke fundet


walk(
  best,
  ~ print(table(pull(., X.U.FEFF.By)))
)
#> Error in map(.x, .f, ...): objekt 'best' blev ikke fundet




walk(
  worst,
  ~ print(table(pull(., X.U.FEFF.By)))
)
#> Error in map(.x, .f, ...): objekt 'worst' blev ikke fundet


indexes_to_use <- map(
  best,
  ~ pull(., original_index) %>% as.integer()
) %>%
  enframe(value = "original_index", name = "group") %>%
  unnest(cols = c(original_index)) %>%
  arrange(original_index)
#> Error in map(best, ~pull(., original_index) %>% as.integer()): objekt 'best' blev ikke fundet

new_data <- bind_cols(indexes_to_use, data)
#> Error in list2(...): objekt 'indexes_to_use' blev ikke fundet
new_data_split <- group_by(new_data, group) %>% group_map(~ tibble(.))
#> Error in group_by(new_data, group): objekt 'new_data' blev ikke fundet
Created on 2020-07-31 by the reprex package (v0.3.0)

It seems to me that you simply havent provided your data into the object named data when you run this script.
by default in a fresh r session, the name data points to a function utils::data, and by not overwriting this with your data, you are trying to treat a function as if it was a dataframe when it isnt.

1 Like

Thank you for your patience.

I tried renaming my data set for clarity. Now it is 'mdat'. I also added it to the first line (however I dont think it is necessary?)

As your can see there is an output for 'global means'. Now it fails at 'r1' object.

Getting closer! However, I am truly clueless as of why. Tried googling the error (and the is_grouped_df function) but with no luck :frowning:

mdat <- read.csv2("~/mdat.csv", encoding="UTF-8")
library(tidyverse)
set.seed(42)
normvar <- function(x) {
  x / sqrt(sum(x^2))
}

mdat_norm <- mdat %>%
  mutate_if(is.numeric, normvar) %>%
  mutate(original_index = factor(row_number()))
(global_means <- summarise_if(mdat_norm, .predicate = is.numeric, mean))
#>   Eksponeringer       Klik       Pris Konverteringer
#> 1    0.01743945 0.02045284 0.02129543     0.02212743
cols <- ncol(global_means)
names(global_means) <- paste0("g", names(global_means))



rand_split_2 <- function(x) {
  group_map(
    .tbl = x %>% mutate(randindex = sample.int(nrow(x), size = nrow(x), replace = FALSE) %% 2) %>% group_by(randindex),
    .f = ~ tibble(.)
  )
}

r1 <- rand_split_2(mdat_norm)
#> Error in is_grouped_df(.data): argument ".data" is missing, with no default



summarise_a_split <- function(x) {
  group_means <- map_dfr(
    x,
    ~ summarise_if(., .predicate = is.numeric, mean)
  )
  comb <- expand_grid(group_means, global_means)
  # rmse
  column_rmse <- map_dfc(
    names(group_means),
    ~ (comb[[.]] - comb[[paste0("g", .)]])^2
  ) %>%
    summarise_all(mean) %>%
    summarise_all(sqrt)
  ## sum up the column rmse to get a single num per the config
  reduce(column_rmse, .f = `+`)
}

summarise_a_split(r1)
#> Error in map(.x, .f, ...): objekt 'r1' blev ikke fundet

splits_to_do <- 5000
# do x many splits and find the most balanced
splits_x <- map(
  1:splits_to_do,
  ~ rand_split_2(mdat_norm)
)
#> Error in is_grouped_df(.data): argument ".data" is missing, with no default

# summarise them
summaries_x <- map_dbl(
  splits_x,
  ~ summarise_a_split(.)
)
#> Error in map_dbl(splits_x, ~summarise_a_split(.)): objekt 'splits_x' blev ikke fundet

# find the best (norm)
(best_fit <- which(summaries_x == min(summaries_x)))
#> Error in which(summaries_x == min(summaries_x)): objekt 'summaries_x' blev ikke fundet
summaries_x[(best_fit - 2):(best_fit + 2)]
#> Error in eval(expr, envir, enclos): objekt 'summaries_x' blev ikke fundet

best <- splits_x[[best_fit]]
#> Error in eval(expr, envir, enclos): objekt 'splits_x' blev ikke fundet


# find the worst (norm)
(worst_fit <- which(summaries_x == max(summaries_x)))
#> Error in which(summaries_x == max(summaries_x)): objekt 'summaries_x' blev ikke fundet
summaries_x[worst_fit]
#> Error in eval(expr, envir, enclos): objekt 'summaries_x' blev ikke fundet

worst <- splits_x[[worst_fit]]
#> Error in eval(expr, envir, enclos): objekt 'splits_x' blev ikke fundet

# (norm)
map_dfr(
  best,
  ~ summarise_if(., .predicate = is.numeric, mean)
)
#> Error in map(.x, .f, ...): objekt 'best' blev ikke fundet

map_dfr(
  worst,
  ~ summarise_if(., .predicate = is.numeric, mean)
)
#> Error in map(.x, .f, ...): objekt 'worst' blev ikke fundet


walk(
  best,
  ~ print(table(pull(., X.U.FEFF.By)))
)
#> Error in map(.x, .f, ...): objekt 'best' blev ikke fundet




walk(
  worst,
  ~ print(table(pull(., X.U.FEFF.By)))
)
#> Error in map(.x, .f, ...): objekt 'worst' blev ikke fundet


indexes_to_use <- map(
  best,
  ~ pull(., original_index) %>% as.integer()
) %>%
  enframe(value = "original_index", name = "group") %>%
  unnest(cols = c(original_index)) %>%
  arrange(original_index)
#> Error in map(best, ~pull(., original_index) %>% as.integer()): objekt 'best' blev ikke fundet

new_mdat <- bind_cols(indexes_to_use, mdat)
#> Error in list2(...): objekt 'indexes_to_use' blev ikke fundet
new_mdat_split <- group_by(new_mdat, group) %>% group_map(~ tibble(.))
#> Error in group_by(new_mdat, group): objekt 'new_mdat' blev ikke fundet
Created on 2020-07-31 by the reprex package (v0.3.0)

this failed because it was designed for dplyr v 0.8(something) but since dplyr v1.0 .tbl is now .data
so this amendment should work!

rand_split_2 <- function(x) {
  group_map(
    .data= x %>% mutate(randindex = sample.int(nrow(x), size = nrow(x), replace = FALSE) %% 2) %>% group_by(randindex),
    .f = ~ tibble(.)
  )
}
1 Like

terrific - it works!

Final question - how to export data as it looks in the viewer?

you want to export as data to some other program, or simply to present nicely for people to read ?

1 Like

presented nicely to read in excel :slight_smile:

similar to the viewer:

When i c/p I get:

you can save a dataframe to csv with write.csv()
then can open in excel etc.
or can use DT library

1 Like

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.