I generated some random data and wrote the following code which loops through a series of data manipulation steps and produces a table containing the results ("final_results"):
#code1 #load library library(dplyr) library(data.table) set.seed(123) # create some data for this example a1 = rnorm(1000,100,10) b1 = rnorm(1000,100,5) c1 = sample.int(1000, 1000, replace = TRUE) train_data = data.frame(a1,b1,c1) #### results_table <- data.frame() for (i in 1:10 ) { #generate random numbers random_1 = runif(1, 80, 120) random_2 = runif(1, random_1, 120) random_3 = runif(1, 85, 120) random_4 = runif(1, random_3, 120) #bin data according to random criteria train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c"))) train_data$cat = as.factor(train_data$cat) #new splits a_table = train_data %>% filter(cat == "a") %>% select(a1, b1, c1, cat) b_table = train_data %>% filter(cat == "b") %>% select(a1, b1, c1, cat) c_table = train_data %>% filter(cat == "c") %>% select(a1, b1, c1, cat) split_1 = runif(1,0, 1) split_2 = runif(1, 0, 1) split_3 = runif(1, 0, 1) #calculate 60th quantile ("quant") for each bin table_a = data.frame(a_table%>% group_by(cat) %>% mutate(quant = quantile(c1, prob = split_1))) table_b = data.frame(b_table%>% group_by(cat) %>% mutate(quant = quantile(c1, prob = split_2))) table_c = data.frame(c_table%>% group_by(cat) %>% mutate(quant = quantile(c1, prob = split_3))) #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1" table_a$diff = ifelse(table_a$quant > table_a$c1,1,0) table_b$diff = ifelse(table_b$quant > table_b$c1,1,0) table_c$diff = ifelse(table_c$quant > table_c$c1,1,0) #group all tables final_table = rbind(table_a, table_b, table_c) #create a table: for each bin, calculate the average of "diff" final_table_2 = data.frame(final_table %>% group_by(cat) %>% summarize( mean = mean(diff) )) #add "total mean" to this table final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff))) #format this table: add the random criteria to this table for reference final_table_2$random_1 = random_1 final_table_2$random_2 = random_2 final_table_2$random_3 = random_3 final_table_2$random_4 = random_4 final_table_2$split_1 = split_1 final_table_2$split_2 = split_2 final_table_2$split_3 = split_3 final_table_2$iteration_number = i results_table <- rbind(results_table, final_table_2) final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean') } #view output final_results
Problem : Now, I am trying to re-write the loop so that over the course of the loop:
- for each unique iteration:
- the "results_table" only keeps the rows corresponding to those iterations with the 5 largest value of "total mean"
E.g. https://i.stack.imgur.com/EyUX3.png
Goal: For the "results_table" shown above, each "black box" represents a set of iterations, and each "red box" shows the value of the "total mean". To prevent the "results_table" from growing in size with each iteration, I would only like to keep rows corresponding to the 5 biggest values (in this example indicated with the red boxes).
I tried to combine "results_table" and "final_table" into a single step using the following code:
#code2
#load library
library(dplyr)
library(data.table)
set.seed(123)
# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)
####
results_table <- data.frame()
for (i in 1:10 ) {
#generate random numbers
random_1 = runif(1, 80, 120)
random_2 = runif(1, random_1, 120)
random_3 = runif(1, 85, 120)
random_4 = runif(1, random_3, 120)
#bin data according to random criteria
train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
train_data$cat = as.factor(train_data$cat)
#new splits
a_table = train_data %>%
filter(cat == "a") %>%
select(a1, b1, c1, cat)
b_table = train_data %>%
filter(cat == "b") %>%
select(a1, b1, c1, cat)
c_table = train_data %>%
filter(cat == "c") %>%
select(a1, b1, c1, cat)
split_1 = runif(1,0, 1)
split_2 = runif(1, 0, 1)
split_3 = runif(1, 0, 1)
#calculate 60th quantile ("quant") for each bin
table_a = data.frame(a_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_1)))
table_b = data.frame(b_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_2)))
table_c = data.frame(c_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_3)))
#create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
#group all tables
final_table = rbind(table_a, table_b, table_c)
#create a table: for each bin, calculate the average of "diff"
final_table_2 = data.frame(final_table %>%
group_by(cat) %>%
summarize(
mean = mean(diff)
))
#add "total mean" to this table
final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
#format this table: add the random criteria to this table for reference
final_table_2$random_1 = random_1
final_table_2$random_2 = random_2
final_table_2$random_3 = random_3
final_table_2$random_4 = random_4
final_table_2$split_1 = split_1
final_table_2$split_2 = split_2
final_table_2$split_3 = split_3
final_table_2$iteration_number = i
results_table <- rbind(results_table, final_table_2)
results_table = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
#keep the 5 biggest results (according to the "total" variable)
results_table <- head(results_table[order(-total)], 5)
}
#view output (should only have 5 rows)
results_table
But this results in an error and outputs a table with only one row:
Error in rbindlist(l, use.names, fill, idcol) :
Item 2 has 10 columns, inconsistent with item 1 which has 12 columns. To fill missing columns use fill=TRUE.
#view output (should only have 5 rows)
results_table
iteration_number random_1 random_2 random_3 random_4 split_1 split_2 split_3 a b c total
1: 1 95.67371 111.8133 94.00313 102.0569 0.8404564 0.6882731 0.7749321 0.8205128 0.6870229 0.7734554 0.73
Can someone please show me how to fix this problem? Why are these two codes producing different outputs?
Thanks