# Can someone please explain the difference between these two codes?

I generated some random data and wrote the following code which loops through a series of data manipulation steps and produces a table containing the results ("final_results"):

``````#code1
library(dplyr)

library(data.table)

set.seed(123)

# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)

####
results_table <- data.frame()

for (i in 1:10 ) {

#generate random numbers
random_1 =  runif(1, 80, 120)
random_2 =  runif(1, random_1, 120)
random_3 =  runif(1, 85, 120)
random_4 =  runif(1, random_3, 120)

#bin data according to random criteria
train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))

train_data\$cat = as.factor(train_data\$cat)

#new splits
a_table = train_data %>%
filter(cat == "a") %>%
select(a1, b1, c1, cat)

b_table = train_data %>%
filter(cat == "b") %>%
select(a1, b1, c1, cat)

c_table = train_data %>%
filter(cat == "c") %>%
select(a1, b1, c1, cat)

split_1 =  runif(1,0, 1)
split_2 =  runif(1, 0, 1)
split_3 =  runif(1, 0, 1)

#calculate 60th quantile ("quant") for each bin

table_a = data.frame(a_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_1)))

table_b = data.frame(b_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_2)))

table_c = data.frame(c_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_3)))

#create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
table_a\$diff = ifelse(table_a\$quant > table_a\$c1,1,0)
table_b\$diff = ifelse(table_b\$quant > table_b\$c1,1,0)
table_c\$diff = ifelse(table_c\$quant > table_c\$c1,1,0)

#group all tables

final_table = rbind(table_a, table_b, table_c)

#create a table: for each bin, calculate the average of "diff"
final_table_2 = data.frame(final_table %>%
group_by(cat) %>%
summarize(
mean = mean(diff)
))

#add "total mean" to this table
final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table\$diff)))

#format this table: add the random criteria to this table for reference
final_table_2\$random_1 = random_1

final_table_2\$random_2 = random_2

final_table_2\$random_3 = random_3

final_table_2\$random_4 = random_4

final_table_2\$split_1 = split_1

final_table_2\$split_2 = split_2

final_table_2\$split_3 = split_3

final_table_2\$iteration_number = i

results_table <- rbind(results_table, final_table_2)

final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')

}

#view output

final_results
``````

Problem : Now, I am trying to re-write the loop so that over the course of the loop:

• for each unique iteration:
• the "results_table" only keeps the rows corresponding to those iterations with the 5 largest value of "total mean"

Goal: For the "results_table" shown above, each "black box" represents a set of iterations, and each "red box" shows the value of the "total mean". To prevent the "results_table" from growing in size with each iteration, I would only like to keep rows corresponding to the 5 biggest values (in this example indicated with the red boxes).

I tried to combine "results_table" and "final_table" into a single step using the following code:

``````#code2
library(dplyr)

library(data.table)

set.seed(123)

# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)

####
results_table <- data.frame()

for (i in 1:10 ) {

#generate random numbers
random_1 =  runif(1, 80, 120)
random_2 =  runif(1, random_1, 120)
random_3 =  runif(1, 85, 120)
random_4 =  runif(1, random_3, 120)

#bin data according to random criteria
train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))

train_data\$cat = as.factor(train_data\$cat)

#new splits
a_table = train_data %>%
filter(cat == "a") %>%
select(a1, b1, c1, cat)

b_table = train_data %>%
filter(cat == "b") %>%
select(a1, b1, c1, cat)

c_table = train_data %>%
filter(cat == "c") %>%
select(a1, b1, c1, cat)

split_1 =  runif(1,0, 1)
split_2 =  runif(1, 0, 1)
split_3 =  runif(1, 0, 1)

#calculate 60th quantile ("quant") for each bin

table_a = data.frame(a_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_1)))

table_b = data.frame(b_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_2)))

table_c = data.frame(c_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_3)))

#create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
table_a\$diff = ifelse(table_a\$quant > table_a\$c1,1,0)
table_b\$diff = ifelse(table_b\$quant > table_b\$c1,1,0)
table_c\$diff = ifelse(table_c\$quant > table_c\$c1,1,0)

#group all tables

final_table = rbind(table_a, table_b, table_c)

#create a table: for each bin, calculate the average of "diff"
final_table_2 = data.frame(final_table %>%
group_by(cat) %>%
summarize(
mean = mean(diff)
))

#add "total mean" to this table
final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table\$diff)))

#format this table: add the random criteria to this table for reference
final_table_2\$random_1 = random_1

final_table_2\$random_2 = random_2

final_table_2\$random_3 = random_3

final_table_2\$random_4 = random_4

final_table_2\$split_1 = split_1

final_table_2\$split_2 = split_2

final_table_2\$split_3 = split_3

final_table_2\$iteration_number = i

results_table <- rbind(results_table, final_table_2)

results_table = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')

#keep the 5 biggest results (according to the "total" variable)

}

#view output (should only have 5 rows)

results_table
``````

But this results in an error and outputs a table with only one row:

``````Error in rbindlist(l, use.names, fill, idcol) :
Item 2 has 10 columns, inconsistent with item 1 which has 12 columns. To fill missing columns use fill=TRUE.

#view output (should only have 5 rows)

results_table

iteration_number random_1 random_2 random_3 random_4   split_1   split_2   split_3         a         b         c total
1:                1 95.67371 111.8133 94.00313 102.0569 0.8404564 0.6882731 0.7749321 0.8205128 0.6870229 0.7734554  0.73
``````

Can someone please show me how to fix this problem? Why are these two codes producing different outputs?

Thanks

Hi,

Apart from the fact that I think you can make this code a bit more efficient (but that's not the point), you can just add the desired filtering by adding this one line right after you update `final_results ` each iteration:

``````  ... prev code ...

#Combine results
final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
#Sort by total (descending) and only keep the top 5
final_results = final_results %>% arrange(desc(total)) %>% slice(1:5)

}

> final_results
iteration_number  random_1 random_2  random_3  random_4    split_1   split_2   split_3
1:                3  81.02645 110.4645 116.42006 119.61718 0.11943576 0.9762721 0.9100522
2:                8 102.17487 117.1701  95.93786  96.80284 0.81599406 0.7785768 0.8593795
3:                2  92.31360 110.0762 106.46871 109.53428 0.24615922 0.8777580 0.7847697
4:                1  95.67371 111.8133  94.00313 102.05692 0.84045638 0.6882731 0.7749321
5:                4  90.35986 116.7089 114.15588 116.72312 0.07675141 0.8661540 0.3236617
a         b         c total
1: 0.14285714 0.9758162 0.9103448 0.943
2: 0.81300813 0.7795276 0.8586667 0.843
3: 0.24731183 0.8777429 0.7840909 0.744
4: 0.82051282 0.6870229 0.7734554 0.730
5: 0.08139535 0.8658065 0.3207547 0.702
``````

Hope this helps,
PJ

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.