Can someone please explain the difference between these two codes?

swaheera · July 6, 2021, 2:01am

I generated some random data and wrote the following code which loops through a series of data manipulation steps and produces a table containing the results ("final_results"):

#code1
#load library
    library(dplyr)

library(data.table)

set.seed(123)

# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)


####
results_table <- data.frame()

for (i in 1:10 ) {
    
    #generate random numbers
    random_1 =  runif(1, 80, 120)
    random_2 =  runif(1, random_1, 120)
    random_3 =  runif(1, 85, 120)
    random_4 =  runif(1, random_3, 120)
    
    #bin data according to random criteria
    train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    #new splits
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1, b1, c1, cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1, b1, c1, cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1, b1, c1, cat)
    
    split_1 =  runif(1,0, 1)
    split_2 =  runif(1, 0, 1)
    split_3 =  runif(1, 0, 1)
    
    #calculate 60th quantile ("quant") for each bin
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_1)))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_2)))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_3)))
    
    
    
    
    #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
    table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
    table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
    table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
    
    #group all tables
    
    final_table = rbind(table_a, table_b, table_c)
    
    #create a table: for each bin, calculate the average of "diff"
    final_table_2 = data.frame(final_table %>%
                                   group_by(cat) %>%
                                   summarize(
                                       mean = mean(diff)
                                   ))
    
    #add "total mean" to this table
    final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
    
    #format this table: add the random criteria to this table for reference
    final_table_2$random_1 = random_1
    
    final_table_2$random_2 = random_2
    
    final_table_2$random_3 = random_3
    
    final_table_2$random_4 = random_4
    
    final_table_2$split_1 = split_1
    
    final_table_2$split_2 = split_2
    
    final_table_2$split_3 = split_3
    
    final_table_2$iteration_number = i
    
    
    results_table <- rbind(results_table, final_table_2)
   
    
    final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')

    
}

#view output 

final_results

Problem : Now, I am trying to re-write the loop so that over the course of the loop:

for each unique iteration:
the "results_table" only keeps the rows corresponding to those iterations with the 5 largest value of "total mean"

E.g. https://i.stack.imgur.com/EyUX3.png

Goal: For the "results_table" shown above, each "black box" represents a set of iterations, and each "red box" shows the value of the "total mean". To prevent the "results_table" from growing in size with each iteration, I would only like to keep rows corresponding to the 5 biggest values (in this example indicated with the red boxes).

I tried to combine "results_table" and "final_table" into a single step using the following code:

#code2
#load library
library(dplyr)

library(data.table)

set.seed(123)

# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)


####
results_table <- data.frame()

for (i in 1:10 ) {
    
    #generate random numbers
    random_1 =  runif(1, 80, 120)
    random_2 =  runif(1, random_1, 120)
    random_3 =  runif(1, 85, 120)
    random_4 =  runif(1, random_3, 120)
    
    #bin data according to random criteria
    train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    #new splits
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1, b1, c1, cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1, b1, c1, cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1, b1, c1, cat)
    
    split_1 =  runif(1,0, 1)
    split_2 =  runif(1, 0, 1)
    split_3 =  runif(1, 0, 1)
    
    #calculate 60th quantile ("quant") for each bin
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_1)))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_2)))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_3)))
    
    
    
    
    #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
    table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
    table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
    table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
    
    #group all tables
    
    final_table = rbind(table_a, table_b, table_c)
    
    #create a table: for each bin, calculate the average of "diff"
    final_table_2 = data.frame(final_table %>%
                                   group_by(cat) %>%
                                   summarize(
                                       mean = mean(diff)
                                   ))
    
    #add "total mean" to this table
    final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
    
    #format this table: add the random criteria to this table for reference
    final_table_2$random_1 = random_1
    
    final_table_2$random_2 = random_2
    
    final_table_2$random_3 = random_3
    
    final_table_2$random_4 = random_4
    
    final_table_2$split_1 = split_1
    
    final_table_2$split_2 = split_2
    
    final_table_2$split_3 = split_3
    
    final_table_2$iteration_number = i
    
    
    results_table <- rbind(results_table, final_table_2)
    
    
    results_table = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
    
    #keep the 5 biggest results (according to the "total" variable)
    results_table <- head(results_table[order(-total)], 5)
    
}

#view output (should only have 5 rows)

results_table

But this results in an error and outputs a table with only one row:

Error in rbindlist(l, use.names, fill, idcol) : 
  Item 2 has 10 columns, inconsistent with item 1 which has 12 columns. To fill missing columns use fill=TRUE.

 #view output (should only have 5 rows)

  results_table
 
  iteration_number random_1 random_2 random_3 random_4   split_1   split_2   split_3         a         b         c total
1:                1 95.67371 111.8133 94.00313 102.0569 0.8404564 0.6882731 0.7749321 0.8205128 0.6870229 0.7734554  0.73

Can someone please show me how to fix this problem? Why are these two codes producing different outputs?

Thanks

pieterjanvc · July 8, 2021, 12:36pm

Hi,

Apart from the fact that I think you can make this code a bit more efficient (but that's not the point), you can just add the desired filtering by adding this one line right after you update final_results each iteration:

  ... prev code ...

  #Combine results
  final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
  #Sort by total (descending) and only keep the top 5
  final_results = final_results %>% arrange(desc(total)) %>% slice(1:5)
  
}

> final_results
   iteration_number  random_1 random_2  random_3  random_4    split_1   split_2   split_3
1:                3  81.02645 110.4645 116.42006 119.61718 0.11943576 0.9762721 0.9100522
2:                8 102.17487 117.1701  95.93786  96.80284 0.81599406 0.7785768 0.8593795
3:                2  92.31360 110.0762 106.46871 109.53428 0.24615922 0.8777580 0.7847697
4:                1  95.67371 111.8133  94.00313 102.05692 0.84045638 0.6882731 0.7749321
5:                4  90.35986 116.7089 114.15588 116.72312 0.07675141 0.8661540 0.3236617
            a         b         c total
1: 0.14285714 0.9758162 0.9103448 0.943
2: 0.81300813 0.7795276 0.8586667 0.843
3: 0.24731183 0.8777429 0.7840909 0.744
4: 0.82051282 0.6870229 0.7734554 0.730
5: 0.08139535 0.8658065 0.3207547 0.702

Hope this helps,
PJ

system · July 29, 2021, 12:36pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.