Making a Loop more Efficient

swaheera · July 5, 2021, 6:05am

I am working with R. I wrote the following loop over here (using some randomly created data) that iterates through some data manipulation steps and produces a desired table called "final_results" :

#load library
    library(dplyr)

library(data.table)

set.seed(123)

# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)


####
results_table <- data.frame()

for (i in 1:10 ) {
    
    #generate random numbers
    random_1 =  runif(1, 80, 120)
    random_2 =  runif(1, random_1, 120)
    random_3 =  runif(1, 85, 120)
    random_4 =  runif(1, random_3, 120)
    
    #bin data according to random criteria
    train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    #new splits
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1, b1, c1, cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1, b1, c1, cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1, b1, c1, cat)
    
    split_1 =  runif(1,0, 1)
    split_2 =  runif(1, 0, 1)
    split_3 =  runif(1, 0, 1)
    
    #calculate 60th quantile ("quant") for each bin
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_1)))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_2)))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_3)))
    
    
    
    
    #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
    table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
    table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
    table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
    
    #group all tables
    
    final_table = rbind(table_a, table_b, table_c)
    
    #create a table: for each bin, calculate the average of "diff"
    final_table_2 = data.frame(final_table %>%
                                   group_by(cat) %>%
                                   summarize(
                                       mean = mean(diff)
                                   ))
    
    #add "total mean" to this table
    final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
    
    #format this table: add the random criteria to this table for reference
    final_table_2$random_1 = random_1
    
    final_table_2$random_2 = random_2
    
    final_table_2$random_3 = random_3
    
    final_table_2$random_4 = random_4
    
    final_table_2$split_1 = split_1
    
    final_table_2$split_2 = split_2
    
    final_table_2$split_3 = split_3
    
    final_table_2$iteration_number = i
    
    
    results_table <- rbind(results_table, final_table_2)
    
    final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
    
}

In the above code, I ran the loop 10 times. In the future, I would be interested in running this loop around 1,000,000 times. Apart from buying a stronger computer, is it possible that this code can be re-written in such a way that makes it less "heavy" for the computer to process? Can this code be made more efficient by storing the intermediate steps a different way? Is there anything that can be done to speedup the runtime for this code?

Thanks

startz · July 6, 2021, 4:08pm

Run the profiler on your code to see which lines are taking the most time.

My guess would be that the runif() functions are expensive and that it would speed things up to generate all your random numbers outside the loop, save them in a vector, and then get the next random number by indexing the vector. But humans are notoriously bad at guessing what part of code is expensive, hence the suggestion to run the profiler.

michaelbgarcia · July 7, 2021, 4:48pm

So you don't really need a loop here. I think you can get everything you want with normal vectorized operations. Pair that with the speed of data.table and you are golden. I see you wanted to use data.table, but the constant switching between classes renders it useless, except for that last dcast part. Here is a mostly pure data.table solution. Note, I am no data.table expert so am positive this could be optimized even further.

I used a starting point of 1000 iterations, which completes in 2 seconds.



library(data.table)
library(tictoc)

tic() # start timer

# Change this for # of iterations
num_iteration = 1000

set.seed(123)

# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)

# create train dt
train_data = data.table(a1,b1,c1, dum = 1)

# create template table with iterations
i = data.table(iteration= 1:num_iteration, dum = 1)

# full cartesian join to get all combinations of iterations and train data
df = train_data[i, on = "dum", allow.cartesian = TRUE]

# set key
setkey(df, key = "iteration")

df[, random_1 := runif(1, 80, 120), by = "iteration"]
df[, random_2 := runif(1, random_1, 120), by = "iteration"]
df[, random_3 := runif(1, 85, 120), by = "iteration"]
df[, random_4 := runif(1, random_3, 120), by = "iteration"]

df[, split_1 :=  runif(1, 0, 1), by = "iteration"]
df[, split_2 :=  runif(1, 0, 1), by = "iteration"]
df[, split_3 :=  runif(1, 0, 1), by = "iteration"]

df[, cat := ifelse(a1 <= random_1 & b1 <= random_3, "a", 
                                         ifelse(a1 <= random_2 & b1 <= random_4, "b",
                                                "c")
                            )]

df[cat == "a", quant := quantile(c1, prob = split_1)]
df[cat == "a", diff := quant > c1]

df[cat == "b", quant := quantile(c1, prob = split_2)]
df[cat == "b", diff := quant > c1]
  
df[cat == "c", quant := quantile(c1, prob = split_3)]
df[cat == "c", diff := quant > c1]

df_agg = df[,.(mean =mean(diff, na.rm = TRUE)), by = .(iteration, cat)]

df_agg[, total := mean(mean, na.rm = TRUE), by = .(iteration)]

df_cast = dcast(df_agg, iteration + total ~ cat, value.var = 'mean')

df_unique = unique(df, by = c("iteration"))[,c("iteration", "random_1", "random_2", "random_3", "split_1", "split_2", "split_3")]

final_results = df_cast[df_unique, on="iteration", nomatch=NULL] 

print(final_results)
#>       iteration     total          a         b          c  random_1
#>    1:         1 0.3080857 0.00000000 0.8219178 0.10233918  95.67371
#>    2:         2 0.4748137 0.73825503 0.6111111 0.07507508 106.53849
#>    3:         3 0.1973123 0.01744186 0.4356061 0.13888889  90.28929
#>    4:         4 0.5142243 0.55625000 0.4576271 0.52879581  92.39194
#>    5:         5 0.3877325 0.07665904 0.4615385 0.62500000 113.61826
#>   ---                                                              
#>  996:       996 0.4428956 0.35434783 0.8036072 0.17073171  99.15899
#>  997:       997 0.5613058 0.66666667 0.2962963 0.72095436 118.62009
#>  998:       998 0.4462901 0.26744186 0.3333333 0.73809524 116.54816
#>  999:       999 0.5215304 0.41428571 0.8586387 0.29166667  89.12317
#> 1000:      1000 0.4498091 0.90384615 0.2115385 0.23404255  92.29038
#>       random_2  random_3    split_1   split_2    split_3
#>    1: 104.1073  86.14198 0.22345413 0.8230599 0.09045242
#>    2: 107.1270  98.86750 0.72568845 0.7255776 0.07787107
#>    3: 118.9194 118.56232 0.01756418 0.4398307 0.19976866
#>    4: 114.9078 103.28158 0.54330321 0.4567068 0.53717210
#>    5: 117.9242 108.58274 0.08132953 0.4804348 0.62339557
#>   ---                                                   
#>  996: 118.6605 115.49544 0.36560142 0.7991917 0.12332451
#>  997: 119.9159  88.58999 0.87070697 0.2470474 0.70984699
#>  998: 117.9547 118.84771 0.27297641 0.4942125 0.76537063
#>  999: 112.9656 118.21472 0.39332239 0.8632909 0.33156372
#> 1000: 103.0477 117.78547 0.89226660 0.2224833 0.21636146

toc() # End timer
#> 2.14 sec elapsed

^{Created on 2021-07-07 by the reprex package (v0.3.0)}

system · July 28, 2021, 4:49pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.