I wrote the following code in R which performs (loop) a series of data manipulation operations on some artificially generated data (the final output is called "final_results"):
#load library
library(dplyr)
library(data.table)
set.seed(123)
# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)
####
results_table <- data.frame()
for (i in 1:10 ) {
#generate random numbers
random_1 = runif(1, 80, 120)
random_2 = runif(1, random_1, 120)
random_3 = runif(1, 85, 120)
random_4 = runif(1, random_3, 120)
#bin data according to random criteria
train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
train_data$cat = as.factor(train_data$cat)
#new splits
a_table = train_data %>%
filter(cat == "a") %>%
select(a1, b1, c1, cat)
b_table = train_data %>%
filter(cat == "b") %>%
select(a1, b1, c1, cat)
c_table = train_data %>%
filter(cat == "c") %>%
select(a1, b1, c1, cat)
split_1 = runif(1,0, 1)
split_2 = runif(1, 0, 1)
split_3 = runif(1, 0, 1)
#calculate 60th quantile ("quant") for each bin
table_a = data.frame(a_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_1)))
table_b = data.frame(b_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_2)))
table_c = data.frame(c_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_3)))
#create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
#group all tables
final_table = rbind(table_a, table_b, table_c)
#create a table: for each bin, calculate the average of "diff"
final_table_2 = data.frame(final_table %>%
group_by(cat) %>%
summarize(
mean = mean(diff)
))
#add "total mean" to this table
final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
#format this table: add the random criteria to this table for reference
final_table_2$random_1 = random_1
final_table_2$random_2 = random_2
final_table_2$random_3 = random_3
final_table_2$random_4 = random_4
final_table_2$split_1 = split_1
final_table_2$split_2 = split_2
final_table_2$split_3 = split_3
final_table_2$iteration_number = i
results_table <- rbind(results_table, final_table_2)
final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
}
The above loop works perfectly fine - but I am trying to learn more about R and trying to re-write this loop using other functions from other libraries such as the "doParallel", "foreach"
and "purrr"
libraries.
Option 1:
I came across the following code in R which shows the general template for writing loops using the "purrr" library (apparently "map_df" is a function that uses the code from the loop):
#option 1
library(dplyr)
library(purrr)
library(tictoc)
data_gen <- function(){ #here you insert your data generating process
tibble(
x = runif(100),
y = runif(100)
)
}
N <- 10000 #number of datasets do be generated
tic('method A') #not necessary, measures the time of the code between 'tic' and 'toc'
output <- tibble(
i = 1:N
) %>%
split(.$i) %>%
map_df(
~data_gen()
)
toc()
However, I am not sure how this code can be adapted to fit my example. I first created the map_df
function:
#create map_df function:
map_df <- function() {
#bin data according to random criteria
train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
train_data$cat = as.factor(train_data$cat)
#new splits
a_table = train_data %>%
filter(cat == "a") %>%
select(a1, b1, c1, cat)
b_table = train_data %>%
filter(cat == "b") %>%
select(a1, b1, c1, cat)
c_table = train_data %>%
filter(cat == "c") %>%
select(a1, b1, c1, cat)
split_1 = runif(1,0, 1)
split_2 = runif(1, 0, 1)
split_3 = runif(1, 0, 1)
#calculate 60th quantile ("quant") for each bin
table_a = data.frame(a_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_1)))
table_b = data.frame(b_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_2)))
table_c = data.frame(c_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_3)))
#create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
#group all tables
final_table = rbind(table_a, table_b, table_c)
#create a table: for each bin, calculate the average of "diff"
final_table_2 = data.frame(final_table %>%
group_by(cat) %>%
summarize(
mean = mean(diff)
))
#add "total mean" to this table
final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
#format this table: add the random criteria to this table for reference
final_table_2$random_1 = random_1
final_table_2$random_2 = random_2
final_table_2$random_3 = random_3
final_table_2$random_4 = random_4
final_table_2$split_1 = split_1
final_table_2$split_2 = split_2
final_table_2$split_3 = split_3
final_table_2$iteration_number = i
results_table <- rbind(results_table, final_table_2)
final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
}
But when I try to run the general template, it produces the following error:
data_gen <- function(){ #here you insert your data generating process
tibble(
# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)
)
}
N <- 10000 #number of datasets do be generated
tic('method A') #not necessary, measures the time of the code between 'tic' and 'toc'
output <- tibble(
i = 1:N
) %>%
split(.$i) %>%
map_df(
~data_gen()
)
toc()
Error in map_df(., ~data_gen()) : unused arguments (., ~data_gen())
Does anyone know why this error is being produced?
Option 2 :
I am not sure how the "doParallel" and "foreach" libraries can be used in my example. It seems that all examples with "doParallel" require the user to begin by defining the number of "cores" they wish the computer will use:
library(doParallel)
cl <- makeCluster(2)
registerDoParallel(cl)
And in the end, the user has to instruct the computer to stop the process:
stopCluster(cl)
Beyond this, I am not sure how the "doParalell" and the "foreach" library can be used to benefit my example.
Can someone please show me this? Thanks