Using “Expand.Grid” and “mapply” to evaluate functions

swaheera · July 28, 2021, 5:29am

I am working with the R programming language. I am trying to evaluate a function I defined, at points contained in two grids I defined.

First, I created some sample data for this problem:

#load library
library(dplyr)
library(data.table)


# create some data for this example
set.seed(123)
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)

Then, I defined the two grids:

#grid_1 - does not work for some reason
random_1 <- seq(80,100,5)
random_2 <- seq(random_1,120,5)
random_3 <- seq(85,120,5)
random_4 <- seq(random_3,120,5)
split_1 =  seq(0,1,0.5)
split_2 =  seq(0,1,0.5)
split_3 =  seq(0,1,0.5)
DF_2 <- expand.grid(random_1 , random_2, random_3, random_4, split_1, split_2, split_3)


#grid_2
random_1 <- seq(80,100,5)
random_2 <- seq(85,120,5)
random_3 <- seq(85,120,5)
random_4 <- seq(90,120,5)
split_1 =  seq(0,1,0.5)
split_2 =  seq(0,1,0.5)
split_3 =  seq(0,1,0.5)
DF_1 <- expand.grid(random_1 , random_2, random_3, random_4, split_1, split_2, split_3)

Next, I defined the function ("grid_function") that I want to evaluate at the points contained in these two grids:

#### define function
results_table <- data.frame()

grid_function <- function(random_1, random_2, random_3, random_4, split_1, split_2, split_3) {
    

    
    #bin data according to random criteria
    train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    #new splits
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1, b1, c1, cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1, b1, c1, cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1, b1, c1, cat)
    

    #calculate random quantile ("quant") for each bin
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_1)))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_2)))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_3)))
    
    
    
    
    #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
    table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
    table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
    table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
    
    #group all tables
    
    final_table = rbind(table_a, table_b, table_c)
    
    #create a table: for each bin, calculate the average of "diff"
    final_table_2 = data.frame(final_table %>%
                                   group_by(cat) %>%
                                   summarize(
                                       mean = mean(diff)
                                   ))
    
    #add "total mean" to this table
    final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
    
    #format this table: add the random criteria to this table for reference
    final_table_2$random_1 = random_1
    
    final_table_2$random_2 = random_2
    
    final_table_2$random_3 = random_3
    
    final_table_2$random_4 = random_4
    
    final_table_2$split_1 = split_1
    
    final_table_2$split_2 = split_2
    
    final_table_2$split_3 = split_3
    
    final_table_2$iteration_number = i
    
    
    results_table <- rbind(results_table, final_table_2)
    
    final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
    
}

Question: Now, I am trying to evaluate "grid_function" at every point contained within "DF_1" and "DF_2". I tried two different ways to do this, but nothing seemed to work.

Method 1 :

group_by(DF_2, random_1, random_2, random_3, random_4, split_1, split_2, split_3) %>% dplyr::mutate(z = grid_function(random_1, random_2, random_3, random_4, split_1, split_2, split_3))

Error: Must group by variables found in `.data`.
* Column `random_1` is not found.
* Column `random_2` is not found.
* Column `random_3` is not found.
* Column `random_4` is not found.
* Column `split_1` is not found.
* Column `split_2` is not found.
* Column `split_3` is not found.
Run `rlang::last_error()` to see where the error occurred.

Method 2:

 do.call(mapply, c(function(random_1, random_2, random_3, random_4, split_1, split_2, split_3)) , unname(DF_2)))

Error: unexpected ')' in "do.call(mapply, c(function(random_1, random_2, random_3, random_4, split_1, split_2, split_3))"

Can someone please show me how to fix this?

Thanks

Yarnabrina · July 28, 2021, 7:46am

I'm not sure what you want, because the final calculation of your function is going to return a data.table instead of a scalar. May be that's what your requirement is, but I think that's not a "minimal" example for your question in the title.

Are you looking for something like this?

# define different possible values for parameters
x <- 1:2
y <- seq(from = 0.25, to = 1, by = 0.25)
z <- letters[3:5]

# define the entire space of the parameters
space <- expand.grid(x, y, z)

# define function with parameters being arguments
fn <- function(arg1, arg2, arg3) {
  arg_vals <- c(arg1, arg2, arg3)
  args <- c("arg1", "arg2", "arg3")
  paste(args, arg_vals, sep = ":", collapse = ",")
}

# apply function
space["do_call_apply"] <- apply(
  X = space,
  MARGIN = 1,
  FUN = \(x) do.call(
    what = fn,
    args = unname(
      obj = as.list(x = x),
      force = TRUE
    )
  )
)

# results
space
#>    Var1 Var2 Var3           do_call_apply
#> 1     1 0.25    c arg1:1,arg2:0.25,arg3:c
#> 2     2 0.25    c arg1:2,arg2:0.25,arg3:c
#> 3     1 0.50    c arg1:1,arg2:0.50,arg3:c
#> 4     2 0.50    c arg1:2,arg2:0.50,arg3:c
#> 5     1 0.75    c arg1:1,arg2:0.75,arg3:c
#> 6     2 0.75    c arg1:2,arg2:0.75,arg3:c
#> 7     1 1.00    c arg1:1,arg2:1.00,arg3:c
#> 8     2 1.00    c arg1:2,arg2:1.00,arg3:c
#> 9     1 0.25    d arg1:1,arg2:0.25,arg3:d
#> 10    2 0.25    d arg1:2,arg2:0.25,arg3:d
#> 11    1 0.50    d arg1:1,arg2:0.50,arg3:d
#> 12    2 0.50    d arg1:2,arg2:0.50,arg3:d
#> 13    1 0.75    d arg1:1,arg2:0.75,arg3:d
#> 14    2 0.75    d arg1:2,arg2:0.75,arg3:d
#> 15    1 1.00    d arg1:1,arg2:1.00,arg3:d
#> 16    2 1.00    d arg1:2,arg2:1.00,arg3:d
#> 17    1 0.25    e arg1:1,arg2:0.25,arg3:e
#> 18    2 0.25    e arg1:2,arg2:0.25,arg3:e
#> 19    1 0.50    e arg1:1,arg2:0.50,arg3:e
#> 20    2 0.50    e arg1:2,arg2:0.50,arg3:e
#> 21    1 0.75    e arg1:1,arg2:0.75,arg3:e
#> 22    2 0.75    e arg1:2,arg2:0.75,arg3:e
#> 23    1 1.00    e arg1:1,arg2:1.00,arg3:e
#> 24    2 1.00    e arg1:2,arg2:1.00,arg3:e

HanOostdijk · July 28, 2021, 8:24am

Hello @Yarnabrina ,

excellent example!
And I also like your remark about posting a (most) minimal example of the problem for this OP.

swaheera · July 28, 2021, 4:40pm

Thank you everyone for your replies! Just to clarify, here is what I am trying to do:

I wrote this loop that evaluates the following "function: (it's actually a "loop") 100 times, at randomly selected inputs for "random_1, random_2, random_3, random_4, split_1, split_2, split_3":


#load library
library(dplyr)

library(data.table)

set.seed(123)

# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)


####
results_table <- data.frame()

for (i in 1:100 ) {
    
    #generate random numbers
    random_1 =  runif(1, 80, 120)
    random_2 =  runif(1, random_1, 120)
    random_3 =  runif(1, 85, 120)
    random_4 =  runif(1, random_3, 120)
    
    #bin data according to random criteria
    train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    #new splits
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1, b1, c1, cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1, b1, c1, cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1, b1, c1, cat)
    
    split_1 =  runif(1,0, 1)
    split_2 =  runif(1, 0, 1)
    split_3 =  runif(1, 0, 1)
    
    #calculate 60th quantile ("quant") for each bin
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_1)))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_2)))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_3)))
    
    
    
    
    #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
    table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
    table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
    table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
    
    #group all tables
    
    final_table = rbind(table_a, table_b, table_c)
    
    #create a table: for each bin, calculate the average of "diff"
    final_table_2 = data.frame(final_table %>%
                                   group_by(cat) %>%
                                   summarize(
                                       mean = mean(diff)
                                   ))
    
    #add "total mean" to this table
    final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
    
    #format this table: add the random criteria to this table for reference
    final_table_2$random_1 = random_1
    
    final_table_2$random_2 = random_2
    
    final_table_2$random_3 = random_3
    
    final_table_2$random_4 = random_4
    
    final_table_2$split_1 = split_1
    
    final_table_2$split_2 = split_2
    
    final_table_2$split_3 = split_3
    
    final_table_2$iteration_number = i
    
    
    results_table <- rbind(results_table, final_table_2)
    
    final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
    
}

Here is how the results of the above code look like:

head(final_results)
   iteration_number random_1 random_2  random_3 random_4    split_1   split_2   split_3          a         b         c total
1:                1 95.67371 111.8133  94.00313 102.0569 0.84045638 0.6882731 0.7749321 0.82051282 0.6870229 0.7734554 0.730
2:                2 92.31360 110.0762 106.46871 109.5343 0.24615922 0.8777580 0.7847697 0.24731183 0.8777429 0.7840909 0.744
3:                3 81.02645 110.4645 116.42006 119.6172 0.11943576 0.9762721 0.9100522 0.14285714 0.9758162 0.9103448 0.943
4:                4 90.35986 116.7089 114.15588 116.7231 0.07675141 0.8661540 0.3236617 0.08139535 0.8658065 0.3207547 0.702
5:                5 89.28374 114.7103 119.70448 119.7725 0.08881443 0.6351936 0.8565509 0.09027778 0.6349614 0.8461538 0.573
6:                6 87.35767 103.8575  97.44462 116.0414 0.48372890 0.2319129 0.2701634 0.47368421 0.2326333 0.2711370 0.255

Here is my question: Instead of evaluating the above function at "randomly selected points", I want to evaluate this function at points defined within the a grid.

First, I defined the gird:

#grid_2
random_1 <- seq(80,100,5)
random_2 <- seq(85,120,5)
random_3 <- seq(85,120,5)
random_4 <- seq(90,120,5)
split_1 =  seq(0.4,1,0.2)
split_2 =  seq(0.4,1,0.2)
split_3 =  seq(0.4,1,0.2)
DF_1 <- expand.grid(random_1 , random_2, random_3, random_4, split_1, split_2, split_3)

> head(DF_1)
  Var1 Var2 Var3 Var4 Var5 Var6 Var7
1   80   85   85   90  0.4  0.4  0.4
2   85   85   85   90  0.4  0.4  0.4
3   90   85   85   90  0.4  0.4  0.4
4   95   85   85   90  0.4  0.4  0.4
5  100   85   85   90  0.4  0.4  0.4
6   80   90   85   90  0.4  0.4  0.4

Next, I converted that "loop" into a "function"


results_table <- data.frame()

grid_function <- function(random_1, random_2, random_3, random_4, split_1, split_2, split_3) {
    

    
    #bin data according to random criteria
    train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    #new splits
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1, b1, c1, cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1, b1, c1, cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1, b1, c1, cat)
    

    #calculate random quantile ("quant") for each bin
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_1)))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_2)))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_3)))
    
    
    
    
    #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
    table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
    table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
    table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
    
    #group all tables
    
    final_table = rbind(table_a, table_b, table_c)
    
    #create a table: for each bin, calculate the average of "diff"
    final_table_2 = data.frame(final_table %>%
                                   group_by(cat) %>%
                                   summarize(
                                       mean = mean(diff)
                                   ))
    
    #add "total mean" to this table
    final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
    
    #format this table: add the random criteria to this table for reference
    final_table_2$random_1 = random_1
    
    final_table_2$random_2 = random_2
    
    final_table_2$random_3 = random_3
    
    final_table_2$random_4 = random_4
    
    final_table_2$split_1 = split_1
    
    final_table_2$split_2 = split_2
    
    final_table_2$split_3 = split_3
    
    final_table_2$iteration_number = i
    
    
    results_table <- rbind(results_table, final_table_2)
    
    final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
    
}

Problem: Instead of evaluating the function at randomly selected points, such as " 95.67371 111.8133 94.00313 102.0569 0.84045638 0.6882731 0.7749321 0.82051282 0.6870229 0.7734554 0.730" , I want to evaluate the function at points defined within the grid, e.g. " 80 85 85 90 0.4 0.4 0.4"

Can you please show me how to do this?

Thank you so much for your help!

HanOostdijk · July 28, 2021, 5:00pm

Thank you for restating your problem (as a non-minimal example).
See again to the answer of @Yarnabrina for the solution

swaheera · July 28, 2021, 5:10pm

Thank you for your reply! I am trying to apply the answer kindly provided by @Yarnabrina , but I am having some difficulties. Can you please suggest something?

I will keep trying in the meantime.

Just want to thank everyone for their help! It is very kind!

swaheera · July 29, 2021, 6:47am

@Yarnabrina : thank you for your reply! I was able to figure out how to solve this problem! I will post the code in the morning

swaheera · July 29, 2021, 2:47pm

Please see the code below:

library(dplyr)
library(data.table)

results_table <- data.frame()

grid_function <- function(train_data, random_1, random_2, random_3, random_4, split_1, split_2, split_3) {
    
    
    
    #bin data according to random criteria
    train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    #new splits
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1, b1, c1, cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1, b1, c1, cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1, b1, c1, cat)
    
    
    #calculate random quantile ("quant") for each bin
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_1)))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_2)))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1, prob = split_3)))
    
    
    
    
    #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
    table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
    table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
    table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
    
    #group all tables
    
    final_table = rbind(table_a, table_b, table_c)
    
    #create a table: for each bin, calculate the average of "diff"
    final_table_2 = data.frame(final_table %>%
                                   group_by(cat) %>%
                                   summarize(
                                       mean = mean(diff)
                                   ))
    
    #add "total mean" to this table
    final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
    
    #format this table: add the random criteria to this table for reference
    final_table_2$random_1 = random_1
    
    final_table_2$random_2 = random_2
    
    final_table_2$random_3 = random_3
    
    final_table_2$random_4 = random_4
    
    final_table_2$split_1 = split_1
    
    final_table_2$split_2 = split_2
    
    final_table_2$split_3 = split_3
    
    
    
    
    results_table <- rbind(results_table, final_table_2)
    
    final_results = dcast(setDT(results_table), random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
    
}

# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)

#grid


#grid_2
random_1 <- seq(80,100,5)
random_2 <- seq(85,120,5)
random_3 <- seq(85,120,5)
random_4 <- seq(90,120,5)
split_1 =  seq(0,1,0.1)
split_2 =  seq(0,1,0.1)
split_3 =  seq(0,1,0.1)
DF_1 <- expand.grid(random_1 , random_2, random_3, random_4, split_1, split_2, split_3)

colnames(DF_1) <- c("random_1" , "random_2", "random_3",                     "random_4", "split_1", "split_2", "split_3")

train_data_new <- copy(train_data)


resultdf1 <- apply(DF_1,1, # 1 means rows
                   FUN=function(x){
                       do.call(
                           # Call Function grid_function2 with the arguments in
                           # a list
                           grid_function,
                           # force list type for the arguments
                           c(list(train_data_new), as.list(
                               # make the row to a named vector
                               unlist(x)
                           )
                           ))
                   }
)

l = resultdf1
final_output = rbindlist(l, fill = TRUE)

system · August 19, 2021, 2:47pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.