Filtering a dataframe in a loop and apply a function or applying a function on each element of a dataframe (?)

Hi @Sanjmeh and @nirgrahamuk
Hereunder a shorter version
(I cannot do dput(results) because the output is too long)

rm(list = ls())

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
#> Error: RStudio not running
getwd()
#> [1] "C:/Users/Angela/AppData/Local/Temp/RtmpaYEQ7v/reprex-62941f5223ea-hurt-puma"

#load required packages 
library(mc2d)
#> Loading required package: mvtnorm
#> 
#> Attaching package: 'mc2d'
#> The following objects are masked from 'package:base':
#> 
#>     pmax, pmin
library(gplots)
#> 
#> Attaching package: 'gplots'
#> The following object is masked from 'package:stats':
#> 
#>     lowess
library(RColorBrewer)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(reprex)
library(tidyverse)
set.seed(99)
iters<-1000

df<-data.frame(id=c(1:30),month=c(1:6,1,6,4,1,5,2,3,2,5,4,6,3:6,4:6,1:5,5),n=rpois(30,5))

df$n[df$n == "0"] <- 3
se<-rbeta(iters,96,6)
rm(list = ls())

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
#> Error: RStudio not running
#> Error: RStudio not running
getwd()
#> [1] "C:/Users/Angela/AppData/Local/Temp/RtmpaYEQ7v/reprex-62941f5223ea-hurt-puma"
#> [1] "C:/Users/Angela/AppData/Local/Temp/RtmpyYediY/reprex-c2c7c6e57f6-bosky-moa"

#load required packages 
library(mc2d)
#> Loading required package: mvtnorm
#> 
#> Attaching package: 'mc2d'
#> The following objects are masked from 'package:base':
#> 
#>     pmax, pmin
library(gplots)
#> 
#> Attaching package: 'gplots'
#> The following object is masked from 'package:stats':
#> 
#>     lowess
library(RColorBrewer)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(reprex)
library(tidyverse)
set.seed(99)
iters<-1000

df<-data.frame(id=c(1:30),month=c(1:6,1,6,4,1,5,2,3,2,5,4,6,3:6,4:6,1:5,5),n=rpois(30,5))

df$n[df$n == "0"] <- 3
se<-rbeta(iters,96,6)
epi.a<-rpert(iters,min=1.5, mode=2, max=3)
p=0.2
p2=epi.a*p

##my try the idea is to loop over the months 

results<-data.frame(m1=numeric(iters))
results<-cbind(results,rep(results[1],5))
colnames(results)<-paste("m", sep = "_", 1:6)

for (j in 1:6) {
  for (i in 1:iters) {
    if (df$month[i]== "1")results$m_1[i]<- 1 - (1 - se[i] * p2[i])^df$n[i]
    else if (df$month[i]== "2")results$m_2[i]<- 1 - (1 - se[i] * p2[i])^df$n[i]
    else if(df$month[i]== "3")results$m_3[i]<- 1 - (1 - se[i] * p2[i])^df$n[i]
    else if(df$month[i]== "4")results$m_4[i]<- 1 - (1 - se[i] * p2[i])^df$n[i]
    else if(df$month[i]== "5")results$m_5[i]<- 1 - (1 - se[i] * p2[i])^df$n[i]
    else if(df$month[i]== "6")results$m_6[i]<- 1 - (1 - se[i] * p2[i])^df$n[i]
  }
  
}
#> Error in if (df$month[i] == "1") results$m_1[i] <- 1 - (1 - se[i] * p2[i])^df$n[i] else if (df$month[i] == : missing value where TRUE/FALSE needed

Created on 2022-05-03 by the reprex package (v2.0.1)

Hi @nirgrahamuk
The month is not supposed to be an element of the function.
It is just used to to subset the data.
Then for each month the function should be applied.
Do you know I can code it ?
Thanks
Angela

I think I happened to have some experience dealing with this situation.

Fortunately, with the help of nested data feature form dplyr, you don't need to split the original data.frame and iter through every new dataframe, you just need to mutate it a list-columns, in the new list-column, each element contains a full set of the result correspond to information from each row.

Besides, we need a little help with the map function from purrr package to simplify the iteration coding.

Is this what you want?

library(tidyverse)

df <- tibble::tribble( 
  ~id,     ~c, ~month,  ~n,             ~s,
  10076, 'Other',     1,  1, 'Other_Breeder',
  10233, 'Other',     1,  1, 'Other_Breeder',
  15590, 'Other',     1,  1, 'Other_Breeder',
  20373, 'Other',     1,  1, 'Other_Breeder',
  21161, 'Other',     1,  1, 'Other_Breeder',
  22057, 'Other',     1,  1, 'Other_Breeder',
  22929, 'Other',     1,  1, 'Other_Breeder',
)

set.seed(777)

a <- rnorm(10000)

s <- rnorm(10000,100,50)

# this defined function ensures any `n` from `df` will be itered with 10000 s and a and generated 10000 results
iter_n <- function(n) map2_dbl(.x = a, .y = s, ~ 1 - (1 - .x * .y) ^ n)

df_1 <- df %>% mutate(Result = map(n, ~iter_n(.x)))

df_1
# A tibble: 7 x 6
     id c     month     n s             Result        
  <dbl> <chr> <dbl> <dbl> <chr>         <list>        
1 10076 Other     1     1 Other_Breeder <dbl [10,000]>
2 10233 Other     1     1 Other_Breeder <dbl [10,000]>
3 15590 Other     1     1 Other_Breeder <dbl [10,000]>
4 20373 Other     1     1 Other_Breeder <dbl [10,000]>
5 21161 Other     1     1 Other_Breeder <dbl [10,000]>
6 22057 Other     1     1 Other_Breeder <dbl [10,000]>
7 22929 Other     1     1 Other_Breeder <dbl [10,000]>

df_1 %>% unnest(Result)
# A tibble: 70,000 x 6
      id c     month     n s             Result
   <dbl> <chr> <dbl> <dbl> <chr>          <dbl>
 1 10076 Other     1     1 Other_Breeder   4.64
 2 10076 Other     1     1 Other_Breeder -39.0 
 3 10076 Other     1     1 Other_Breeder 115.  
 4 10076 Other     1     1 Other_Breeder -46.2 
 5 10076 Other     1     1 Other_Breeder 362.  
 6 10076 Other     1     1 Other_Breeder  48.1 
 7 10076 Other     1     1 Other_Breeder  10.8 
 8 10076 Other     1     1 Other_Breeder 128.  
 9 10076 Other     1     1 Other_Breeder -10.8 
10 10076 Other     1     1 Other_Breeder -45.2 
# ... with 69,990 more rows

you can as well treat the list-column to some extended calculations:

df_1 %>% mutate(Result %>% map_dfr( ~ data.frame(
  Mean = mean(.x),
  Q_025 = quantile(.x, 0.025),
  Q_975 = quantile(.x, 0.975)
)))
# A tibble: 7 x 9
     id c     month     n s             Result          Mean Q_025 Q_975
  <dbl> <chr> <dbl> <dbl> <chr>         <list>         <dbl> <dbl> <dbl>
1 10076 Other     1     1 Other_Breeder <dbl [10,000]> -1.52 -241.  234.
2 10233 Other     1     1 Other_Breeder <dbl [10,000]> -1.52 -241.  234.
3 15590 Other     1     1 Other_Breeder <dbl [10,000]> -1.52 -241.  234.
4 20373 Other     1     1 Other_Breeder <dbl [10,000]> -1.52 -241.  234.
5 21161 Other     1     1 Other_Breeder <dbl [10,000]> -1.52 -241.  234.
6 22057 Other     1     1 Other_Breeder <dbl [10,000]> -1.52 -241.  234.
7 22929 Other     1     1 Other_Breeder <dbl [10,000]> -1.52 -241.  234.

Hi @yifanliu thanks a lot .. I think that it is exactly what I was looking for !!! however I have tried the code with my dataframe and though everything seems to go smoothly , with no errors, the results columns is still a list and the unnest function doesn't work .. how can I access the results for each month? I think that the part of grouping per months is also missing.. the idea is to have a vector of 10000 values for each month which may be stored in the list ( which should thus contain 6 elements equal to the number of months) Thanks a lot
Angela

the list-column is brief in code but easily lost, better makes sure that each element in your Result list-column is not another list, otherwise you should use unnest function repeatedly.

in short, little change in the code will result in a compeletely different data structure of Result:

df_1 <- df %>% mutate(Result = map(n, ~iter_n(.x)))
> df_1
# A tibble: 7 x 6
     id c     month     n s             Result        
  <dbl> <chr> <dbl> <dbl> <chr>         <list>        
1 10076 Other     1     1 Other_Breeder <dbl [10,000]>
2 10233 Other     1     1 Other_Breeder <dbl [10,000]>
3 15590 Other     1     1 Other_Breeder <dbl [10,000]>
4 20373 Other     1     1 Other_Breeder <dbl [10,000]>
5 21161 Other     1     1 Other_Breeder <dbl [10,000]>
6 22057 Other     1     1 Other_Breeder <dbl [10,000]>
7 22929 Other     1     1 Other_Breeder <dbl [10,000]>

df_2 <- df %>% mutate(Result = map(n, ~ list(iter_n(.x))))
> df_2
# A tibble: 7 x 6
     id c     month     n s             Result    
  <dbl> <chr> <dbl> <dbl> <chr>         <list>    
1 10076 Other     1     1 Other_Breeder <list [1]>
2 10233 Other     1     1 Other_Breeder <list [1]>
3 15590 Other     1     1 Other_Breeder <list [1]>
4 20373 Other     1     1 Other_Breeder <list [1]>
5 21161 Other     1     1 Other_Breeder <list [1]>
6 22057 Other     1     1 Other_Breeder <list [1]>
7 22929 Other     1     1 Other_Breeder <list [1]>

df_2 %>% unnest(Result)
# A tibble: 7 x 6
     id c     month     n s             Result        
  <dbl> <chr> <dbl> <dbl> <chr>         <list>        
1 10076 Other     1     1 Other_Breeder <dbl [10,000]>
2 10233 Other     1     1 Other_Breeder <dbl [10,000]>
3 15590 Other     1     1 Other_Breeder <dbl [10,000]>
4 20373 Other     1     1 Other_Breeder <dbl [10,000]>
5 21161 Other     1     1 Other_Breeder <dbl [10,000]>
6 22057 Other     1     1 Other_Breeder <dbl [10,000]>
7 22929 Other     1     1 Other_Breeder <dbl [10,000]>

df_2 %>% unnest(Result) %>% unnest(Result)
# A tibble: 70,000 x 6
      id c     month     n s             Result
   <dbl> <chr> <dbl> <dbl> <chr>          <dbl>
 1 10076 Other     1     1 Other_Breeder   4.64
 2 10076 Other     1     1 Other_Breeder -39.0 
 3 10076 Other     1     1 Other_Breeder 115.  
 4 10076 Other     1     1 Other_Breeder -46.2 
 5 10076 Other     1     1 Other_Breeder 362.  
 6 10076 Other     1     1 Other_Breeder  48.1 
 7 10076 Other     1     1 Other_Breeder  10.8 
 8 10076 Other     1     1 Other_Breeder 128.  
 9 10076 Other     1     1 Other_Breeder -10.8 
10 10076 Other     1     1 Other_Breeder -45.2 
# ... with 69,990 more rows

As to the grouping, though I don't think the grouping is needed inside a nested data.frame, it is also applicable (here the month variable only has one value, so only one group existed):

list_1 <- df %>% group_by(month) %>% mutate(Result = map(n, ~iter_n(.x))) %>% group_split()
list_1
<list_of<
  tbl_df<
    id    : double
    c     : character
    month : double
    n     : double
    s     : character
    Result: list
  >
>[1]>
[[1]]
# A tibble: 7 x 6
     id c     month     n s             Result        
  <dbl> <chr> <dbl> <dbl> <chr>         <list>        
1 10076 Other     1     1 Other_Breeder <dbl [10,000]>
2 10233 Other     1     1 Other_Breeder <dbl [10,000]>
3 15590 Other     1     1 Other_Breeder <dbl [10,000]>
4 20373 Other     1     1 Other_Breeder <dbl [10,000]>
5 21161 Other     1     1 Other_Breeder <dbl [10,000]>
6 22057 Other     1     1 Other_Breeder <dbl [10,000]>
7 22929 Other     1     1 Other_Breeder <dbl [10,000]>

About more detail and trick in dealing with list-column, I recommend you read vignettes_nest and tidyr cheatsheet the to learn more.

Hi @yifanliu thanks for the quick reply , How can I make sure that the list does not contain other lists ? Additionally, I do not understand why I get this large result column ..

rm(list = ls())

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
#> Error: RStudio not running
getwd()
#> [1] "C:/Users/Angela/AppData/Local/Temp/RtmpcxEkYr/reprex-33c7ddb6337-waspy-bunny"

#load required packages 
library(mc2d)
#> Loading required package: mvtnorm
#> 
#> Attaching package: 'mc2d'
#> The following objects are masked from 'package:base':
#> 
#>     pmax, pmin
library(gplots)
#> 
#> Attaching package: 'gplots'
#> The following object is masked from 'package:stats':
#> 
#>     lowess
library(RColorBrewer)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(reprex)
library(tidyverse)
set.seed(99)
iters<-1000

df<-data.frame(id=c(1:30),month=c(1:6,1,6,4,1,5,2,3,2,5,4,6,3:6,4:6,1:5,5),n=rpois(30,5))

df$n[df$n == "0"] <- 3
df<-as_tibble(df)
se<-rbeta(iters,96,6)
epi.a<-rpert(iters,min=1.5, mode=2, max=3)
p=0.2
p2=epi.a*p

# this defined function ensures any `n` from `df` will be itered with 10000 s and a and generated 10000 results
iter_n <- function(n) map2_dbl(.x = se, .y = p2, ~ 1 - (1 - .x * .y) ^ n)
list_1 <- df %>% group_by(month) %>% mutate(Result = map(n, ~iter_n(.x))) %>% group_split()
list_1[[1]]
#> # A tibble: 4 x 4
#>      id month     n Result       
#>   <int> <dbl> <dbl> <list>       
#> 1     1     1     5 <dbl [1,000]>
#> 2     7     1     6 <dbl [1,000]>
#> 3    10     1     3 <dbl [1,000]>
#> 4    25     1     7 <dbl [1,000]>

Created on 2022-05-05 by the reprex package (v2.0.1)

Moreover to be easier to deal with , my desired output for the result column is a list with six elements (one for each month) e.g.

iters<-1000
m1<-numeric(iters)
m2<-numeric(iters)
m3<-numeric(iters)
m4<-numeric(iters)
m5<-numeric(iters)
m6<-numeric(iters)

Result<-list(m1,m2,m3,m4,m5,m6)

Created on 2022-05-05 by the reprex package (v2.0.1)

Do you think it is possible to get something like that?

Just examine the output every time is the simplest way I think. Make sure your further treatment to the result matches its datatype.

it is also applicable as you pull out the Result column from each element of the list (here refers to each data.frame):

list_1 <- df %>% mutate(Result = map(n, ~iter_n(.x))) %>% unnest(Result) %>% 
  group_split(month) %>% map(~ .x %>% pull(Result))
list_1

notice that according to your sample data, var month is duplicated, so length for each element of list_1 is not equal (infected by the group size). If you want to keep sure the Result list has some equal-sized elements, directly pull out the list-column is enough, do this:

list_2 <- df %>% mutate(Result = map(n, ~iter_n(.x))) %>% pull(Result)
list_2

but because the list-column is also convenient to iterate for further application, I do advise you keep the nested data structure.

Thank you for the great support you are providing !!! :grinning:
I'll try the new code right away and let you know,
For the result column I'd like to avoid to get lists inside a list
more than make sure of the structure of the result column inspecting it .. how could I avoid to have multiple lists?

It depends on how you organize and generate the list-column. Basically, mutate(), transmute(), and summarise() will output a list-columns if they return a list.

In my sample code, map(~.x) function will give mutate() a list, map(~list(.x))will give mutate() lists inside a list. And you must be familiar with output of these functions to ensure the constructed data fits your request.

Thank you @yifanliu that was exactly what I was looking for !! :grinning:
I had a look at the map functions and I think that they are pretty much handy !!! I

I was trying to use the map_if function to do something more complex - I'd like to use the Result values per each month and iterate them with an external vector which should change according to a categorical variable. Thus I defined two different functions to be passed inside map_if but I am getting an error. The ideal would be also to create a new column in each dataframe of the list .. but I am not sure how to do that with "mutate" Would you mind to have a look at it as well? Thanks a lot :grinning:

rm(list = ls())

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
#> Error: RStudio not running
getwd()
#> [1] "C:/Users/Angela/AppData/Local/Temp/RtmpmyO29m/reprex-121c6470d4-fresh-hare"

#load required packages 
library(mc2d)
#> Loading required package: mvtnorm
#> 
#> Attaching package: 'mc2d'
#> The following objects are masked from 'package:base':
#> 
#>     pmax, pmin
library(gplots)
#> 
#> Attaching package: 'gplots'
#> The following object is masked from 'package:stats':
#> 
#>     lowess
library(RColorBrewer)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(reprex)
library(tidyverse)
set.seed(99)
iters<-1000

df<-data.frame(id=c(1:30),cat=c(rep("a",12),rep("b",18)),month=c(1:6,1,6,4,1,5,2,3,2,5,4,6,3:6,4:6,1:5,5),n=rpois(30,5))

df$n[df$n == "0"] <- 3
se<-rbeta(iters,96,6)
epi.a<-rpert(iters,min=1.5, mode=2, max=3)
p=0.2
p2=epi.a*p

df<-as_tibble(df)
# this defined function ensures any `n` from `df` will be itered with 10000 s and a and generated 10000 results
iter_n <- function(n) map2_dbl(.x = se, .y = p2, ~ 1 - (1 - .x * .y) ^ n)
list_1 <- df %>% mutate(Result = map(n, ~iter_n(.x))) %>% unnest(Result)%>% group_split(month)
list_1[[1]]
#> # A tibble: 4,000 x 5
#>       id cat   month     n Result
#>    <int> <chr> <dbl> <dbl>  <dbl>
#>  1     1 a         1     5  0.953
#>  2     1 a         1     5  0.927
#>  3     1 a         1     5  0.904
#>  4     1 a         1     5  0.945
#>  5     1 a         1     5  0.872
#>  6     1 a         1     5  0.840
#>  7     1 a         1     5  0.896
#>  8     1 a         1     5  0.944
#>  9     1 a         1     5  0.925
#> 10     1 a         1     5  0.937
#> # ... with 3,990 more rows

p3a=rbeta(iters,50,5)
p3b=rbeta(iters,40,6)

iter_n2a<-function(x) map2_dbl(.x=x$Result, .y=p3a,~ prod(1 - .x*.y))
iter_n2b<-function(x) map2_dbl(.x=x$Result, .y=p3b,~ prod(1 - .x*.y))

df<- df%>%map_if(any(x$cat=="a"),iter_n2a,
                 .else=iter_n2b)
#> Error in is_logical(.p): object 'x' not found

Created on 2022-05-06 by the reprex package (v2.0.1)

I'm glad to help.
First of all, I see some misuse of map functions here.
if you want to iterate one input a through another iterator b, you don't use map2, just use map(b, fun(b)) is enough. that is, rather than

iter_n2a<-function(x) map2_dbl(.x=x$Result, .y=p3a,~ prod(1 - .x*.y))
iter_n2b<-function(x) map2_dbl(.x=x$Result, .y=p3b,~ prod(1 - .x*.y))

you shall use

iter_n2a<-function(Result) map_dbl(p3a, ~ prod(1 - Result * .x))
iter_n2b<-function(Result) map_dbl(p3b, ~ prod(1 - Result * .x))

iter_n2b(Result = 0.953)
   [1] 0.17563131 0.19156393 0.12385743 0.19745062 0.13655432 0.22248734 0.23430213 0.17947047 0.12884342 0.11026780
  [11] 0.18387888 0.23159005 0.22353688 0.28381041 0.24876467 0.17686324 0.19072315 0.18973213 0.16937337 0.14828614
  [21] 0.15582794 0.14912805 0.14799621 0.14344385 0.29081321 0.14232417 0.16294083 0.33757760 0.14479506 0.12191899
  [31] 0.19440938 0.17250045 0.27338068 0.14655610 0.19021297 0.13943700 0.23221259 0.14165184 0.17178581 0.12928591
  [41] 0.22320997 0.13740555 0.14110546 0.19237594 0.28466046 0.12179116 0.12334132 0.10650499 0.13165283 0.12795890
  [51] 0.11198912 0.16168264 0.13668875 0.13176271 0.16758542 0.09255433 0.13907041 0.16740382 0.15140807 0.17921591
  [61] 0.14700704 0.14552124 0.18561961 0.14924755 0.30779619 0.15664556 0.11063055 0.09269329 0.11382821 0.18988574
...

the ~ .x stands for abbreviation of a function, if you are not familiar with this syntax, you can also write:

iter_n2b<-function(Result) map_dbl(p3b, function(p3b) prod(1 - Result * p3b))

then, the map_if detects the whether elements of a list (i.e., each column in a data.frame) should be iterated. I see here you want to apply distinct functions to some specific rows. here you should use if_else inside mutate as well:

list_2 <- list_1[[1]] %>% mutate(n_p = if_else(cat == "a",
                                               map(Result,  ~ iter_n2a(.x)),
                                               map(Result,  ~ iter_n2b(.x))))

list_2
# A tibble: 4,000 x 6
      id cat   month     n Result n_p          
   <int> <chr> <dbl> <dbl>  <dbl> <list>       
 1     1 a         1     8  0.968 <dbl [1,000]>
 2     1 a         1     8  0.976 <dbl [1,000]>
 3     1 a         1     8  0.969 <dbl [1,000]>
 4     1 a         1     8  0.967 <dbl [1,000]>
 5     1 a         1     8  0.961 <dbl [1,000]>
 6     1 a         1     8  0.983 <dbl [1,000]>
 7     1 a         1     8  0.977 <dbl [1,000]>
 8     1 a         1     8  0.961 <dbl [1,000]>
 9     1 a         1     8  0.953 <dbl [1,000]>
10     1 a         1     8  0.945 <dbl [1,000]>
# ... with 3,990 more rows

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.