Iteration through list of dataframes (lapply / map)

Javier9 · October 20, 2022, 3:17pm

First of all, here is the class of my data frames before going any further:

class(group_1)
[1] "grouped_df" "tbl_df"     "tbl"        "data.frame"

Below are three samples of my datasets, pasted as `dput()` output. Note that I UNGROUPED the data before sampling.

g1 <- structure(list(id = c(110104019, 120715032, 120715020, 50203029, 
70111022, 140102087, 120715020, 120715033, 140102088, 110113007, 
120715029, 111201026, 110110005, 120715026, 110104028, 140103029, 
110110005, 50527001, 111201026, 130108008), sexo = c(1, 1, 1, 
0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1), grup_int = c(1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), gen = c("pcsk9", 
"abca1", "s18", "il10", "il8ra", "ccl3", "il10", "s18", "nr1h2", 
"cyp27a1", "lag3", "ppard", "lag3", "ptgs1", "nr1h3", "abcg1", 
"cyp27a1", "scarb1", "vcam1", "mcp1"), time = c("1", "1", "3", 
"3", "3", "3", "3", "3", "1", "1", "3", "3", "3", "3", "3", "3", 
"3", "1", "3", "3"), Ct = c(NA, 17.781, 10.964, 24.152, 16.463, 
22.747, 23.815, 10.809, 19.229, 18.642, 18.803, 16.583, 21.288, 
16.209, 21.11, 19.298, 17.278, 20.491, 23.792, 23.226), dCt = c(NA, 
5.907, -1.151, 12.796, 3.965, 10.522, 11.7, -1.544, 5.867, 5.97, 
7.252, 5.122, 9.234, 5.412, 9.244, 7.361, 5.224, 8.201, 12.331, 
11.859), RQ = c(NA, 2.356, 0.936, 1.244, 0.464, 1.28, 1.537, 
1.256, 0.486, 1.159, 2.176, 1.244, 1.462, 1.486, 1.344, 0.662, 
1.865, 1.376, 2.358, 1.035)), row.names = c(NA, -20L), class = c("tbl_df", 
"tbl", "data.frame"))

# Sample of 20 rows from group 2 (grup_int == 2).
# Generated with:
#   group_2 %>% dplyr::ungroup() %>%
#     dplyr::select(id, sexo, grup_int, gen, time, Ct, dCt, RQ) %>%
#     slice_sample(n = 20, replace = FALSE) %>% ungroup() %>% dput()
g2 <- structure(list(id = c(110606056, 110104017, 60901024, 110113008,
110113008, 140103036, 110104017, 50705001, 110104027, 50109026,
110104029, 110104024, 50203022, 110104017, 140103035, 140103035,
130106037, 130102010, 50203022, 110606056), sexo = c(1, 1, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1), grup_int = c(2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), gen = c("ldlr",
"tnf", "pparg", "ptgs1", "cxcl8_il8_", "mcp1", "il1b", "ptgs1",
"cyp27a1", "ccl3", "cxcl8_il8_", "ccl3", "gapd", "cd40l", "lag3",
"ptgs2", "pcsk9", "ccl3", "slc2a3_glut3_", "nr1h3"), time = c("3",
"1", "3", "1", "3", "1", "3", "1", "1", "1", "1", "1", "3", "3",
"3", "1", "3", "1", "1", "1"), Ct = c(18.842, 18.926, 23.787,
18.165, 15.629, 24.504, 16.084, 17.259, 19.718, 22.88, 15.522,
21.688, 16.776, 17.413, 18.828, 17.398, NA, 20.422, 15.503, 20.641
), dCt = c(6.749, 6.537, 12.334, 6.027, 2.92, NA, 3.973, 5.619,
5.988, 10.377, 4.101, 8.885, 4.207, 5.302, 7.953, 5.974, NA,
8.827, 2.776, NA), RQ = c(NA, 1.188, 0.7, 2.065, 1.589, NA, 1.597,
1.6, 3.858, 0.99, 0.845, 2, 1.293, 1.135, 2.542, 1.067, NA, 1.218,
2.86, NA)), row.names = c(NA, -20L), class = c("tbl_df", "tbl",
"data.frame"))

g3 <- structure(list(id = c(130106034, 60901027, 60901035, 130106034, 
50203013, 50430001, 50109025, 50203012, 50203006, 60901027, 130106034, 
50203019, 50203014, 120715012, 50109019, 140103019, 140102090, 
140103019, 110104023, 60901031), sexo = c(1, 1, 0, 1, 0, 0, 1, 
1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1), grup_int = c(3, 3, 3, 
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), gen = c("nr1h2", 
"cd86", "cd86", "adrb2", "il8ra", "ldlr", "mcp1", "chuk", "olr1", 
"nampt", "s1pr3", "ido", "lrp1", "cxcl2", "ptgs2", "cxcl2", "ppard", 
"gapd", "ptgs1", "pcsk9"), time = c("1", "1", "1", "1", "3", 
"3", "1", "1", "1", "1", "1", "1", "3", "3", "3", "3", "3", "1", 
"3", "1"), Ct = c(18.987, 18.701, 17.983, 17.064, 14.62, 19.71, 
24.713, 17.419, 25.328, 12.745, 21.264, 19.5, 17.102, 22.794, 
18.162, 23.335, 17.332, 15.329, 17.895, NA), dCt = c(5.995, 6.199, 
5.644, 4.072, 3.298, 6.907, 12.762, 5.607, 12.525, 0.243, 8.272, 
7.451, 5.007, 11.107, 5.3, 11.513, 3.353, 4.113, 5.42, NA), RQ = c(0.858, 
1.931, 1.235, 0.885, 1.013, 1.52, 1.05, 1.263, 1.538, 0.822, 
0.122, 1.209, 2.202, 1.321, 1.277, 2.002, 4.713, 1.198, 6.309, 
NA)), row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"
))

Intermediate step, which is vital for what follows: creating the list of data frames. (I am going to go with the 1st approach.)

# 1st approach: a list whose three elements are the data frames themselves
list_1 <- list(g1, g2, g3)

# 2nd approach: c() concatenates the data frames' columns into one flat list,
# so the per-data-frame structure is lost
g_all <- c(g1, g2, g3)

This is what I want to achieve, shown here for a single data frame:

# Per-gene summary statistics of dCt and RQ for a single data frame,
# computed separately for each time point.
# The split factor must come from g3 itself so that it has one value per row
# of the data being split.
descriptive_g3 <- lapply(split(g3, g3$time), function(x) {
  x %>%
    group_by(gen) %>%
    get_summary_stats(
      c(dCt, RQ),
      show = c("mean", "sd", "median", "iqr", "min", "max")
    )
})

I would like to apply the same summary to every data frame in the list (grouping by gen). I've tried:

# With lapply: per-gene summary of dCt and RQ for each data frame in the list
lapply(list_1, function(df) {
  by_gene <- group_by(df, gen)
  get_summary_stats(
    by_gene,
    c(dCt, RQ),
    show = c("mean", "sd", "median", "iqr", "min", "max")
  )
})

# or, with base R: per-gene statistics of dCt for each data frame in the list.
# The formula is `value ~ grouping`, and FUN must be a function; here it
# returns a named vector so every statistic ends up in the result.
lapply(list_1, function(x) {
  aggregate(
    dCt ~ gen,
    data = x,
    FUN = function(v) {
      c(
        mean = mean(v), sd = sd(v), median = median(v),
        iqr = IQR(v), min = min(v), max = max(v)
      )
    }
  )
})


# With map: group each data frame by gene, then summarise dCt and RQ.
# get_summary_stats() needs the (grouped) data frame as its first argument and
# the columns as a selection, not a vector of extracted values.
list_1 %>%
  map(~ group_by(.x, gen)) %>%
  map(~ get_summary_stats(
    .x,
    c(dCt, RQ),
    show = c("mean", "sd", "median", "iqr", "min", "max")
  ))

I expect to generate tibbles or data frames; if the final result is a list whose elements I cannot easily access, it will be hard for me to export them.

Thank you for help!

FJCC · October 20, 2022, 4:53pm

I am not sure what you want for the final output. Here are two versions. The first produces a list of six data frames that summarise the original three data frames at the two time values, 1 and 3. The second version places those six data frames in the global environment.

library(dplyr)
library(rstatix)
library(purrr)
# Recreate the three sample data frames posted in the question
g1 <- structure(list(id = c(110104019, 120715032, 120715020, 50203029, 
                            70111022, 140102087, 120715020, 120715033, 140102088, 110113007, 
                            120715029, 111201026, 110110005, 120715026, 110104028, 140103029, 
                            110110005, 50527001, 111201026, 130108008), 
                     sexo = c(1, 1, 1,0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1), 
                     grup_int = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 
                     gen = c("pcsk9", "abca1", "s18", "il10", "il8ra", "ccl3", "il10", "s18", "nr1h2", 
                                      "cyp27a1", "lag3", "ppard", "lag3", "ptgs1", "nr1h3", "abcg1", 
                                      "cyp27a1", "scarb1", "vcam1", "mcp1"), 
                     time = c("1", "1", "3", "3", "3", "3", "3", "3", "1", "1", "3", "3", "3", "3", "3", "3", 
                              "3", "1", "3", "3"), 
                     Ct = c(NA, 17.781, 10.964, 24.152, 16.463, 22.747, 23.815, 10.809, 19.229, 18.642, 
                            18.803, 16.583, 21.288, 16.209, 21.11, 19.298, 17.278, 20.491, 23.792, 23.226), 
                     dCt = c(NA, 5.907, -1.151, 12.796, 3.965, 10.522, 11.7, -1.544, 5.867, 5.97, 
                             7.252, 5.122, 9.234, 5.412, 9.244, 7.361, 5.224, 8.201, 12.331, 
                             11.859), 
                     RQ = c(NA, 2.356, 0.936, 1.244, 0.464, 1.28, 1.537, 1.256, 0.486, 1.159, 2.176, 1.244, 
                            1.462, 1.486, 1.344, 0.662, 1.865, 1.376, 2.358, 1.035)), 
                row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"))
g2 <- structure(list(id = c(110606056, 110104017, 60901024, 110113008, 
                            110113008, 140103036, 110104017, 50705001, 110104027, 50109026, 
                            110104029, 110104024, 50203022, 110104017, 140103035, 140103035, 
                            130106037, 130102010, 50203022, 110606056), 
                     sexo = c(1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1), 
                     grup_int = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), 
                     gen = c("ldlr", "tnf", "pparg", "ptgs1", "cxcl8_il8_", "mcp1", "il1b", "ptgs1", 
                             "cyp27a1", "ccl3", "cxcl8_il8_", "ccl3", "gapd", "cd40l", "lag3", 
                             "ptgs2", "pcsk9", "ccl3", "slc2a3_glut3_", "nr1h3"), 
                     time = c("3", "1", "3", "1", "3", "1", "3", "1", "1", "1", "1", "1", "3", "3", 
                              "3", "1", "3", "1", "1", "1"), 
                     Ct = c(18.842, 18.926, 23.787, 18.165, 15.629, 24.504, 16.084, 17.259, 19.718, 22.88, 15.522, 
                            21.688, 16.776, 17.413, 18.828, 17.398, NA, 20.422, 15.503, 20.641), 
                     dCt = c(6.749, 6.537, 12.334, 6.027, 2.92, NA, 3.973, 5.619, 5.988, 10.377, 4.101, 8.885, 4.207, 
                             5.302, 7.953, 5.974, NA, 8.827, 2.776, NA), 
                     RQ = c(NA, 1.188, 0.7, 2.065, 1.589, NA, 1.597, 1.6, 3.858, 0.99, 0.845, 2, 1.293, 1.135, 
                            2.542, 1.067, NA, 1.218, 
                            2.86, NA)), 
                row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"))
g3 <- structure(list(id = c(130106034, 60901027, 60901035, 130106034, 
                            50203013, 50430001, 50109025, 50203012, 50203006, 60901027, 130106034, 
                            50203019, 50203014, 120715012, 50109019, 140103019, 140102090, 
                            140103019, 110104023, 60901031), 
                     sexo = c(1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1), 
                     grup_int = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), 
                     gen = c("nr1h2", "cd86", "cd86", "adrb2", "il8ra", "ldlr", "mcp1", "chuk", "olr1",
                             "nampt", "s1pr3", "ido", "lrp1", "cxcl2", "ptgs2", "cxcl2", "ppard", 
                             "gapd", "ptgs1", "pcsk9"), 
                     time = c("1", "1", "1", "1", "3", "3", "1", "1", "1", "1", "1", "1", "3", "3", "3", 
                              "3", "3", "1", "3", "1"), 
                     Ct = c(18.987, 18.701, 17.983, 17.064, 14.62, 19.71, 24.713, 17.419, 25.328, 12.745, 
                            21.264, 19.5, 17.102, 22.794, 18.162, 23.335, 17.332, 15.329, 17.895, NA), 
                     dCt = c(5.995, 6.199, 5.644, 4.072, 3.298, 6.907, 12.762, 5.607, 12.525, 0.243, 8.272, 
                             7.451, 5.007, 11.107, 5.3, 11.513, 3.353, 4.113, 5.42, NA), 
                     RQ = c(0.858, 1.931, 1.235, 0.885, 1.013, 1.52, 1.05, 1.263, 1.538, 0.822, 
                            0.122, 1.209, 2.202, 1.321, 1.277, 2.002, 4.713, 1.198, 6.309, 
                            NA)), 
                row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"))
# Named list of the three data frames; the names become the "DF" id later on
list_1 <- list(g1 = g1, g2 = g2, g3 = g3)


# Summarise dCt and RQ per gene for every data frame / time combination.
#
# L: a list of data frames, each containing the columns `time`, `gen`,
#    `dCt` and `RQ`. The list names (or positions, if unnamed) are stored in
#    a "DF" column when the data frames are stacked.
#
# Returns a list of summary data frames, one per combination of DF and time,
# named "<DF>.<time>".
SummaryFunc <- function(L) {
  stat_names <- c("mean", "sd", "median", "iqr", "min", "max")
  stacked <- bind_rows(L, .id = "DF")
  by_gene <- group_by(stacked, DF, time, gen)
  stats <- get_summary_stats(by_gene, c(dCt, RQ), show = stat_names)
  split(stats, list(stats$DF, stats$time))
}
      
OUT <- SummaryFunc(list_1) 
names(OUT)
#> [1] "g1.1" "g2.1" "g3.1" "g1.3" "g2.3" "g3.3"

# Summarise dCt and RQ per gene for every data frame / time combination and
# bind each resulting summary to a variable named "<DF>.<time>".
#
# L:     a list of data frames, each containing the columns `time`, `gen`,
#        `dCt` and `RQ`.
# envir: environment that receives the summary data frames. Defaults to the
#        global environment, so existing calls behave as before; pass another
#        environment to avoid creating global variables.
#
# Called for its side effect; invisibly returns the names that were assigned.
SummaryFunc2 <- function(L, envir = .GlobalEnv) {
  stopifnot(is.environment(envir))
  tmp <- bind_rows(L, .id = "DF") |>
    group_by(DF, time, gen) |>
    get_summary_stats(
      c(dCt, RQ),
      show = c("mean", "sd", "median", "iqr", "min", "max")
    )
  tmp2 <- split(tmp, list(tmp$DF, tmp$time))
  DF_names <- names(tmp2)
  # walk() is used for the assignment side effect only
  walk(DF_names, ~ assign(.x, tmp2[[.x]], envir = envir))
}

SummaryFunc2(list_1)

^{Created on 2022-10-20 with reprex v2.0.2}

Javier9 · October 21, 2022, 9:09am

FJCC:

     
OUT <- SummaryFunc(list_1) 
names(OUT)
#> [1] "g1.1" "g2.1" "g3.1" "g1.3" "g2.3" "g3.3"

# Summarise dCt and RQ per gene for every data frame / time combination and
# bind each resulting summary to a variable named "<DF>.<time>".
#
# L:     a list of data frames, each containing the columns `time`, `gen`,
#        `dCt` and `RQ`.
# envir: environment that receives the summary data frames. Defaults to the
#        global environment, so existing calls behave as before; pass another
#        environment to avoid creating global variables.
#
# Called for its side effect; invisibly returns the names that were assigned.
SummaryFunc2 <- function(L, envir = .GlobalEnv) {
  stopifnot(is.environment(envir))
  tmp <- bind_rows(L, .id = "DF") |>
    group_by(DF, time, gen) |>
    get_summary_stats(
      c(dCt, RQ),
      show = c("mean", "sd", "median", "iqr", "min", "max")
    )
  tmp2 <- split(tmp, list(tmp$DF, tmp$time))
  DF_names <- names(tmp2)
  # walk() is used for the assignment side effect only
  walk(DF_names, ~ assign(.x, tmp2[[.x]], envir = envir))
}

SummaryFunc2(list_1)

I was looking for something like this. Thank you

system · October 28, 2022, 9:09am

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.