Category-wise permutation test of independence with purrr map()?

qaz · September 28, 2022, 3:09am

Hello,

I am trying to conduct permutation test by category. In the example I am only able to do it for one category at a time.
Instead of manually doing it for each category by filtering how can I do so for all categories?
My end goal is to have a data frame with the category, the observed difference and p-values for each test.

Example:

# load packages
library('coin')
library('tidyverse')
library('purrr')
library('broom')

# set seed
set.seed(2000)

# generate data
d = tibble(value = rnorm(100),
           category = sample(1:5, replace = TRUE, 100),
           group = sample(c('ctrl', 'treat'), replace = TRUE, 100)) %>% 
    arrange(category)
# view data
d

# filter by category for category-wise permutation test of independence
d2 <- d %>% filter(category == 1)

# independence test using coin package
independence_test(value ~ as.factor(group), data = d2, distribution = approximate())

Output:

> # load packages
> library('coin')
> library('tidyverse')
> library('purrr')
> library('broom')
> 
> # set seed
> set.seed(2000)
> 
> # generate data
> d = tibble(value = rnorm(100),
+            category = sample(1:5, replace = TRUE, 100),
+            group = sample(c('ctrl', 'treat'), replace = TRUE, 100)) %>% 
+     arrange(category)
> # view data
> d
# A tibble: 100 × 3
    value category group
    <dbl>    <int> <chr>
 1 -0.353        1 treat
 2  0.916        1 ctrl 
 3 -0.192        1 ctrl 
 4 -1.17         1 treat
 5 -0.207        1 ctrl 
 6  0.374        1 treat
 7 -0.404        1 ctrl 
 8  1.55         1 treat
 9 -1.18         1 treat
10 -0.277        1 treat
# … with 90 more rows
# ℹ Use `print(n = ...)` to see more rows
> 
> # filter by category for category-wise permutation test of independence
> d2 <- d %>% filter(category == 1)
> 
> # independence test using coin package
> independence_test(value ~ as.factor(group), data = d2, distribution = approximate())

	Approximative General Independence Test

data:  value by as.factor(group) (ctrl, treat)
Z = -0.44373, p-value = 0.6768
alternative hypothesis: two.sided

At the moment I have some notion that I should be nesting the data by category and then mapping the coin::independence_test with purr:

# nest data by category
d2 <- d %>%
    group_by(category) %>% 
    nest() 

d2

Output

> d2
# A tibble: 5 × 2
# Groups:   category [5]
  category data             
     <int> <list>           
1        1 <tibble [16 × 2]>
2        2 <tibble [26 × 2]>
3        3 <tibble [22 × 2]>
4        4 <tibble [15 × 2]>
5        5 <tibble [21 × 2]>

However, I am having trouble with mapping the inputs required by coin::independence_test to map()

I would appreciate any help and insights into what I'm missing.

Edit: forgot to mention I am performing a t.test on the same data as below:

# Welch t.test
d3 <- d %>%
    group_by(category, group) %>% 
    nest() %>% 
    pivot_wider(names_from = group, values_from = data) %>% 
    # depreciated: spread(key = treatment, value = data) %>% 
    mutate(
        t_test = map2(ctrl, treat, ~{t.test(.x$value, .y$value) %>% tidy()}),
        ctrl = map(ctrl, nrow),
        treat = map(treat, nrow)
    ) %>% 
    unnest(cols = c(ctrl, treat, t_test))

d3

Ouput

> # Welch t.test
> d3 <- d %>%
+     group_by(category, group) %>% 
+     nest() %>% 
+     pivot_wider(names_from = group, values_from = data) %>% 
+     # depreciated: spread(key = treatment, value = data) %>% 
+     mutate(
+         t_test = map2(ctrl, treat, ~{t.test(.x$value, .y$value) %>% tidy()}),
+         ctrl = map(ctrl, nrow),
+         treat = map(treat, nrow)
+     ) %>% 
+     unnest(cols = c(ctrl, treat, t_test))
> 
> d3
# A tibble: 5 × 13
# Groups:   category [5]
  category treat  ctrl estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method              alter…¹
     <int> <int> <int>    <dbl>     <dbl>     <dbl>     <dbl>   <dbl>     <dbl>    <dbl>     <dbl> <chr>               <chr>  
1        1    10     6   -0.169   -0.193    -0.0245    -0.475   0.642     13.6    -0.934     0.596 Welch Two Sample t… two.si…
2        2    13    13    0.272    0.682     0.409      0.873   0.392     22.9    -0.374     0.918 Welch Two Sample t… two.si…
3        3    11    11   -0.478   -0.568    -0.0895    -1.22    0.238     20.0    -1.30      0.342 Welch Two Sample t… two.si…
4        4     4    11   -0.922   -0.359     0.563     -1.23    0.282      4.16   -2.97      1.12  Welch Two Sample t… two.si…
5        5    15     6    0.281    0.0919   -0.190      0.696   0.498     14.0    -0.585     1.15  Welch Two Sample t… two.si…
# … with abbreviated variable name ¹alternative

EDIT:

Editing this post to add in my eventual solution (thank you to everyone for the discussion in the answers) as the replies have been locked:

d3 <- d %>%
        group_by(category) %>%
        nest() %>% 
        mutate(
                perm_test = map(.x = data, .f = ~pvalue(independence_test(.x$value ~ as.factor(.x$group), data = data)))
        ) %>% 
        unnest(cols = c(category, perm_test))


View(d3)

nirgrahamuk · September 28, 2022, 9:06am

to do 5 times, rather than once for 1 you can do

map(1:5, ~ independence_test(value ~ as.factor(group),
                  data = d %>% filter(category == .x),
                  distribution = approximate()))

the category selection by filtering is the changing part, so this is substituted in the map from the 1:5 inputs via the .x shortcut.

system · October 5, 2022, 9:07am

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.