Proportional Allocation Sampling using Dplyr package in R for big data frame

I have a big data set with 10525 rows with 44 different categories:


var   = c(rep("a",440), rep("b",255) ,rep("c",333),rep("d",47) ,rep("e",159),rep("f",67) ,rep("g",133),
          rep("h",342), rep("i",131) ,rep("j",606),rep("k",129),rep("l",126),rep("m",155),rep("n",62),
          rep("o",616), rep("p", 173),rep("q",430),rep("r",2)  ,rep("s",453),
          rep("t",154), rep("v",145),rep("u", 307),rep("w",233),rep("x",315),rep("y",65),rep("z",159),
          rep("aa",758),rep("ab",307),rep("ac", 413),rep("ad",184),rep("ae",334),rep("af",111),rep("ag",175),
          rep("ah", 262),rep("ai",309),rep("aj",71),rep("ak",35),rep("al",302),
          rep("am",266), rep("an",36),rep("ao",47),rep("ap",415),rep("aq",204),rep("ar",259))
value = rnorm(10525)
dat = tibble(var,value)

Now I want to make proportional allocation sampling i.e I want to sample exactly the number of subsamples as has been calculated in the table below in the column Ni for the corresponding category (group).

d1=dat%>%
  group_by(var)%>%
  summarise(N = n())

d2=dat%>%
  group_by(var)%>%
  summarise(w = n()/nrow(.))

A = left_join(d1,d2,by="var")%>%
+   mutate(Ni = round(N*w));A
# A tibble: 44 × 4
   var       N      w    Ni
   <chr> <int>  <dbl> <dbl>
 1 a       440 0.0418    18
 2 aa      758 0.0720    55
 3 ab      307 0.0292     9
 4 ac      413 0.0392    16
 5 ad      184 0.0175     3
 6 ae      334 0.0317    11
 7 af      111 0.0105     1
 8 ag      175 0.0166     3
 9 ah      262 0.0249     7
10 ai      309 0.0294     9
# … with 34 more rows

The theoretically correct total sample size must be: 354

sum(A$Ni)
[1] 354

2 questions:

1) How I can do that in R ?

2) How I can put a constrain that if the subsample (Ni) is 0 to take 1 observation ?

Any help? I would appreciate it .

My effort

a1 = dat %>% 
  left_join(A %>% mutate(w = n()/nrow(.), w = if_else(w <= 0.009, 1, w)) )%>%
  slice_sample(n = sum(A$Ni), weight_by = w)%>%
  select(c(var,Ni))%>%
  group_by(var)%>%
  summarise(n());a1
Joining, by = "var"
# A tibble: 41 × 2
   var   `n()`
   <chr> <int>
 1 a        17
 2 aa       21
 3 ab       11
 4 ac       11
 5 ad        7
 6 ae       10
 7 af        4
 8 ag        3
 9 ah       11
10 ai        4
# … with 31 more rows

but it should be all the 44 groups.
Or

A = left_join(d1,d2,by="var")%>%
  mutate(Ni = round(N*w))%>%
  mutate(x = replace(Ni,Ni==0,1));A
sum(A$x)
print(tibble(A),n=44)

a1 = dat %>% 
  left_join(A %>% mutate(w = n()/nrow(.), w = if_else(w <= 0.009, 1, w)) )%>%
  slice_sample(n = sum(A$x), weight_by = w)%>%
  select(c(var,Ni))%>%
  group_by(var)%>%
  summarise(n())
print(tibble(a1),n=44)

but again does not sample from all the groups.

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.