Suppose I have the following data:
# Toy data: one row per gender-by-age combination, with a numeric N and a
# randomly drawn y_hat for each row.
example_df <- data.frame(
  gender = c("M", "M", "F", "F"),
  age    = c("O", "Y", "O", "Y"),
  N      = c(2, 3, 4, 5),
  y_hat  = rnorm(4)
)
I want to create a data frame with two columns:
- The first should indicate the value of gender or age
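- The second should be the poststratification estimate, defined for each subgroup $s$ by $\hat{y}^{PS}_s = \frac{\sum_{j \in s} N_j \, \hat{y}_j}{\sum_{j \in s} N_j}$, where the sums run over the rows $j$ that belong to subgroup $s$.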
In this simple example I can do it by hand for each of the four values (M, F, O, Y), but I'm trying to figure out a programmatic way of doing this when there are more columns and the columns have more levels. Is there a tidy way to do this? Right now this is how I'm doing it:
library(tidyr)
library(dplyr)
# Fix the RNG seed so the y_hat draws below are reproducible.
set.seed(7982)

# Toy data: one row per gender-by-age combination, with a numeric N and a
# randomly drawn y_hat for each row.
example_df <- data.frame(
  gender = c("M", "M", "F", "F"),
  age    = c("O", "Y", "O", "Y"),
  N      = c(2, 3, 4, 5),
  y_hat  = rnorm(4)
)
#' Poststratified estimate for one subgroup of one column
#'
#' Keeps the rows of `data` whose `colName` column equals `subgroup` and
#' computes `sum(N * y_hat) / sum(N)` over those rows.
#'
#' @param colName String naming the column of `data` to filter on.
#' @param subgroup Value of that column that identifies the subgroup.
#' @param data Data frame containing the column named by `colName` plus the
#'   columns `N` and `y_hat`.
#' @return A list with elements `subgroup` (the value passed in) and
#'   `y_hat_ps` (the estimate). If no row matches, the estimate is `NaN`
#'   because it is computed as `0 / 0`.
get_y_hat_ps_for_colX_subgroupY <- function(colName, subgroup, data) {
  # The `.data` pronoun only ever looks inside the data frame, so a misspelled
  # column name raises an error instead of silently picking up a same-named
  # variable from the calling environment.
  y_hat_ps <- data %>%
    filter(.data[[colName]] == subgroup) %>%
    summarise(estimate = sum(.data$N * .data$y_hat) / sum(.data$N)) %>%
    pull("estimate")

  list(subgroup = subgroup, y_hat_ps = y_hat_ps)
}
#' Poststratified estimates for every subgroup of one column
#'
#' Collects the distinct values of the `colName` column of `data` (as
#' character, in order of first appearance) and computes one estimate per
#' value with `get_y_hat_ps_for_colX_subgroupY()`.
#'
#' @param colName String naming the column of `data` whose values define the
#'   subgroups.
#' @param data Data frame containing that column plus the columns the
#'   per-subgroup helper needs.
#' @return The per-subgroup results row-bound with `dplyr::bind_rows()`: one
#'   row per subgroup, with columns `subgroup` and `y_hat_ps`.
get_y_hat_ps_for_colName <- function(colName, data) {
  # Take the subgroups from the `data` argument, not from a global object, so
  # the function works on whatever data frame the caller supplies.
  subgroups <- unique(as.character(data[[colName]]))

  per_subgroup <- purrr::map(
    .x = subgroups,
    .f = function(subgroup) {
      get_y_hat_ps_for_colX_subgroupY(
        colName = colName,
        subgroup = subgroup,
        data = data
      )
    }
  )

  dplyr::bind_rows(per_subgroup)
}
#' Poststratified estimates for every subgroup of every grouping column
#'
#' Every column of `data` other than `N` and `y_hat` is treated as a grouping
#' column. For each one, `get_y_hat_ps_for_colName()` computes one estimate
#' per distinct value, and the results are row-bound together.
#'
#' @param data Data frame with the grouping columns, a count column and an
#'   estimate column.
#' @param N Unquoted name of the count column.
#' @param y_hat Unquoted name of the estimate column.
#' @return The combined results: one row per (column, subgroup) pair, with
#'   columns `subgroup` and `y_hat_ps`.
get_y_hat_ps <- function(data, N, y_hat) {
  N <- enquo(N)
  y_hat <- enquo(y_hat)

  # The helpers refer to the count and estimate columns by the literal names
  # `N` and `y_hat`, so rename the columns the caller selected to those names.
  # This is a no-op when they are already called `N` and `y_hat`.
  data <- data %>% rename(N = !!N, y_hat = !!y_hat)

  group_cols <- setdiff(names(data), c("N", "y_hat"))

  # Pass the `data` argument through (not a global data frame) so the result
  # reflects the caller's input.
  per_column <- purrr::map(
    .x = group_cols,
    .f = function(col_name) {
      get_y_hat_ps_for_colName(colName = col_name, data = data)
    }
  )

  dplyr::bind_rows(per_column)
}
# Compute and print the estimates for every level of every grouping column.
example_df %>%
  get_y_hat_ps(N = N, y_hat = y_hat)
# A tibble: 4 x 2
subgroup y_hat_ps
<chr> <dbl>
1 M 0.432
2 F 0.0179
3 O 0.365
4 Y 0.0167