Data wrangling for postratification ?

Suppose I have the following data:

example_df <- data.frame(gender = c("M", "M", "F", "F"),
                         age = c("O","Y","O","Y"),
                         N = c(2,3,4,5),
                         y_hat = rnorm(4))

I want to create a data frame with two columns:

  • The first should indicate the value of gender or age
  • The second should be the postratification estimate defined by:

In this simple example I can do it by hand for each of the four values (M,F,O,Y), but I'm trying to figure out a programmatic way of doing this if you have more columns and the columns have more levels. Is there a tidy way to do this??? Right now this is how I'm doing it:

library(tidyr)
library(dplyr)
set.seed(7982)

example_df <- data.frame(gender = c("M", "M", "F", "F"),
                         age = c("O","Y","O","Y"),
                         N = c(2,3,4,5),
                         y_hat = rnorm(4))


get_y_hat_ps_for_colX_subgroupY <- function(colName, subgroup, data){
  
  y_hat_ps <- data %>% 
    filter(!!as.name(colName)==subgroup) %>% 
    mutate(N_times_y_hat = N*y_hat) %>% 
    summarise(y_hat_M = sum(N_times_y_hat)/sum(N)) %>% 
    pull()
  return(list(subgroup = subgroup, y_hat_ps = y_hat_ps))
}

get_y_hat_ps_for_colName <- function(colName, data){
  subgroups <- example_df %>% select(!!as.name(colName)) %>% distinct() %>% pull() %>% as.character()
  y_hat_ps <- purrr::map(.x = subgroups, .f = ~ get_y_hat_ps_for_colX_subgroupY(colName = colName, subgroup = .x, data = data))
  return(dplyr::bind_rows(y_hat_ps))
}

get_y_hat_ps <- function(data, N, y_hat){
  # browser()
  N <- enquo(N)
  y_hat <- enquo(y_hat)
  colNames <- data %>% select(-!!N, -!!y_hat) %>% names()
  df <- purrr::map(.x = colNames, .f = ~ get_y_hat_ps_for_colName(colName = .x, data = example_df)) %>% 
    dplyr::bind_rows()
  return(df)
  
}

get_y_hat_ps(data = example_df, N = N, y_hat = y_hat)

# A tibble: 4 x 2
  subgroup y_hat_ps
  <chr>       <dbl>
1 M          0.432 
2 F          0.0179
3 O          0.365 
4 Y          0.0167

1 Like

I suspect there is a better way, but this seems to get the answer.

library(tidyr)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
set.seed(7982)

example_df <- data.frame(gender = c("M", "M", "F", "F"),
                         age = c("O","Y","O","Y"),
                         N = c(2,3,4,5),
                         y_hat = rnorm(4))
dfGath <- example_df %>% gather(key = SubGrp, value =  Label, gender, age)
#> Warning: attributes are not identical across measure variables;
#> they will be dropped
dfGath
#>   N      y_hat SubGrp Label
#> 1 2 -0.2137777 gender     M
#> 2 3  0.8628570 gender     M
#> 3 4  0.6539257 gender     F
#> 4 5 -0.4909430 gender     F
#> 5 2 -0.2137777    age     O
#> 6 3  0.8628570    age     Y
#> 7 4  0.6539257    age     O
#> 8 5 -0.4909430    age     Y

OutPut <- dfGath %>% mutate(Ny = N * y_hat) %>%   group_by(Label) %>%
  summarize(y_hat_ps = sum(Ny)/sum(N))
OutPut
#> # A tibble: 4 x 2
#>   Label y_hat_ps
#>   <chr>    <dbl>
#> 1 F       0.0179
#> 2 M       0.432 
#> 3 O       0.365 
#> 4 Y       0.0167

Created on 2019-03-21 by the reprex package (v0.2.1)

You can shorten it a bit by avoiding the intermediate mutate:

example_df %>% 
  gather(key, subgroup, gender, age) %>% 
  group_by(subgroup) %>% 
  summarise(y_hat_ps = sum(N*y_hat)/sum(N))
1 Like

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.