Pivoting Summary Tables in R

I am working with the R programming language.

I have the following dataset:

set.seed(123)
gender <- c("Male","Female")
gender <- sample(gender, 5000, replace=TRUE, prob=c(0.45, 0.55))
gender <- as.factor(gender)


status <- c("Immigrant","Citizen")
status <- sample(status, 5000, replace=TRUE, prob=c(0.3, 0.7))
status  <- as.factor(status )


country <- c("A", "B", "C", "D")
country <- sample(country, 5000, replace=TRUE, prob=c(0.25, 0.25, 0.25, 0.25))
country  <- as.factor(country)



################

disease <- c("Yes","No")
disease <- sample(disease, 5000, replace=TRUE, prob=c(0.4, 0.6))
disease <- as.factor(disease)

###################
my_data = data.frame(gender, status, disease, country)

I want to find out the relative percentage of each unique group of factors that have the disease vs do not have the disease.

As an example:

  • What percentage of Male Immigrants from Country A have the disease vs don't have the disease
  • What percentage of Male Citizens from Country A have the disease vs don't have the disease (both these percentages should add to 1)
  • etc.

I tried to do this with the following code:

library(dplyr)

step_1 = my_data %>% group_by (gender, status, country, disease) %>%
  summarise (n=n()) %>%
  mutate(rel.freq = paste0(round(100 * n/sum(n), 0), "%"))


`summarise()` has grouped output by 'gender', 'status', 'country'. You can override using the
`.groups` argument.
# A tibble: 32 x 6
# Groups:   gender, status, country [16]
   gender status    country disease     n rel.freq
   <fct>  <fct>     <fct>   <fct>   <int> <chr>   
 1 Female Citizen   A       No        285 60%     
 2 Female Citizen   A       Yes       193 40%  

Now (assuming this is correct), I am trying to make some modifications (i.e. pivot) to this result - this should reduce the number of rows by half (i.e. two rows in step_1 for yes and no are now combined into a single row):

step_2 = step_1 %>% 
  group_by(gender, status, country) %>% 
  summarize(disease = first(disease),
# number of people in this row that do not have the disease
            n_no = sum(disease == "No"),
# number of people in this row that do have the disease
            n_yes = sum(disease == "Yes"),
# relative percent of people in this row that do not  have the disease
            n_no_rel_freq = paste(round(sum(disease == "No") / sum(n) * 100), "%"),
# relative percent of people in this row that do have the disease
            n_yes_rel_freq = paste(round(sum(disease == "Yes") / sum(n) * 100), "%"),
# overall percent of all people in this row relative to entire population
            overall_percent = sum(n) / sum(step_1$n))

The code seems to run - but many of the percentages are now 0:

# A tibble: 16 x 9
# Groups:   gender, status [4]
   gender status    country disease  n_no n_yes n_no_rel_freq n_yes_rel_freq overall_percent
   <fct>  <fct>     <fct>   <fct>   <int> <int> <chr>         <chr>                    <dbl>
 1 Female Citizen   A       No          1     0 0 %           0 %      

Can someone please show me how to fix this?

Thanks!

Note: The final result should look something like this

    # desired result (sample)
  gender  status country n_no n_yes n_no_rel_freq n_yes_rel_freq total overall_percent
1 female citizen       A  285   193           0.6            0.4   478          0.0956

Your first step changes step_1 so that disease is always No and occurs once for every group. All of the subsequent sums are either 1 or 0.

 library(dplyr)
> step_2 = step_1 %>% 
+   group_by(gender, status, country) %>% 
+   summarize(disease = first(disease))
`summarise()` has grouped output by 'gender', 'status'. You can override
using the `.groups` argument.
> step_2
# A tibble: 16 × 4
# Groups:   gender, status [4]
   gender status    country disease
   <fct>  <fct>     <fct>   <fct>  
 1 Female Citizen   A       No     
 2 Female Citizen   B       No     
 3 Female Citizen   C       No     
 4 Female Citizen   D       No     
 5 Female Immigrant A       No     
 6 Female Immigrant B       No     
 7 Female Immigrant C       No     
 8 Female Immigrant D       No     
 9 Male   Citizen   A       No     
10 Male   Citizen   B       No     
11 Male   Citizen   C       No     
12 Male   Citizen   D       No     
13 Male   Immigrant A       No     
14 Male   Immigrant B       No     
15 Male   Immigrant C       No     
16 Male   Immigrant D       No 

Hi @omario,
I think you can employ pivot_wider() to achieve your desired outcome:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tidyr)

set.seed(123)
gender <- c("Male","Female")
gender <- sample(gender, 5000, replace=TRUE, prob=c(0.45, 0.55))
gender <- as.factor(gender)

status <- c("Immigrant","Citizen")
status <- sample(status, 5000, replace=TRUE, prob=c(0.3, 0.7))
status  <- as.factor(status )

country <- c("A", "B", "C", "D")
country <- sample(country, 5000, replace=TRUE, prob=c(0.25, 0.25, 0.25, 0.25))
country  <- as.factor(country)

disease <- c("Yes","No")
disease <- sample(disease, 5000, replace=TRUE, prob=c(0.4, 0.6))
disease <- as.factor(disease)

my_data <- data.frame(gender, status, disease, country)

head(my_data)
#>   gender    status disease country
#> 1 Female   Citizen     Yes       C
#> 2   Male Immigrant      No       C
#> 3 Female Immigrant     Yes       A
#> 4   Male   Citizen     Yes       C
#> 5   Male   Citizen      No       B
#> 6 Female   Citizen      No       C


step_1 <- my_data %>% group_by (gender, status, country, disease) %>%
  summarise(n=n()) %>%
  mutate(rel.freq = paste0(round(100 * n/sum(n), 0), "%"))
#> `summarise()` has grouped output by 'gender', 'status', 'country'. You can
#> override using the `.groups` argument.

head(step_1)
#> # A tibble: 6 × 6
#> # Groups:   gender, status, country [3]
#>   gender status  country disease     n rel.freq
#>   <fct>  <fct>   <fct>   <fct>   <int> <chr>   
#> 1 Female Citizen A       No        308 61%     
#> 2 Female Citizen A       Yes       200 39%     
#> 3 Female Citizen B       No        291 63%     
#> 4 Female Citizen B       Yes       169 37%     
#> 5 Female Citizen C       No        301 57%     
#> 6 Female Citizen C       Yes       228 43%


step_2 <- step_1 %>% 
  ungroup() %>% 
  pivot_wider(names_from = disease, values_from = c(n, rel.freq)) %>% 
  mutate(total = n_No + n_Yes,
         overall_percent = (total/sum(total)*100)) %>% 
  select(1:5, total, 6:7, overall_percent)

step_2
#> # A tibble: 16 × 9
#>    gender status    country  n_No n_Yes total rel.freq_No rel.freq_Yes overall…¹
#>    <fct>  <fct>     <fct>   <int> <int> <int> <chr>       <chr>            <dbl>
#>  1 Female Citizen   A         308   200   508 61%         39%              10.2 
#>  2 Female Citizen   B         291   169   460 63%         37%               9.2 
#>  3 Female Citizen   C         301   228   529 57%         43%              10.6 
#>  4 Female Citizen   D         245   189   434 56%         44%               8.68
#>  5 Female Immigrant A         107    95   202 53%         47%               4.04
#>  6 Female Immigrant B         126    76   202 62%         38%               4.04
#>  7 Female Immigrant C         137    70   207 66%         34%               4.14
#>  8 Female Immigrant D         129    74   203 64%         36%               4.06
#>  9 Male   Citizen   A         237   167   404 59%         41%               8.08
#> 10 Male   Citizen   B         247   163   410 60%         40%               8.2 
#> 11 Male   Citizen   C         250   171   421 59%         41%               8.42
#> 12 Male   Citizen   D         230   139   369 62%         38%               7.38
#> 13 Male   Immigrant A         103    68   171 60%         40%               3.42
#> 14 Male   Immigrant B         117    63   180 65%         35%               3.6 
#> 15 Male   Immigrant C          93    53   146 64%         36%               2.92
#> 16 Male   Immigrant D         102    52   154 66%         34%               3.08
#> # … with abbreviated variable name ¹​overall_percent

Created on 2022-12-26 with reprex v2.0.2

Hope this helps.

1 Like

Thank you so much for your answer!

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.