Shortcuts for Calculating Percentiles

I have the following dataset:

set.seed(123)
library(dplyr)

var1 = rnorm(10000, 100,100)
var2 = rnorm(10000, 100,100)
var3 = rnorm(10000, 100,100)
var4 = rnorm(10000, 100,100)
var5 <- factor(sample(c("Yes", "No"), 1000, replace=TRUE, prob=c(0.4, 0.6)))
var6 <- factor(sample(c("Yes", "No"), 1000, replace=TRUE, prob=c(0.4, 0.6)))

my_data = data.frame( var1, var2, var3, var4, var5, var6)

I want to calculate "grouped percentiles" (e.g. at arbitrary levels) for different columns in this dataset according to the categorical variables.

** I am trying to do this "manually" in the meantime.**

For instance, suppose:

  • Based on groupings of var5 and var6
  • I want to create a variable "class3" that splits var3 into groups of 10 percentiles
  • And I want to create a variable "class4" that splits var4 into groups of 20 percentiles

As an example, here are two different ways I am trying to do this:

Method 1: Produces some NA's?

library(dplyr)
final = my_data %>% group_by(var5, var6) %>%
  mutate(class3 = case_when(ntile(var3, 10) == 1 ~ paste0(round(min(var3), 2), " to ", round(quantile(var3, 0.1), 2), " decile 1"),
                            ntile(var3, 10) == 2 ~ paste0(round(quantile(var3, 0.1), 2), " to ", round(quantile(var3, 0.2), 2), " decile 2"),
                            ntile(var3, 10) == 3 ~ paste0(round(quantile(var3, 0.2), 2), " to ", round(quantile(var3, 0.3), 2), " decile 3"),
                            ntile(var3, 10) == 4 ~ paste0(round(quantile(var3, 0.3), 2), " to ", round(quantile(var3, 0.4), 2), " decile 4"),
                            ntile(var3, 10) == 5 ~ paste0(round(quantile(var3, 0.4), 2), " to ", round(quantile(var3, 0.5), 2), " decile 5"),
                            ntile(var3, 10) == 6 ~ paste0(round(quantile(var3, 0.5), 2), " to ", round(quantile(var3, 0.6), 2), " decile 6"),
                            ntile(var3, 10) == 7 ~ paste0(round(quantile(var3, 0.6), 2), " to ", round(quantile(var3, 0.7), 2), " decile 7"),
                            ntile(var3, 10) == 8 ~ paste0(round(quantile(var3, 0.7), 2), " to ", round(quantile(var3, 0.8), 2), " decile 8"),
                            ntile(var3, 10) == 9 ~ paste0(round(quantile(var3, 0.8), 2), " to ", round(quantile(var3, 0.9), 2), " decile 9"),
                            ntile(var3, 10) == 10 ~ paste0(round(quantile(var3, 0.9), 2), " to ", round(max(var3), 2), " decile 10"))) %>%

  mutate(class4 = case_when(ntile(var4, 20) == 1 ~ paste0(round(min(var4), 2), " to ", round(quantile(var4, 0.1), 2), " pcile 1"),
                            ntile(var4, 20) == 2 ~ paste0(round(quantile(var4, 0.1), 2), " to ", round(quantile(var4, 0.2), 2), " pcile 2"),
                            ntile(var4, 20) == 3 ~ paste0(round(quantile(var4, 0.2), 2), " to ", round(quantile(var4, 0.3), 2), " pcile 3"),
                            ntile(var4, 20) == 4 ~ paste0(round(quantile(var4, 0.3), 2), " to ", round(quantile(var4, 0.4), 2), " pcile 4"),
                            ntile(var4, 20) == 5 ~ paste0(round(quantile(var4, 0.4), 2), " to ", round(quantile(var4, 0.5), 2), " pcile 5")))

Method 2: Fewer NA's?

final = my_data %>% group_by(var5, var6) %>%  mutate(class3 = paste0(cut(var3, breaks = c(-Inf, quantile(var3, c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)), Inf), 
                                                                 labels = c("ptile 1", "ptile 2", "ptile 3", "ptile 4", "ptile 5", "ptile 6", "ptile 7", "ptile 8", "ptile 9", "ptile 10")),
                                                             " (", round(min(var3), 2), " to ", round(max(var3), 2), ")")) %>% 
    mutate(class4 = paste0(cut(var4, breaks = c(-Inf, quantile(var4, c(0.2, 0.4, 0.6, 0.8)), Inf), 
                               labels = c("ptile 1", "ptile 2", "ptile 3", "ptile 4", "ptile 5")),
                           " (", round(min(var4), 2), " to ", round(max(var4), 2), ")"))

I think that Method 2 might be more correct, seeing as there are fewer NA values being produced - but in the meantime, could someone please help me verify if I am doing this correctly (in Method 2) ... and if not, how can I correct this?

Thanks!

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.