Calculating Conditional Probabilities in DPLYR

omario · December 18, 2022, 6:25pm

I have the following dataset - this data represents students (e.g. id = 1, id = 2, id = 3) who took an exam at different dates, and the result that they got (0 = pass, 1 = fail).


library(data.table)

  my_data = data.table( structure(list(id = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), results = c(0, 
0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 
1), date_exam_taken = structure(c(12889, 12943, 15445, 15528, 
17840, 10623, 10680, 11186, 11971, 12826, 13744, 13805, 14904, 
15089, 15815, 16883, 17511, 17673, 11500, 12743, 14906, 15675, 
16774), class = "Date"), exam_number = c(1L, 2L, 3L, 4L, 5L, 
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 1L, 2L, 
3L, 4L, 5L)), row.names = c(NA, 23L), class = "data.frame"))

> head(my_data)
   id results date_exam_taken exam_number
1:  1       0      2005-04-16           1
2:  1       0      2005-06-09           2
3:  1       1      2012-04-15           3
4:  1       1      2012-07-07           4
5:  1       1      2018-11-05           5
6:  2       0      1999-02-01           1

Using the following code in R, I was able to count the number of "3 exam transitions" - that is, I was able to count the number of times each student experienced :

"pass, pass, pass"
"pass, pass, fail"
etc
"fail, fail, fail"

The R code looks something like this:

my_data$current_exam = shift(my_data$results, 0)
my_data$prev_exam = shift(my_data$results, 1)
my_data$prev_2_exam = shift(my_data$results, 2)

# Count the number of exam results for each record
out <- my_data[!is.na(prev_exam), .(tally = .N), by = .(id, current_exam, prev_exam, prev_2_exam)]

out = na.omit(out)

> head(out)
    id current_exam prev_exam prev_2_exam tally
 1:  1            1         0           0     1
 2:  1            1         1           0     1
 3:  1            1         1           1     1
 4:  2            0         1           1     3

Now, I want to calculate the probability of the student pass/failing the current exam, conditional on the results of the previous exam and the second previous exam.

I thought the best way to do this was to first perform an aggregation:

library(dplyr)
agg = out %>% group_by(current_exam, prev_exam, prev_2_exam) %>% summarise(total = sum(tally))

> agg
# A tibble: 6 x 4
# Groups:   current_exam, prev_exam [3]
  current_exam prev_exam prev_2_exam total
         <dbl>     <dbl>       <dbl> <int>
1            0         1           0     1
2            0         1           1     4
3            1         0           0     1
4            1         0           1     5
5            1         1           0     4
6            1         1           1     6

From here, I am trying to look for an efficient way to calculate all conditional probabilities (i.e. P(current exam = 0 | prev_exam = 0 & prev_2_exam = 0)).

I figured out how to do this manually:

# prob (current = 1, given  prev = 1, 2nd_prev =1
p1 = agg[ agg$current_exam == 1 & agg$prev_exam == 1 & agg$prev_2_exam == 1,]
p2 = agg[ agg$current_exam == 0 & agg$prev_exam == 1 & agg$prev_2_exam == 1,]

final_prob_1_1_1 = sum(p1$total)/(sum(p1$total) + sum(p2$total))

But is there some easier way to do this? Is there some DPLYR function that can "look back" and count all combinations until the second last column and calculate all the conditional probabilities?

In the end - I am looking to get an output with 8 rows that looks something like this:

 second_prev_prev      current_exam          probs
                11            1              prob1
                11            0              prob2
                10            1              prob3
                10            0              prob4
                01            1              prob5
                01            0              prob6
                00            1              prob7
                00            0              prob8

Thanks!

Note: My attempt - is this correct?

# my own attempt
> agg %>%
     group_by(prev_exam, prev_2_exam) %>%
     mutate(probability = total / sum(total))
# A tibble: 6 x 5
# Groups:   prev_exam, prev_2_exam [4]
  current_exam prev_exam prev_2_exam total probability
         <dbl>     <dbl>       <dbl> <int>       <dbl>
1            0         1           0     1         0.2
2            0         1           1     4         0.4
3            1         0           0     1         1  
4            1         0           1     5         1  
5            1         1           0     4         0.8
6            1         1           1     6         0.6a

DavoWW · December 20, 2022, 5:20am

Hi @omario,
You were 95% there with your code. Try this:

suppressPackageStartupMessages(library(tidyverse))
library(data.table)
#> 
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:dplyr':
#> 
#>     between, first, last
#> The following object is masked from 'package:purrr':
#> 
#>     transpose

my_data <- data.table( structure(list(id = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), results = c(0, 
0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 
1), date_exam_taken = structure(c(12889, 12943, 15445, 15528, 
17840, 10623, 10680, 11186, 11971, 12826, 13744, 13805, 14904, 
15089, 15815, 16883, 17511, 17673, 11500, 12743, 14906, 15675, 
16774), class = "Date"), exam_number = c(1L, 2L, 3L, 4L, 5L, 
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 1L, 2L, 
3L, 4L, 5L)), row.names = c(NA, 23L), class = "data.frame"))

my_data
#>     id results date_exam_taken exam_number
#>  1:  1       0      2005-04-16           1
#>  2:  1       0      2005-06-09           2
#>  3:  1       1      2012-04-15           3
#>  4:  1       1      2012-07-07           4
#>  5:  1       1      2018-11-05           5
#>  6:  2       0      1999-02-01           1
#>  7:  2       1      1999-03-30           2
#>  8:  2       1      2000-08-17           3
#>  9:  2       1      2002-10-11           4
#> 10:  2       1      2005-02-12           5
#> 11:  2       0      2007-08-19           6
#> 12:  2       1      2007-10-19           7
#> 13:  2       0      2010-10-22           8
#> 14:  2       1      2011-04-25           9
#> 15:  2       1      2013-04-20          10
#> 16:  2       0      2016-03-23          11
#> 17:  2       1      2017-12-11          12
#> 18:  2       1      2018-05-22          13
#> 19:  3       1      2001-06-27           1
#> 20:  3       1      2004-11-21           2
#> 21:  3       1      2010-10-24           3
#> 22:  3       0      2012-12-01           4
#> 23:  3       1      2015-12-05           5
#>     id results date_exam_taken exam_number

my_data$current_exam = shift(my_data$results, 0)
my_data$prev_exam = shift(my_data$results, 1)
my_data$prev_2_exam = shift(my_data$results, 2)

# Check exam frequencies
with(my_data, table(id, current_exam))
#>    current_exam
#> id  0 1
#>   1 2 3
#>   2 4 9
#>   3 1 4

# Count the number of exam results for each record
out <- my_data[!is.na(prev_exam), .(tally = .N), by = .(id, current_exam, prev_exam, prev_2_exam)]
out <- na.omit(out)
out
#>     id current_exam prev_exam prev_2_exam tally
#>  1:  1            1         0           0     1
#>  2:  1            1         1           0     1
#>  3:  1            1         1           1     1
#>  4:  2            0         1           1     3
#>  5:  2            1         0           1     4
#>  6:  2            1         1           0     3
#>  7:  2            1         1           1     2
#>  8:  2            0         1           0     1
#>  9:  3            1         1           1     3
#> 10:  3            0         1           1     1
#> 11:  3            1         0           1     1

# Summarise by 3-result combinations (only 6 out of possible 8 are represented in data)
agg <- out %>% 
  group_by(current_exam, prev_exam, prev_2_exam) %>% 
  summarise(total = sum(tally))
#> `summarise()` has grouped output by 'current_exam', 'prev_exam'. You can
#> override using the `.groups` argument.
agg
#> # A tibble: 6 × 4
#> # Groups:   current_exam, prev_exam [3]
#>   current_exam prev_exam prev_2_exam total
#>          <dbl>     <dbl>       <dbl> <int>
#> 1            0         1           0     1
#> 2            0         1           1     4
#> 3            1         0           0     1
#> 4            1         0           1     5
#> 5            1         1           0     4
#> 6            1         1           1     6

# Create combined 3-result factor
agg %>% 
  mutate(combined = as.factor(paste0(prev_2_exam, prev_exam, current_exam))) -> agg

# All 8 combinations are needed in the data for this to work 
# completely (hence some NAs in output)
agg %>%
  select(prev_2_exam, prev_exam, current_exam, total, combined) %>% 
  arrange(desc(prev_2_exam), desc(prev_exam), desc(current_exam)) %>% 
  group_by(prev_2_exam, prev_exam) %>%
  mutate(probability = ifelse(current_exam == 1, 
                              total[1]/(total[1] + total[2]),
                              total[2]/(total[1] + total[2])))
#> # A tibble: 6 × 6
#> # Groups:   prev_2_exam, prev_exam [4]
#>   prev_2_exam prev_exam current_exam total combined probability
#>         <dbl>     <dbl>        <dbl> <int> <fct>          <dbl>
#> 1           1         1            1     6 111              0.6
#> 2           1         1            0     4 110              0.4
#> 3           1         0            1     5 101             NA  
#> 4           0         1            1     4 011              0.8
#> 5           0         1            0     1 010              0.2
#> 6           0         0            1     1 001             NA

^{Created on 2022-12-20 with reprex v2.0.2}

Hope this helps.

omario · December 20, 2022, 6:49pm

Thank you so much for your answer! I wonder if your code can easily be applied to datasets with more columns?

system · January 31, 2023, 6:50pm

This topic was automatically closed 42 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.