Using predict() after acquiring lm() on nest() groups. A 'Group_by' challenge

(question edit for brevity)

Hello all,

I have 2 data sets - I want to predict the Final_Scaled score in the CurrentAssessment data, the challenge is grouping the coefficient values to the correct Course.

TISCMaster - historical data, mapping student performance and outcomes after scaling. Each student can have up to 6 rows of data depending how many Courses they have taken.

CurrentAssessmentData - Current student performance before scaling and exam.

TiscMaster <- data.frame(stringsAsFactors = FALSE,
  StudentNumber = c(36546981,
                    36546981,36546981,
                    36553669,36553669,
                    36553669,36553669,
                    36566335,36566335,
                    36566335,36566335,
                    36566335,36587418,
                    36587418,36587418,
                    36587418,36590558,
                    36590558,36600083,
                    36600083,36600083,
                    36600083,36601371,
                    36601371,36601371,
                    36601371,36601371,
                    36604183,36604183,
                    36604183,36604183,
                    35751987,35751987,
                    35751987,35751987,
                    35189713,35170579,
                    35170579,35170579,
                    35170579),
           Year = c(2020,2020,
                    2020,2020,2020,2020,
                    2020,2020,2020,
                    2020,2020,2020,
                    2020,2020,2020,2020,
                    2020,2020,2020,
                    2020,2020,2020,
                    2020,2020,2020,2020,
                    2020,2020,2020,
                    2020,2020,2019,
                    2019,2019,2019,2019,
                    2019,2019,2019,
                    2019),
         Course = c("PSY","MAA",
                    "ENG","ISL","REL",
                    "MAM","LIT","BLY",
                    "PES","MAA","HBY",
                    "ENG","ENG",
                    "PES","MAA","HBY",
                    "ENG","HBY","BLY",
                    "MAA","HBY","ENG",
                    "ENG","VAR","PSY",
                    "HIM","FSL","ENG",
                    "PHY","MAA","GEO",
                    "BLY","PSY",
                    "HBY","ENG","ENG",
                    "ACF","PES","MAA",
                    "HBY"),
     Sch_Assess = c(74,74,66,76,
                    67,51,61,76,77,
                    63,67,66,62,74,
                    72,72,62,62,70,
                    78,68,69,73,80,
                    59,70,54,64,52,
                    64,64,71,74,73,
                    75,64,67,60,70,
                    59),
   Final_Scaled = c(71,77.6,59.3,
                    66.7,62.4,55.3,
                    64.5,65.3,58.8,
                    60.5,57.8,57,57.3,
                    64,68.6,65,57.3,
                    55.9,63.8,81,62.9,
                    68.3,80.5,79.4,
                    55.6,77.3,47,55.7,
                    53.6,57.7,60.3,
                    65.1,77.2,70.5,
                    74.7,43,62.7,51.3,
                    68.2,59.3))

CurrentAssessmentData <- data.frame(stringsAsFactors = FALSE,
                              StudentNumber = c("27319775",
                                                "27401491","27489605",
                                                "27489605","27489605","27489605",
                                                "27489605","27134117","27134117",
                                                "27134117","27871377",
                                                "27134117","27134117","27401491",
                                                "27401491","27401491","27401491",
                                                "27401491","27871377",
                                                "27871377","27871377","27871377",
                                                "27871377","27201641","27201641",
                                                "27201641","27201641",
                                                "27201641","27201641","27439595",
                                                "27460045","27460045","27731331",
                                                "27731331","27731331",
                                                "27731331","27731331","27269972",
                                                "27269972","27269972"),
                        TimetablePeriodCode = c("2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S2",
                                                "2020S2","2020S2","2020S2",
                                                "2020S2","2020S2","2020S2",
                                                "2020S2","2020S2","2020S2",
                                                "2020S2"),
                                     Course = c("IND","REL",
                                                "BLY","ENG","PSY","GEO",
                                                "MPA","MDT","LIT","MAM","CHE",
                                                "HBY","CHE","MPA","ENG",
                                                "FSL","DRA","PSY","HIM","MAM",
                                                "REL","ENG","PAL","REL","LIT",
                                                "PAL","MAM","FSL","HIM",
                                                "ENG","MAA","ENG","MAA","PSY",
                                                "ISL","ENG","HBY","HBY",
                                                "BLY","MAM"),
                                 Sch_Assess = c(86,67,50,
                                                65,51,67,71,83,62,48,55,
                                                59,54,76,73,80,68,55,65,
                                                50,63,63,63,70,69,79,59,
                                                75,61,62,51,49,67,58,71,
                                                64,65,74,75,71))

My goal is to create a lm() from the SchAssess group in TiscMaster (nesting by the Course column), so I can predict current student FinalScaled score using their SchAssess (which we already have), this prediction also needs to be grouped/nested by Course.

In the below example you can see that I have performed a regression analysis on just the Course 'ENG'. Then used the coefficient values to make a prediction for current students taking that course. To achieve this I have done 2 lots of filtering on Course vector using "ENG" .

rm(list = ls())

library(tidyverse)
library(magrittr)
library(broom)

# Filter both datasets
TiscMaster %>% 
  filter(Course == "ENG") -> ENG

CurrentAssessmentData %>% 
  filter(Course == "ENG") -> ENG_predict

# Calculate coefficients for the ENG dataframe
fit <- lm(Final_Scaled ~ Sch_Assess, data = ENG)
summary(fit)

predict(object = fit, newdata = data.frame(Sch_Assess = ENG_predict$Sch_Assess))

# Create new column in current student data with prediction
ENG_predict %>% 
  mutate(PredictedScaledScore = predict(fit, newdata = .)) %>%
  mutate(across(where(is.numeric), round, 1)) -> ENG_predict

I want to achieve the above but by grouping or nesting by Course (instead of creating manual filters on each course)


# This is a good nesting pipe
TiscMaster %>% 
  group_by(Course) %>% 
  nest() -> TiscNested

## And with the current Course Data!
CurrentAssessmentData %>% 
  group_by(Course) %>% 
  nest() -> CurrentAssessmentNested

# Mutate with prediction - Each course now has its own coefficient model
TiscNested %>% 
  mutate(models = map(data, function(df) lm(Final_Scaled ~ Sch_Assess, data = df))) -> TiscNested


# But how to apply these coefficient values into CurrentAssessmentNested as a new 'PredictedScaledScore' column, grouped by Course??

But now I am at the last hurdle: How do I create a new column in Current Students dataframe, that groups by Course, and uses the (also grouped by course) coefficients (slope & intercept), to provide a prediction for that student in that course?

Apologies for the big question, if you have read this far, I hope that you be able to help!

Many thanks,

Hi @loady003,
Welcome to the RStudio Community Forum.

Thanks for making the big effort to post a reproducible example.

I took the approach of standardizing the columns in the two dataframes and then row-binding them together. A quick look at the frequencies of students per Course shows that performing a course-based regression of historic scores is impossible for the majority of Courses (since the number of data points are too low: 0, 1, or 3). Maybe lumping the Courses into "subject areas" might help?

TiscMaster <- data.frame(stringsAsFactors = FALSE,
  StudentNumber = c(36546981,
                    36546981,36546981,
                    36553669,36553669,
                    36553669,36553669,
                    36566335,36566335,
                    36566335,36566335,
                    36566335,36587418,
                    36587418,36587418,
                    36587418,36590558,
                    36590558,36600083,
                    36600083,36600083,
                    36600083,36601371,
                    36601371,36601371,
                    36601371,36601371,
                    36604183,36604183,
                    36604183,36604183,
                    35751987,35751987,
                    35751987,35751987,
                    35189713,35170579,
                    35170579,35170579,
                    35170579),
           Year = c(2020,2020,
                    2020,2020,2020,2020,
                    2020,2020,2020,
                    2020,2020,2020,
                    2020,2020,2020,2020,
                    2020,2020,2020,
                    2020,2020,2020,
                    2020,2020,2020,2020,
                    2020,2020,2020,
                    2020,2020,2019,
                    2019,2019,2019,2019,
                    2019,2019,2019,
                    2019),
         Course = c("PSY","MAA",
                    "ENG","ISL","REL",
                    "MAM","LIT","BLY",
                    "PES","MAA","HBY",
                    "ENG","ENG",
                    "PES","MAA","HBY",
                    "ENG","HBY","BLY",
                    "MAA","HBY","ENG",
                    "ENG","VAR","PSY",
                    "HIM","FSL","ENG",
                    "PHY","MAA","GEO",
                    "BLY","PSY",
                    "HBY","ENG","ENG",
                    "ACF","PES","MAA",
                    "HBY"),
     Sch_Assess = c(74,74,66,76,
                    67,51,61,76,77,
                    63,67,66,62,74,
                    72,72,62,62,70,
                    78,68,69,73,80,
                    59,70,54,64,52,
                    64,64,71,74,73,
                    75,64,67,60,70,
                    59),
   Final_Scaled = c(71,77.6,59.3,
                    66.7,62.4,55.3,
                    64.5,65.3,58.8,
                    60.5,57.8,57,57.3,
                    64,68.6,65,57.3,
                    55.9,63.8,81,62.9,
                    68.3,80.5,79.4,
                    55.6,77.3,47,55.7,
                    53.6,57.7,60.3,
                    65.1,77.2,70.5,
                    74.7,43,62.7,51.3,
                    68.2,59.3))

CurrentAssessmentData <- data.frame(stringsAsFactors = FALSE,
                              StudentNumber = c("27319775",
                                                "27401491","27489605",
                                                "27489605","27489605","27489605",
                                                "27489605","27134117","27134117",
                                                "27134117","27871377",
                                                "27134117","27134117","27401491",
                                                "27401491","27401491","27401491",
                                                "27401491","27871377",
                                                "27871377","27871377","27871377",
                                                "27871377","27201641","27201641",
                                                "27201641","27201641",
                                                "27201641","27201641","27439595",
                                                "27460045","27460045","27731331",
                                                "27731331","27731331",
                                                "27731331","27731331","27269972",
                                                "27269972","27269972"),
                        TimetablePeriodCode = c("2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S2",
                                                "2020S2","2020S2","2020S2",
                                                "2020S2","2020S2","2020S2",
                                                "2020S2","2020S2","2020S2",
                                                "2020S2"),
                                     Course = c("IND","REL",
                                                "BLY","ENG","PSY","GEO",
                                                "MPA","MDT","LIT","MAM","CHE",
                                                "HBY","CHE","MPA","ENG",
                                                "FSL","DRA","PSY","HIM","MAM",
                                                "REL","ENG","PAL","REL","LIT",
                                                "PAL","MAM","FSL","HIM",
                                                "ENG","MAA","ENG","MAA","PSY",
                                                "ISL","ENG","HBY","HBY",
                                                "BLY","MAM"),
                                 Sch_Assess = c(86,67,50,
                                                65,51,67,71,83,62,48,55,
                                                59,54,76,73,80,68,55,65,
                                                50,63,63,63,70,69,79,59,
                                                75,61,62,51,49,67,58,71,
                                                64,65,74,75,71))

head(TiscMaster, n=3)
#>   StudentNumber Year Course Sch_Assess Final_Scaled
#> 1      36546981 2020    PSY         74         71.0
#> 2      36546981 2020    MAA         74         77.6
#> 3      36546981 2020    ENG         66         59.3
head(CurrentAssessmentData, n=3)
#>   StudentNumber TimetablePeriodCode Course Sch_Assess
#> 1      27319775              2020S1    IND         86
#> 2      27401491              2020S1    REL         67
#> 3      27489605              2020S1    BLY         50

suppressPackageStartupMessages(library(tidyverse))

CurrentAssessmentData %>% 
  mutate(Year = 2021,
         type = "current",
         Final_Scaled = NA) %>% 
  select(StudentNumber, Year, Course, type, Sch_Assess, Final_Scaled) -> tmp2

TiscMaster %>% 
  mutate(type = "historic") %>% 
  select(StudentNumber, Year, Course, type, Sch_Assess, Final_Scaled) -> tmp1


full.df <- rbind(tmp1, tmp2)

levels(factor(full.df$Course))
#>  [1] "ACF" "BLY" "CHE" "DRA" "ENG" "FSL" "GEO" "HBY" "HIM" "IND" "ISL" "LIT"
#> [13] "MAA" "MAM" "MDT" "MPA" "PAL" "PES" "PHY" "PSY" "REL" "VAR"
table(full.df$Course, full.df$type)
#>      
#>       current historic
#>   ACF       0        1
#>   BLY       2        3
#>   CHE       2        0
#>   DRA       1        0
#>   ENG       6        9
#>   FSL       2        1
#>   GEO       1        1
#>   HBY       3        6
#>   HIM       2        1
#>   IND       1        0
#>   ISL       1        1
#>   LIT       2        1
#>   MAA       2        6
#>   MAM       4        1
#>   MDT       1        0
#>   MPA       2        0
#>   PAL       2        0
#>   PES       0        3
#>   PHY       0        1
#>   PSY       3        3
#>   REL       3        1
#>   VAR       0        1

Created on 2021-09-07 by the reprex package (v2.0.1)

Hi @DavoWW ,

Thanks for taking the time to respond. I only provided a small amount of observations for each data frame as an example. The actual data set for TiscMaster: 3171 observations from 2015 - 2020 split across 32 Courses. Would it help if I provided everything?

Thanks again.

Hi @DavoWW & all,

I have made some progress.

I now have a list of linear models. The list items are named after the Course that they hold coefficients for. I now just need to mutate a new column into CurrentAssessmentData that uses the 'list of linear models' to make the predictions dependent on the name of the Course variable that the list item is called:

rm(list = ls())

library(tidyverse)
library(readxl)
library(scales)
library(magrittr)
library(broom)

TiscMaster <- data.frame(stringsAsFactors = FALSE,
  StudentNumber = c(36546981,
                    36546981,36546981,
                    36553669,36553669,
                    36553669,36553669,
                    36566335,36566335,
                    36566335,36566335,
                    36566335,36587418,
                    36587418,36587418,
                    36587418,36590558,
                    36590558,36600083,
                    36600083,36600083,
                    36600083,36601371,
                    36601371,36601371,
                    36601371,36601371,
                    36604183,36604183,
                    36604183,36604183,
                    35751987,35751987,
                    35751987,35751987,
                    35189713,35170579,
                    35170579,35170579,
                    35170579),
           Year = c(2020,2020,
                    2020,2020,2020,2020,
                    2020,2020,2020,
                    2020,2020,2020,
                    2020,2020,2020,2020,
                    2020,2020,2020,
                    2020,2020,2020,
                    2020,2020,2020,2020,
                    2020,2020,2020,
                    2020,2020,2019,
                    2019,2019,2019,2019,
                    2019,2019,2019,
                    2019),
         Course = c("PSY","MAA",
                    "ENG","ISL","REL",
                    "MAM","LIT","BLY",
                    "PES","MAA","HBY",
                    "ENG","ENG",
                    "PES","MAA","HBY",
                    "ENG","HBY","BLY",
                    "MAA","HBY","ENG",
                    "ENG","VAR","PSY",
                    "HIM","FSL","ENG",
                    "PHY","MAA","GEO",
                    "BLY","PSY",
                    "HBY","ENG","ENG",
                    "ACF","PES","MAA",
                    "HBY"),
     Sch_Assess = c(74,74,66,76,
                    67,51,61,76,77,
                    63,67,66,62,74,
                    72,72,62,62,70,
                    78,68,69,73,80,
                    59,70,54,64,52,
                    64,64,71,74,73,
                    75,64,67,60,70,
                    59),
   Final_Scaled = c(71,77.6,59.3,
                    66.7,62.4,55.3,
                    64.5,65.3,58.8,
                    60.5,57.8,57,57.3,
                    64,68.6,65,57.3,
                    55.9,63.8,81,62.9,
                    68.3,80.5,79.4,
                    55.6,77.3,47,55.7,
                    53.6,57.7,60.3,
                    65.1,77.2,70.5,
                    74.7,43,62.7,51.3,
                    68.2,59.3))

CurrentAssessmentData <- data.frame(stringsAsFactors = FALSE,
                              StudentNumber = c("27319775",
                                                "27401491","27489605",
                                                "27489605","27489605","27489605",
                                                "27489605","27134117","27134117",
                                                "27134117","27871377",
                                                "27134117","27134117","27401491",
                                                "27401491","27401491","27401491",
                                                "27401491","27871377",
                                                "27871377","27871377","27871377",
                                                "27871377","27201641","27201641",
                                                "27201641","27201641",
                                                "27201641","27201641","27439595",
                                                "27460045","27460045","27731331",
                                                "27731331","27731331",
                                                "27731331","27731331","27269972",
                                                "27269972","27269972"),
                        TimetablePeriodCode = c("2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S1","2020S1",
                                                "2020S1","2020S1","2020S2",
                                                "2020S2","2020S2","2020S2",
                                                "2020S2","2020S2","2020S2",
                                                "2020S2","2020S2","2020S2",
                                                "2020S2"),
                                     Course = c("IND","REL",
                                                "BLY","ENG","PSY","GEO",
                                                "MPA","MDT","LIT","MAM","CHE",
                                                "HBY","CHE","MPA","ENG",
                                                "FSL","DRA","PSY","HIM","MAM",
                                                "REL","ENG","PAL","REL","LIT",
                                                "PAL","MAM","FSL","HIM",
                                                "ENG","MAA","ENG","MAA","PSY",
                                                "ISL","ENG","HBY","HBY",
                                                "BLY","MAM"),
                                 Sch_Assess = c(86,67,50,
                                                65,51,67,71,83,62,48,55,
                                                59,54,76,73,80,68,55,65,
                                                50,63,63,63,70,69,79,59,
                                                75,61,62,51,49,67,58,71,
                                                64,65,74,75,71))


# This is a good nesting pipe
TiscMaster %>% 
  group_by(Course) %>% 
  nest() -> TiscNested

# Mutate with prediction - Each course now has its own coefficient model
TiscNested %>% 
  mutate(models = map(data, function(df) lm(Final_Scaled ~ Sch_Assess, data = df))) -> TiscNested

TiscNested %>% unnest(data) -> TiscNested


# This is a perfect case to use lapply function.
linear_model <- function(df) lm(Final_Scaled ~ Sch_Assess, data = df)

m <- lapply(split(TiscNested, TiscNested$Course),linear_model)


# Now, you have a list of linear models.
# Let's use it to predict y-value in CurrentAssessmentData for all 'Course' models:

What I am trying to manipulate is this:
CurrentAssessmentData %>%
mutate(PredictedScaledScore = predict(x, newdata = .)) #where x is the Course name in both the m list and the CurrentAssessmentData$Course

Thanks again for your time.