For multiple regressive analysis, Hyndman recommends backward step selection:
- Start with the model containing all potential predictors.
- Remove one predictor at a time. Keep the model if it improves the measure of predictive accuracy.
- Iterate until no further improvement.
In the snippet workflow below, that requires only adapting the fit_sat function.
# Libraries
suppressPackageStartupMessages({
library(feasts)
library(fpp3)
library(GGally)
library(ggplot2)
library(patchwork)
})
# functions
fit_sat <- function(x) {
x %>% model(tslm = TSLM(Sales ~
Factor_1 + Factor_2 + Factor_3 + Factor_4 + Factor_5),
tslmplus = TSLM(Sales ~
Factor_1 + Factor_2 + Factor_3 + Factor_4 + Factor_5 +
trend() + season()),
arima = ARIMA(Sales ~ Factor_1 + Factor_2 + Factor_3 + Factor_4 + Factor_5))
}
make_DF <- function(x) left_join(tsib, residuals(fit %>% select(x)), by = "Month")
plot_resids <- function(DF, Factor_1, .resid, Factor_2, Factor_3, Factor_4, Factor_5) {
p1 <- ggplot(DF, aes(x = Factor_1, y = .resid)) +
geom_point() + labs(y = "Residuals") + theme_minimal()
p2 <- ggplot(DF, aes(x = Factor_2, y = .resid)) +
geom_point() + labs(y = "Residuals") + theme_minimal()
p3 <- ggplot(DF, aes(x = Factor_3, y = .resid)) +
geom_point() + labs(y = "Residuals") + theme_minimal()
p4 <- ggplot(DF, aes(x = Factor_4, y = .resid)) +
geom_point() + labs(y = "Residuals") + theme_minimal()
p5 <- ggplot(DF, aes(x = Factor_5, y = .resid)) +
geom_point() + labs(y = "Residuals") + theme_minimal()
p6 <- ggplot(DF) + theme_minimal()
(p1 | p2) / (p3 | p4) / (p5 | p6)
}
show_lb <- function(x) {
fit %>%
select(x) %>%
augment() %>%
features(.innov, ljung_box, lag = 6, dof = 5)
}
show_metrics <- function(x) {
fit %>%
select(x) %>%
glance() %>%
t()
}
vis_corr <- function(x) x %>% ggpairs(columns = 2:6) + theme_minimal()
viz_fit <- function(x) {
fit %>%
select(x) %>%
augment() %>%
ggplot(aes(x = Month)) +
geom_line(aes(y = Sales, colour = "Data")) +
geom_line(aes(y = .fitted, colour = "Fitted")) +
labs(y = NULL, title = "Sales") +
scale_color_manual(values=c(Data="black",Fitted="red")) +
guides(colour = guide_legend(title = NULL)) +
theme_minimal()
}
vis_init <- function( x) autoplot(x) + theme_minimal()
vis_resids <- function(x) {
fit %>%
select(x) %>%
gg_tsresiduals()
}
# Sample Data:
# tips:
# don't use df, data and date because these are built-in funtion names; some operations may treat them as closures
# avoid having to tick-quote names of variables; when time comes for presentation, the column headers can easily be changed.
tsib <- tsibble(
Month = yearmonth("2019 Jan") + 0:23,
Sales = c(
7183,
6610, 8206, 8565, 9670, 9787, 9115, 11883,
9466, 9862, 6275, 9298, 9010, 8944, 7036, 2783,
5497, 8595, 7792, 8965, 12561, 11450, 8756,
9473
),
Factor_1 = c(
19070,
20301, 19056, 18915, 19546, 20399, 20388,
19126, 18174, 17104, 17536, 15443, 15488, 16651,
14724, 15082, 14371, 14416, 14865, 14525,
13887, 15968, 18994, 25454
),
Factor_2 = c(
20364,
20545, 20109, 20080, 21193, 22256 , 22755,
25551, 27215, 28535, 27251, 28197, 29524, 29921,
34792, 28715, 26983, 27009, 26988, 27232,
29185, 30070, 29144, 27335
),
Factor_4 = c(
423,
548, 365, 452, 479, 486, 487, 570, 563, 642,
904, 730, 994, 1432, 1846, 1926, 997, 464, 385,
331, 470, 328, 405, 412
),
Factor_3 = c(
7099,
7830, 6955, 8396, 10306, 10054, 9235, 10597,
8507, 8788, 6705, 9208, 9057, 10078, 5327,
3185, 4709, 8895, 8564, 9496, 12277, 14540,
11984, 15992
),
Factor_5 = c(
5726,
6419, 8685, 8760, 8639, 8703, 8407, 9150, 7849,
8721, 7009, 10320, 7527, 8518, 7244, 8512,
7181, 8721, 7467, 9342, 10824, 11753, 9449,
11373
)
)
#> Using `Month` as index variable.
# initial visualization
vis_init(tsib)
#> Plot variable not specified, automatically selected `.vars = Sales`

# correlations
vis_corr(tsib)

# saturated models
fit_sat(tsib) -> fit
# visualize fits
viz_fit("tslm")
#> Note: Using an external vector in selections is ambiguous.
#> ℹ Use `all_of(x)` instead of `x` to silence this message.
#> ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
#> This message is displayed once per session.

viz_fit("tslmplus")

viz_fit("arima")

# show metrics
show_metrics("tslm")
#> [,1]
#> .model "tslm"
#> r_squared "0.7024474"
#> adj_r_squared "0.6197938"
#> sigma2 "1645370"
#> statistic "8.498699"
#> p_value "0.0002827262"
#> df "6"
#> log_lik "-202.3641"
#> AIC "350.6191"
#> AICc "357.6191"
#> BIC "358.8654"
#> CV "3119113"
#> deviance "29616668"
#> df.residual "18"
#> rank "6"
show_metrics("tslmplus")
#> [,1]
#> .model "tslmplus"
#> r_squared "0.9721229"
#> adj_r_squared "0.8931377"
#> sigma2 "462454.4"
#> statistic "12.30767"
#> p_value "0.00261472"
#> df "18"
#> log_lik "-173.9506"
#> AIC "317.7922"
#> AICc "507.7922"
#> BIC "340.1752"
#> CV "2599013"
#> deviance "2774726"
#> df.residual "6"
#> rank "18"
show_metrics("arima")
#> [,1]
#> .model "arima"
#> sigma2 924925.9
#> log_lik -95.29308
#> AIC 204.5862
#> AICc 232.5862
#> BIC 207.9805
#> ar_roots Complex,0
#> ma_roots Complex,0
# visualize residuals
vis_resids("tslm")

vis_resids("tslmplus")

vis_resids("arima")

# portmanteau check
show_lb("tslm")
#> # A tibble: 1 x 3
#> .model lb_stat lb_pvalue
#> <chr> <dbl> <dbl>
#> 1 tslm 7.41 0.00649
show_lb("tslmplus")
#> # A tibble: 1 x 3
#> .model lb_stat lb_pvalue
#> <chr> <dbl> <dbl>
#> 1 tslmplus 20.1 0.00000729
show_lb("arima")
#> # A tibble: 1 x 3
#> .model lb_stat lb_pvalue
#> <chr> <dbl> <dbl>
#> 1 arima 15.8 0.0000699
# actual vs residuals plots
plot_resids(make_DF("tslm"))

plot_resids(make_DF("tslmplus"))

plot_resids(make_DF("arima"))
