Correctly Using Window Functions in R

I have this time series data:

library(forecast)
library(lubridate)

set.seed(123)

weeks <- rep(seq(as.Date("2010-01-01"), as.Date("2023-01-01"), by = "week"), each = 1)
counts <- rpois(length(weeks), lambda = 50)
df <- data.frame(Week = as.character(weeks), Count = counts)

# Convert Week column to Date format
df$Week <- as.Date(df$Week)

# Create a time series object
ts_data <- ts(df$Count, frequency = 52, start = c(year(min(df$Week)), 1))

I want to fit a time series model to this data and implement a "rolling window cross validation"

As I understand, this involves (ordering the data in chronological order):

  • Fit a model to the first 60 data points, predict the next 5 and record the error (e.g. rmse, mae, mape)
  • Next, fit a model model to the first 65 points, predict the next 5 and record the error
  • etc.

I tried to apply this logic for my question to create the "chunks" of data required for training and testing the model:

x <- seq(from = 60, to = 679, by = 5)
y <- x[-length(x)] + 1

#train <- lapply(x, \(x) df[seq_len(x), ])
#test <- lapply(y, \(x) df[seq(from = x, by = 1, length.out = 5), ])

train <- lapply(x, function(x) {
    df[seq_len(x), ]
})

test <- lapply(y, function(x) {
    df[seq(from = x, by = 1, length.out = 5), ]
})

Then, I tried to run the models (training):

train_list = list()

for (i in 1:length(train))
{
    data_i = data.frame(train[i])
    data_i = ts(data_i$Count, frequency = 52, start = c(year(min(data_i$Week)), 1))
    fit_arima_i <- auto.arima( data_i , seasonal = TRUE, lambda = "auto")
train_list[[i]] = fit_arima_i
print(fit_arima_i)
}

And then testing:

 test_list = list()
mae = list()
rmse = list()
mape = list()

for (i in 1:length(test)) {
    tryCatch({
        test_i <- data.frame(test[i])
        fcast_i <- forecast(train_list[[i]], h = 5)
        mae[i] <- sum(abs(as.numeric(fcast_i$mean) - test_i$Count))
        rmse[i] <- sqrt(mean((as.numeric(fcast_i$mean) - test_i$Count)^2))
        mape[i] <- mean(abs((as.numeric(fcast_i$mean) - test_i$Count) / test_i$Count)) * 100
        #print(c(sum(abs(as.numeric(fcast_i$mean) - test_i$Count)), sqrt(mean((as.numeric(fcast_i$mean) - test_i$Count)^2)), mean(abs((as.numeric(fcast_i$mean) - test_i$Count) / test_i$Count)) * 100))
    }, error = function(e) {
        # Handle the error here
        message(paste0("Error occurred for i = ", i, ": ", e))
    })
}

Can someone please tell me if I am doing this correctly?

Thanks!

Note 1: An extra approach using matrices (to store errors up to 5 forecasts ahead)

rmse = matrix(0, nrow = length(test) , ncol = 5)
mae = matrix(0, nrow = length(test) , ncol = 5)
mape = matrix(0, nrow = length(test) , ncol = 5)    

for (i in 1:length(test)) {
    tryCatch({
        test_i <- data.frame(test[i])
        fcast_1_i <- forecast(train_list[[i]], h = 1)
        
        fcast_2_i <- forecast(train_list[[i]], h = 2)
        fcast_3_i <- forecast(train_list[[i]], h = 3)
        fcast_4_i <- forecast(train_list[[i]], h = 4)
        fcast_5_i <- forecast(train_list[[i]], h = 5)
        
        mae[i,1] <- sum(abs(as.numeric(fcast_1_i$mean) -  test_i[1,2]))
        rmse[i,1] <- sqrt(mean((as.numeric(fcast_1_i$mean) -  test_i[1,2])^2))
        mape[i,1] <- mean(abs((as.numeric(fcast_1_i$mean) -  test_i[1,2]) /  test_i[1,2])) * 100
        
        
        mae[i,2] <- sum(abs(as.numeric(fcast_2_i$mean) -  test_i[1:2,2]))
        rmse[i,2] <- sqrt(mean((as.numeric(fcast_2_i$mean) -  test_i[1:2,2])^2))
        mape[i,2] <- mean(abs((as.numeric(fcast_2_i$mean) -  test_i[1:2,2]) /  test_i[1:2,2])) * 100
        
        mae[i,3] <- sum(abs(as.numeric(fcast_3_i$mean) -  test_i[1:3,2]))
        rmse[i,3] <- sqrt(mean((as.numeric(fcast_3_i$mean) -  test_i[1:3,2])^2))
        mape[i,3] <- mean(abs((as.numeric(fcast_3_i$mean) -  test_i[1:3,2]) /  test_i[1:3,2])) * 100
        
                   
        mae[i,4] <- sum(abs(as.numeric(fcast_4_i$mean) -  test_i[1:4,2]))
        rmse[i,4] <- sqrt(mean((as.numeric(fcast_4_i$mean) -  test_i[1:4,2])^2))
        mape[i,4] <- mean(abs((as.numeric(fcast_4_i$mean) -  test_i[1:4,2]) /  test_i[1:4,2])) * 100
                    
        
        mae[i,5] <- sum(abs(as.numeric(fcast_5_i$mean) - test_i$Count))
        rmse[i,5] <- sqrt(mean((as.numeric(fcast_5_i$mean) - test_i$Count)^2))
        mape[i,5] <- mean(abs((as.numeric(fcast_5_i$mean) - test_i$Count) / test_i$Count)) * 100
        
        
    }, error = function(e) {
        # Handle the error here
        message(paste0("Error occurred for i = ", i, ": ", e))
    })
} 

Note 2: The same approach using sub-loops for efficiency:

n_horizon <- 5
rmse <- matrix(0, nrow = length(test), ncol = n_horizon)
mae <- matrix(0, nrow = length(test), ncol = n_horizon)
mape <- matrix(0, nrow = length(test), ncol = n_horizon)    

for (i in 1:length(test)) {
  tryCatch({
    test_i <- data.frame(test[i])
    fcast <- list()
    
    for (h in 1:n_horizon) {
      fcast[[h]] <- forecast(train_list[[i]], h = h)
      mae[i, h] <- sum(abs(as.numeric(fcast[[h]]$mean) - test_i[1:h, 2]))
      rmse[i, h] <- sqrt(mean((as.numeric(fcast[[h]]$mean) - test_i[1:h, 2])^2))
      mape[i, h] <- mean(abs((as.numeric(fcast[[h]]$mean) - test_i[1:h, 2]) / test_i[1:h, 2])) * 100
    }
    
  }, error = function(e) {
    message(paste0("Error occurred for i = ", i, ": ", e))
  })
}

This is really a response to your earlier post. I assume you are modeling after someone’s approach for monthly data. Starting with the first 60 months means you will have five years, which is enough to get reasonably accurate seasonal estimates. Using weekly data you have just one year plus two months.

If you use the first 60 weeks to forecast the next 5 weeks, you would follow that by using the first 61 weeks to forecast the next five weeks and so on.

For more on cross validation for time series look at section 5.10 in the third edition of Forecasting Principles and Practice (free online). Rob Hyndman is the creator of the Forecast package and its successor the Tidyverts set of packages.

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.