Forecast multiple ARIMA models after a train/test split in R

MWeith · June 18, 2020, 10:13pm

I need to forecast over 100k time series with ARIMA. This is a sample of my data:

library(dplyr)
library(lubridate)
library(yardstick)
library(forecast)
library(tsibble)
# Attached last so the model(), ARIMA(), forecast() and accuracy() calls below
# resolve to the fable/fabletools versions.
library(fable)
# Sample data in long format: three series (ID 1, 2 and 3), each with 38
# monthly observations. `Period` is an integer encoded as YYYYMM and runs
# from 201612 to 202001; `Value` is the observed integer value.
df<-tibble::tribble(
      ~ID, ~Period,  ~Value,
       1L, 201612L, -19188L,
       1L, 201701L,  -8805L,
       1L, 201702L,   5092L,
       1L, 201703L,   4587L,
       1L, 201704L,  -6083L,
       1L, 201705L, -15308L,
       1L, 201706L, -12004L,
       1L, 201707L,   -791L,
       1L, 201708L,  -5151L,
       1L, 201709L,   3312L,
       1L, 201710L,   7728L,
       1L, 201711L, -20823L,
       1L, 201712L,    901L,
       1L, 201801L,   7713L,
       1L, 201802L,   4506L,
       1L, 201803L,  24475L,
       1L, 201804L, -12418L,
       1L, 201805L,  14545L,
       1L, 201806L, -14233L,
       1L, 201807L,   1271L,
       1L, 201808L, -19064L,
       1L, 201809L,  -3018L,
       1L, 201810L,  13291L,
       1L, 201811L,   7111L,
       1L, 201812L, -16961L,
       1L, 201901L,  -2442L,
       1L, 201902L,  -6861L,
       1L, 201903L,   1819L,
       1L, 201904L,   8759L,
       1L, 201905L,  -9220L,
       1L, 201906L,  -9786L,
       1L, 201907L,  -8620L,
       1L, 201908L, -47736L,
       1L, 201909L,  -2586L,
       1L, 201910L,  12347L,
       1L, 201911L,  19758L,
       1L, 201912L,   4669L,
       1L, 202001L,   1499L,
       2L, 201612L,  -6146L,
       2L, 201701L,    321L,
       2L, 201702L,  20859L,
       2L, 201703L,  -7533L,
       2L, 201704L,     72L,
       2L, 201705L,  17915L,
       2L, 201706L,   -985L,
       2L, 201707L,   -832L,
       2L, 201708L,  -1773L,
       2L, 201709L,  -2532L,
       2L, 201710L,   2280L,
       2L, 201711L, -18821L,
       2L, 201712L,  16445L,
       2L, 201801L,   1660L,
       2L, 201802L,  -1857L,
       2L, 201803L,   3221L,
       2L, 201804L, -11009L,
       2L, 201805L, -11945L,
       2L, 201806L,  -7152L,
       2L, 201807L,  -3201L,
       2L, 201808L, -13226L,
       2L, 201809L, -13568L,
       2L, 201810L, -11952L,
       2L, 201811L,   1276L,
       2L, 201812L, -20049L,
       2L, 201901L,  -7576L,
       2L, 201902L, -10370L,
       2L, 201903L,  47760L,
       2L, 201904L, -37809L,
       2L, 201905L,  -9232L,
       2L, 201906L, -18635L,
       2L, 201907L,  -6548L,
       2L, 201908L, -29065L,
       2L, 201909L,  -2225L,
       2L, 201910L,   3613L,
       2L, 201911L, -11113L,
       2L, 201912L,   4626L,
       2L, 202001L, -12083L,
       3L, 201612L,  -5602L,
       3L, 201701L,   -692L,
       3L, 201702L,   1152L,
       3L, 201703L,   -378L,
       3L, 201704L,  -2342L,
       3L, 201705L,   1059L,
       3L, 201706L, -11490L,
       3L, 201707L,   -261L,
       3L, 201708L,   1703L,
       3L, 201709L,  -6968L,
       3L, 201710L,   6915L,
       3L, 201711L,  -6320L,
       3L, 201712L, -19468L,
       3L, 201801L, -16850L,
       3L, 201802L,  -9559L,
       3L, 201803L,  -6727L,
       3L, 201804L, -29877L,
       3L, 201805L,   7453L,
       3L, 201806L, -11100L,
       3L, 201807L,  14289L,
       3L, 201808L, -16686L,
       3L, 201809L, -17925L,
       3L, 201810L,  -2381L,
       3L, 201811L, -25015L,
       3L, 201812L, -20258L,
       3L, 201901L, -12875L,
       3L, 201902L,  -8534L,
       3L, 201903L,  -3880L,
       3L, 201904L, -27034L,
       3L, 201905L, -13624L,
       3L, 201906L, -29521L,
       3L, 201907L,  -4933L,
       3L, 201908L,  -5963L,
       3L, 201909L, -15193L,
       3L, 201910L,  -2960L,
       3L, 201911L,   6150L,
       3L, 201912L,  18957L,
       3L, 202001L, -10326L
      )

Some preprocessing so the data works with ARIMA:

# Split the integer YYYYMM `Period` into year/month parts, build a
# first-of-month date from them, and convert the data to a tsibble keyed by
# series ID with a monthly `YearMonth` index.
df <- df %>%
  mutate(
    year = as.numeric(substr(Period, start = 1, stop = 4)),
    month = as.numeric(substr(Period, start = 5, stop = 6)),
    day = 1,
    date = as.character(make_date(year, month, day)),
    YearMonth = tsibble::yearmonth(ymd(date))
  ) %>%
  as_tsibble(key = ID, index = YearMonth)

Now I separate the data into training and test sets. I use the data up to the end of 2018 for training, and 2019 (plus January 2020) for testing.

 # Fit one non-seasonal ARIMA per series using observations up to Dec 2018.
 # Note: despite its name, `df_train` holds the fitted models, not data.
 df_train <- model(
   filter(df, YearMonth <= yearmonth("2018 Dec")),
   ARIMA(Value ~ PDQ(0, 0, 0), stepwise = FALSE, approximation = FALSE)
 )

 # Forecast the 13 held-out months (Jan 2019 to Jan 2020) and score the
 # forecasts against the full data. `df_test` holds the accuracy measures.
 df_test <- accuracy(forecast(df_train, h = 13), df)

Now I need to forecast each time series from Feb 2020 to Feb 2021. I don't know how to apply forecast or Arima from the forecast package, because their arguments expect a univariate time series of class ts. Is there another solution? Does anyone know how to do it for each of them? Thanks for the help!

williaml · June 18, 2020, 10:47pm

Have you had a look at the tidyverts packages fable, tsibble, and feasts?

https://tidyverts.org/

mitchelloharawild · June 19, 2020, 3:38am

When using the fable package you don't need to load the forecast package to produce forecasts.

To forecast 1 year beyond the df dataset with a non-seasonal ARIMA() model, you could use:

library(fable)
# Fit a non-seasonal ARIMA to every series in `df` (using all available data)
# and forecast 1 year beyond the end of the data. The model specification
# must be wrapped in model(); without it the call does not parse.
df %>%
  model(
    ARIMA(Value ~ PDQ(0, 0, 0), stepwise = FALSE, approximation = FALSE)
  ) %>%
  forecast(h = "1 year")

MWeith · June 19, 2020, 3:57am

But then, if I use all the data, will I get the same results as when I use a train/test split?

mitchelloharawild · June 19, 2020, 3:59am

The produced forecasts are based on the provided data. You will get different results as the forecasts are now using all available data, rather than only the data in the training set.

williaml · June 19, 2020, 4:00am

You are trying to use time series data on a machine learning problem. They are two different things and probably not something you should mix.

system · July 10, 2020, 4:09am

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.