Read year week as Date for Time series

Hi All,

My data frame has this column Year Week whose values are string- example- "201901". Here the 01 represent first week of 2019.
I intend to change this to a date format.
Could any one please help me with this?
As I'd want "201901" to be read as the Year week 2019 week 01 as I intend to perform a time series analysis on this.
Please advise
Thank you

It's preferable to use the frequency argument to construct a time series over converting date strings.

dat <- c(LETTERS,LETTERS)
my_ts <- ts(dat, start = c(2019,1), frequency = 52)
my_ts
#> Time Series:
#> Start = c(2019, 1) 
#> End = c(2019, 52) 
#> Frequency = 52 
#>  [1] A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A B C D E F G H I J K L
#> [39] M N O P Q R S T U V W X Y Z

Thank you. I have tried that but the problem is i have 53 weeks for some years. thats why im seeking options to read my data directly into year week. Is there any possibility for that?
I have attached my data frame here

Mydf<-structure(list(YearWeek = c("201901", "201902", "201903", "201904", 
"201905", "201906", "201907", "201908", "201909", "201910", "201911", 
"201912", "201913", "201914", "201915", "201916", "201917", "201918", 
"201919", "201920", "201921", "201922", "201923", "201924", "201925", 
"201926", "201927", "201928", "201929", "201930", "201931", "201932", 
"201933", "201934", "201935", "201936", "201937", "201938", "201939", 
"201940", "201941", "201942", "201943", "201944", "201945", "201946", 
"201947", "201948", "201949", "201950", "201951", "201952", "202001", 
"202002", "202003", "202004", "202005", "202006", "202007", "202008", 
"202009", "202010", "202011", "202012", "202013", "202014", "202015", 
"202016", "202017", "202018", "202019", "202020", "202021", "202022", 
"202023", "202024", "202025", "202026", "202027", "202028", "202029", 
"202030", "202031", "202032", "202033", "202034", "202035", "202036", 
"202037", "202038", "202039", "202040", "202041", "202042", "202043", 
"202044", "202045", "202046", "202047", "202048", "202049", "202050", 
"202051", "202052", "202053", "202101", "202102", "202103", "202104", 
"202105", "202106", "202107", "202108", "202109", "202110", "202111", 
"202112", "202113", "202114", "202115", "202116", "202117", "202118", 
"202119", "202120", "202121", "202122", "202123", "202124", "202125", 
"202126", "202127", "202128", "202129", "202130", "202131", "202132", 
"202133", "202134", "202135", "202136", "202137", "202138", "202139", 
"202140", "202141", "202142", "202143"), Shipment = c(418, 1442, 
1115, 1203, 1192, 1353, 1191, 1411, 933, 1384, 1362, 1353, 1739, 
1751, 1595, 1380, 1711, 2058, 1843, 1602, 2195, 2159, 2009, 1812, 
2195, 1763, 821, 1892, 1781, 2071, 1789, 1789, 1732, 1384, 1435, 
1247, 1839, 2034, 1963, 1599, 1596, 1548, 1084, 1350, 1856, 1882, 
1979, 1021, 1311, 2031, 1547, 591, 724, 1535, 1268, 1021, 1269, 
1763, 1275, 1411, 1847, 1379, 1606, 1473, 1180, 926, 800, 840, 
1375, 1755, 1902, 1921, 1743, 1275, 1425, 1088, 1416, 1168, 842, 
1185, 1570, 1435, 1209, 1470, 1368, 1926, 1233, 1189, 1245, 1465, 
1226, 887, 1489, 1369, 1358, 1179, 1200, 1226, 1066, 823, 1913, 
2308, 1842, 910, 794, 1098, 1557, 1417, 1851, 1876, 1010, 160, 
1803, 1607, 1185, 1347, 1700, 981, 1191, 1058, 1464, 1513, 1333, 
1169, 1294, 978, 962, 1254, 987, 1290, 758, 436, 579, 636, 614, 
906, 982, 649, 564, 502, 274, 473, 506, 902, 639, 810, 398, 488
), Production = c(0, 198, 1436, 1055, 1396, 1330, 1460, 1628, 
1513, 1673, 1737, 1274, 1726, 1591, 2094, 1411, 2009, 1909, 1759, 
1693, 1748, 1455, 2078, 1717, 1737, 1886, 862, 1382, 1779, 1423, 
1460, 1454, 1347, 1409, 1203, 1235, 1397, 1563, 1411, 1455, 1706, 
688, 1446, 1336, 1618, 1404, 1759, 746, 1560, 1665, 1317, 0, 
441, 1390, 1392, 1180, 1477, 1265, 1485, 1495, 1543, 1584, 1575, 
1609, 1233, 1420, 908, 1008, 1586, 1392, 1385, 1259, 1010, 973, 
1053, 905, 1101, 1196, 891, 1033, 925, 889, 1136, 1058, 1179, 
1047, 967, 900, 904, 986, 1014, 945, 1030, 1066, 1191, 1143, 
1292, 574, 1174, 515, 1296, 1315, 1241, 0, 0, 1182, 1052, 1107, 
1207, 1254, 1055, 258, 1471, 1344, 1353, 1265, 1444, 791, 1397, 
1186, 1264, 1032, 949, 1059, 954, 798, 956, 1074, 1136, 1209, 
975, 833, 994, 1127, 1153, 1202, 1234, 1336, 1484, 1515, 1151, 
1175, 976, 1135, 1272, 869, 1900, 1173)), row.names = c(NA, 148L
), class = "data.frame")

The problem in weekly data is that there are not actually 52 weeks in the year, more like 52.18. See Hyndman ยง12.1. Displaying the data values won't fix that.

Mydf <- data.frame(YearWeek = c(
  "201901", "201902", "201903", "201904",
  "201905", "201906", "201907", "201908", "201909", "201910", "201911",
  "201912", "201913", "201914", "201915", "201916", "201917", "201918",
  "201919", "201920", "201921", "201922", "201923", "201924", "201925",
  "201926", "201927", "201928", "201929", "201930", "201931", "201932",
  "201933", "201934", "201935", "201936", "201937", "201938", "201939",
  "201940", "201941", "201942", "201943", "201944", "201945", "201946",
  "201947", "201948", "201949", "201950", "201951", "201952", "202001",
  "202002", "202003", "202004", "202005", "202006", "202007", "202008",
  "202009", "202010", "202011", "202012", "202013", "202014", "202015",
  "202016", "202017", "202018", "202019", "202020", "202021", "202022",
  "202023", "202024", "202025", "202026", "202027", "202028", "202029",
  "202030", "202031", "202032", "202033", "202034", "202035", "202036",
  "202037", "202038", "202039", "202040", "202041", "202042", "202043",
  "202044", "202045", "202046", "202047", "202048", "202049", "202050",
  "202051", "202052", "202053", "202101", "202102", "202103", "202104",
  "202105", "202106", "202107", "202108", "202109", "202110", "202111",
  "202112", "202113", "202114", "202115", "202116", "202117", "202118",
  "202119", "202120", "202121", "202122", "202123", "202124", "202125",
  "202126", "202127", "202128", "202129", "202130", "202131", "202132",
  "202133", "202134", "202135", "202136", "202137", "202138", "202139",
  "202140", "202141", "202142", "202143"
), Shipment = c(
  418, 1442,
  1115, 1203, 1192, 1353, 1191, 1411, 933, 1384, 1362, 1353, 1739,
  1751, 1595, 1380, 1711, 2058, 1843, 1602, 2195, 2159, 2009, 1812,
  2195, 1763, 821, 1892, 1781, 2071, 1789, 1789, 1732, 1384, 1435,
  1247, 1839, 2034, 1963, 1599, 1596, 1548, 1084, 1350, 1856, 1882,
  1979, 1021, 1311, 2031, 1547, 591, 724, 1535, 1268, 1021, 1269,
  1763, 1275, 1411, 1847, 1379, 1606, 1473, 1180, 926, 800, 840,
  1375, 1755, 1902, 1921, 1743, 1275, 1425, 1088, 1416, 1168, 842,
  1185, 1570, 1435, 1209, 1470, 1368, 1926, 1233, 1189, 1245, 1465,
  1226, 887, 1489, 1369, 1358, 1179, 1200, 1226, 1066, 823, 1913,
  2308, 1842, 910, 794, 1098, 1557, 1417, 1851, 1876, 1010, 160,
  1803, 1607, 1185, 1347, 1700, 981, 1191, 1058, 1464, 1513, 1333,
  1169, 1294, 978, 962, 1254, 987, 1290, 758, 436, 579, 636, 614,
  906, 982, 649, 564, 502, 274, 473, 506, 902, 639, 810, 398, 488
), Production = c(
  0, 198, 1436, 1055, 1396, 1330, 1460, 1628,
  1513, 1673, 1737, 1274, 1726, 1591, 2094, 1411, 2009, 1909, 1759,
  1693, 1748, 1455, 2078, 1717, 1737, 1886, 862, 1382, 1779, 1423,
  1460, 1454, 1347, 1409, 1203, 1235, 1397, 1563, 1411, 1455, 1706,
  688, 1446, 1336, 1618, 1404, 1759, 746, 1560, 1665, 1317, 0,
  441, 1390, 1392, 1180, 1477, 1265, 1485, 1495, 1543, 1584, 1575,
  1609, 1233, 1420, 908, 1008, 1586, 1392, 1385, 1259, 1010, 973,
  1053, 905, 1101, 1196, 891, 1033, 925, 889, 1136, 1058, 1179,
  1047, 967, 900, 904, 986, 1014, 945, 1030, 1066, 1191, 1143,
  1292, 574, 1174, 515, 1296, 1315, 1241, 0, 0, 1182, 1052, 1107,
  1207, 1254, 1055, 258, 1471, 1344, 1353, 1265, 1444, 791, 1397,
  1186, 1264, 1032, 949, 1059, 954, 798, 956, 1074, 1136, 1209,
  975, 833, 994, 1127, 1153, 1202, 1234, 1336, 1484, 1515, 1151,
  1175, 976, 1135, 1272, 869, 1900, 1173
))

my_ts <- ts(Mydf, start = c(2019, 1), frequency = 52)
my_ts
#> Time Series:
#> Start = c(2019, 1) 
#> End = c(2021, 44) 
#> Frequency = 52 
#>          YearWeek Shipment Production
#> 2019.000        1      418          0
#> 2019.019        2     1442        198
#> 2019.038        3     1115       1436
#> 2019.058        4     1203       1055
#> 2019.077        5     1192       1396
#> 2019.096        6     1353       1330
#> 2019.115        7     1191       1460
#> 2019.135        8     1411       1628
#> 2019.154        9      933       1513
#> 2019.173       10     1384       1673
#> 2019.192       11     1362       1737
#> 2019.212       12     1353       1274
#> 2019.231       13     1739       1726
#> 2019.250       14     1751       1591
#> 2019.269       15     1595       2094
#> 2019.288       16     1380       1411
#> 2019.308       17     1711       2009
#> 2019.327       18     2058       1909
#> 2019.346       19     1843       1759
#> 2019.365       20     1602       1693
#> 2019.385       21     2195       1748
#> 2019.404       22     2159       1455
#> 2019.423       23     2009       2078
#> 2019.442       24     1812       1717
#> 2019.462       25     2195       1737
#> 2019.481       26     1763       1886
#> 2019.500       27      821        862
#> 2019.519       28     1892       1382
#> 2019.538       29     1781       1779
#> 2019.558       30     2071       1423
#> 2019.577       31     1789       1460
#> 2019.596       32     1789       1454
#> 2019.615       33     1732       1347
#> 2019.635       34     1384       1409
#> 2019.654       35     1435       1203
#> 2019.673       36     1247       1235
#> 2019.692       37     1839       1397
#> 2019.712       38     2034       1563
#> 2019.731       39     1963       1411
#> 2019.750       40     1599       1455
#> 2019.769       41     1596       1706
#> 2019.788       42     1548        688
#> 2019.808       43     1084       1446
#> 2019.827       44     1350       1336
#> 2019.846       45     1856       1618
#> 2019.865       46     1882       1404
#> 2019.885       47     1979       1759
#> 2019.904       48     1021        746
#> 2019.923       49     1311       1560
#> 2019.942       50     2031       1665
#> 2019.962       51     1547       1317
#> 2019.981       52      591          0
#> 2020.000       53      724        441
#> 2020.019       54     1535       1390
#> 2020.038       55     1268       1392
#> 2020.058       56     1021       1180
#> 2020.077       57     1269       1477
#> 2020.096       58     1763       1265
#> 2020.115       59     1275       1485
#> 2020.135       60     1411       1495
#> 2020.154       61     1847       1543
#> 2020.173       62     1379       1584
#> 2020.192       63     1606       1575
#> 2020.212       64     1473       1609
#> 2020.231       65     1180       1233
#> 2020.250       66      926       1420
#> 2020.269       67      800        908
#> 2020.288       68      840       1008
#> 2020.308       69     1375       1586
#> 2020.327       70     1755       1392
#> 2020.346       71     1902       1385
#> 2020.365       72     1921       1259
#> 2020.385       73     1743       1010
#> 2020.404       74     1275        973
#> 2020.423       75     1425       1053
#> 2020.442       76     1088        905
#> 2020.462       77     1416       1101
#> 2020.481       78     1168       1196
#> 2020.500       79      842        891
#> 2020.519       80     1185       1033
#> 2020.538       81     1570        925
#> 2020.558       82     1435        889
#> 2020.577       83     1209       1136
#> 2020.596       84     1470       1058
#> 2020.615       85     1368       1179
#> 2020.635       86     1926       1047
#> 2020.654       87     1233        967
#> 2020.673       88     1189        900
#> 2020.692       89     1245        904
#> 2020.712       90     1465        986
#> 2020.731       91     1226       1014
#> 2020.750       92      887        945
#> 2020.769       93     1489       1030
#> 2020.788       94     1369       1066
#> 2020.808       95     1358       1191
#> 2020.827       96     1179       1143
#> 2020.846       97     1200       1292
#> 2020.865       98     1226        574
#> 2020.885       99     1066       1174
#> 2020.904      100      823        515
#> 2020.923      101     1913       1296
#> 2020.942      102     2308       1315
#> 2020.962      103     1842       1241
#> 2020.981      104      910          0
#> 2021.000      105      794          0
#> 2021.019      106     1098       1182
#> 2021.038      107     1557       1052
#> 2021.058      108     1417       1107
#> 2021.077      109     1851       1207
#> 2021.096      110     1876       1254
#> 2021.115      111     1010       1055
#> 2021.135      112      160        258
#> 2021.154      113     1803       1471
#> 2021.173      114     1607       1344
#> 2021.192      115     1185       1353
#> 2021.212      116     1347       1265
#> 2021.231      117     1700       1444
#> 2021.250      118      981        791
#> 2021.269      119     1191       1397
#> 2021.288      120     1058       1186
#> 2021.308      121     1464       1264
#> 2021.327      122     1513       1032
#> 2021.346      123     1333        949
#> 2021.365      124     1169       1059
#> 2021.385      125     1294        954
#> 2021.404      126      978        798
#> 2021.423      127      962        956
#> 2021.442      128     1254       1074
#> 2021.462      129      987       1136
#> 2021.481      130     1290       1209
#> 2021.500      131      758        975
#> 2021.519      132      436        833
#> 2021.538      133      579        994
#> 2021.558      134      636       1127
#> 2021.577      135      614       1153
#> 2021.596      136      906       1202
#> 2021.615      137      982       1234
#> 2021.635      138      649       1336
#> 2021.654      139      564       1484
#> 2021.673      140      502       1515
#> 2021.692      141      274       1151
#> 2021.712      142      473       1175
#> 2021.731      143      506        976
#> 2021.750      144      902       1135
#> 2021.769      145      639       1272
#> 2021.788      146      810        869
#> 2021.808      147      398       1900
#> 2021.827      148      488       1173

# create univariate time series
shipment <- ts(Mydf$Shipment, start = c(2019,1), frequency = 52)
production <- ts(Mydf$Production, start = c(2019,1), frequency = 52)
library(fpp2)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
#> โ”€โ”€ Attaching packages โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ fpp2 2.4 โ”€โ”€
#> โœ“ ggplot2   3.3.5     โœ“ fma       2.4  
#> โœ“ forecast  8.15      โœ“ expsmooth 2.3
#> 

shipment %>% stlf() %>% autoplot() + theme_minimal()

production %>% stlf() %>% autoplot() + theme_minimal()

1 Like

You can convert strings to dates with strptime() (and more functionalities in the package lubridate). It uses a format argument where you can specify what the date you want to read looks like. You would be interested in:

  • %Y
    Year with century. Note that whereas there was no zero in the original Gregorian calendar, ISO 8601:2004 defines it to be valid (interpreted as 1BC): see Year zero - Wikipedia. However, the standards also say that years before 1582 in its calendar should only be used with agreement of the parties involved.
    For input, only years 0:9999 are accepted.

  • %W
    Week of the year as decimal number (00โ€“53) using Monday as the first day of week (and typically with the first Monday of the year as day 1 of week 1). The UK convention.

  • %U
    Week of the year as decimal number (00โ€“53) using Sunday as the first day 1 of the week (and typically with the first Sunday of the year as day 1 of week 1). The US convention.

But there is a problem: the output date object has to have a day defined. If you just try this the year will be read properly but the week is ignored as the day is unknown (filled as today instead):

strptime(c("202001", "202052"), format = "%Y%U")
#> [1] "2020-11-20 EST" "2020-11-20 EST"

So, one workaround is to pretend this is always the first day of the week:

strptime(c("202001_1", "202052_1"), format = "%Y%U_%u")
[1] "2020-01-06 EST" "2020-12-28 EST"

But you see you have a problem here: the week "1" is the one with the first Monday, so you won't have a week 53:

strptime(c("202001_1", "202052_1", "202053_1"), format = "%Y%U_%u")
#> [1] "2020-01-06 EST" "2020-12-28 EST" NA              
#> Warning message:
#> In strptime(c("202001_1", "202052_1", "202053_1"), format = "%Y%U_%u") :
#>   (0-based) yday 369 in year 2020 is invalid

This can be solved using the package ISOweek instead (see here too):

ISOweek::ISOweek2date("2020-W53-1")
#> [1] "2020-12-28"

Giving us finally the code:

library(tidyverse)

Mydf <- Mydf |>
  mutate(isoweek = str_replace(YearWeek,
                               "^(\\d{4})(\\d{2})$",
                               "\\1-W\\2-1"),
         date = ISOweek::ISOweek2date(isoweek))

head(Mydf)

#>   YearWeek Shipment Production    isoweek       date
#> 1   201901      418          0 2019-W01-1 2018-12-31
#> 2   201902     1442        198 2019-W02-1 2019-01-07
#> 3   201903     1115       1436 2019-W03-1 2019-01-14
#> 4   201904     1203       1055 2019-W04-1 2019-01-21
#> 5   201905     1192       1396 2019-W05-1 2019-01-28
#> 6   201906     1353       1330 2019-W06-1 2019-02-04

Of course during subsequent analyses you need to make sure that the artifact of having every week's activity on a single day is not a problem.

2 Likes

Thank you Alexis. It is highly helpful. One query though, how do we suppose to convert it into a time series object by keeping the same week numbers. Because we supposed to mention the frequency on the TS object(like frequency =52 or 52.18) for weekly data, is there any possibility for a Time series object to have 53 weeks in one year(2020) and 52 weeks in other years(2019.2021).

I have used the frequency as 52.18(365.25/7) on the time series object and when I checked the index week, it seems like it does consider 53 weeks for 2020 and 52 weeks for 2019,2020 without using the isoweek. Is this correct?

Mydf2<- Mydf[,c("Shipment","Production","Net.Production.Qty")]
Mydf.ts<-ts(Mydf2,frequency = 365.25/7,start = c(2019,1))
check<- as_tsibble(Shipment.ts) %>%
        mutate(Date.Week= as_date(index)) %>%
        as_tibble()
View(check)