 Disaggregate weekly data to daily but choosing the start date?

Happy Saturday everyone!

I have this weekly sale data and want to create a daily timeseries. The missing date would be given the same values using na.locf.

Question: can I assign the start date of week #1 to January 1st of the year and go from there?

Thank you!

library(zoo)
library(tidyverse)
library(lubridate)

DF <- structure(list(Year = c(1999, 2000), Wk01 = c(2.421, 3.39), Wk02 = c(3.518,
2.591), Wk03 = c(4.539, 2.211), Wk04 = c(3.251, 3.487), Wk05 = c(4.05,
3.908), Wk06 = c(4.129, 3.714), Wk07 = c(3.967, 3.481), Wk08 = c(1.317,
3.939), Wk09 = c(3.507, 3.77), Wk10 = c(3.235, 3.347), Wk11 = c(3.856,
4.607), Wk12 = c(4.9, 4.871), Wk13 = c(3.534, 6.704), Wk14 = c(3.673,
7.236), Wk15 = c(3.631, 6.559), Wk16 = c(4.789, 6.037), Wk17 = c(6.597,
7.838), Wk18 = c(4.542, 5.466), Wk19 = c(4.4, 5.692), Wk20 = c(5.935,
5.403), Wk21 = c(5.937, 5.56), Wk22 = c(11.755, 6.933), Wk23 = c(11.755,
11.005), Wk24 = c(10.484, 13.286), Wk25 = c(12.688, 10.646),
Wk26 = c(13.811, 12.303), Wk27 = c(29.954, 11.826), Wk28 = c(28.48,
8.961), Wk29 = c(37.812, 7.795), Wk30 = c(20.841, 7.813),
Wk31 = c(13.835, 5.609), Wk32 = c(15.241, 6.538), Wk33 = c(20.373,
6.046), Wk34 = c(11.184, 4.521), Wk35 = c(9.519, 5.489),
Wk36 = c(8.578, 7.567), Wk37 = c(7.154, 5.377), Wk38 = c(6.025,
5.541), Wk39 = c(6.655, 5.213), Wk40 = c(5.827, 5.072), Wk41 = c(5.841,
5.046), Wk42 = c(5.823, 4.59), Wk43 = c(5.607, 4.423), Wk44 = c(5.394,
4.27), Wk45 = c(5.359, 3.959), Wk46 = c(5.177, 3.621), Wk47 = c(5.976,
3.704), Wk48 = c(6.358, 3.242), Wk49 = c(5.025, 3.143), Wk50 = c(4.339,
1.786), Wk51 = c(4.228, 1.442), Wk52 = c(4.872, 3.225)), row.names = c(NA,
-2L), class = c("tbl_df", "tbl", "data.frame"))
DF
#> # A tibble: 2 x 53
#>    Year  Wk01  Wk02  Wk03  Wk04  Wk05  Wk06  Wk07  Wk08  Wk09  Wk10  Wk11
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  1999  2.42  3.52  4.54  3.25  4.05  4.13  3.97  1.32  3.51  3.24  3.86
#> 2  2000  3.39  2.59  2.21  3.49  3.91  3.71  3.48  3.94  3.77  3.35  4.61
#> # ... with 41 more variables: Wk12 <dbl>, Wk13 <dbl>, Wk14 <dbl>,
#> #   Wk15 <dbl>, Wk16 <dbl>, Wk17 <dbl>, Wk18 <dbl>, Wk19 <dbl>,
#> #   Wk20 <dbl>, Wk21 <dbl>, Wk22 <dbl>, Wk23 <dbl>, Wk24 <dbl>,
#> #   Wk25 <dbl>, Wk26 <dbl>, Wk27 <dbl>, Wk28 <dbl>, Wk29 <dbl>,
#> #   Wk30 <dbl>, Wk31 <dbl>, Wk32 <dbl>, Wk33 <dbl>, Wk34 <dbl>,
#> #   Wk35 <dbl>, Wk36 <dbl>, Wk37 <dbl>, Wk38 <dbl>, Wk39 <dbl>,
#> #   Wk40 <dbl>, Wk41 <dbl>, Wk42 <dbl>, Wk43 <dbl>, Wk44 <dbl>,
#> #   Wk45 <dbl>, Wk46 <dbl>, Wk47 <dbl>, Wk48 <dbl>, Wk49 <dbl>,
#> #   Wk50 <dbl>, Wk51 <dbl>, Wk52 <dbl>

DF_lg <- DF %>%
gather(key = "WeekNr", value = "Sale", -Year) %>%
separate(WeekNr, into = c("dummy", "Week"),
sep = "k",
remove = FALSE) %>%
mutate(Date = as.Date(paste(Year, Week, 1, sep = "-"), "%Y-%U-%u")) %>%
arrange(Year) %>%
select(Date, Year, Week, Sale)
DF_lg
#> # A tibble: 104 x 4
#>    Date        Year Week   Sale
#>    <date>     <dbl> <chr> <dbl>
#>  1 1999-01-04  1999 01     2.42
#>  2 1999-01-11  1999 02     3.52
#>  3 1999-01-18  1999 03     4.54
#>  4 1999-01-25  1999 04     3.25
#>  5 1999-02-01  1999 05     4.05
#>  6 1999-02-08  1999 06     4.13
#>  7 1999-02-15  1999 07     3.97
#>  8 1999-02-22  1999 08     1.32
#>  9 1999-03-01  1999 09     3.51
#> 10 1999-03-08  1999 10     3.24
#> # ... with 94 more rows

# create a daily date data frame
start_date <- as.Date("1999-01-01")
end_date <- as.Date("2000-12-31")
daily_vec <- tibble(Date = seq(start_date,
end_date,
by = 'day'))

# merge then do interpolation
DF_daily <- DF_lg %>%
right_join(daily_vec, by = 'Date') %>%
na.locf(na.rm = FALSE)
DF_daily
#> # A tibble: 731 x 4
#>    Date        Year Week   Sale
#>    <date>     <dbl> <chr> <dbl>
#>  1 1999-01-01    NA <NA>  NA
#>  2 1999-01-02    NA <NA>  NA
#>  3 1999-01-03    NA <NA>  NA
#>  4 1999-01-04  1999 01     2.42
#>  5 1999-01-05  1999 01     2.42
#>  6 1999-01-06  1999 01     2.42
#>  7 1999-01-07  1999 01     2.42
#>  8 1999-01-08  1999 01     2.42
#>  9 1999-01-09  1999 01     2.42
#> 10 1999-01-10  1999 01     2.42
#> # ... with 721 more rows

May be it is too obvious, but I fail to understand what you're trying to do here. Actually not what, but why. Can you please clarify?

Also, I do not follow where do you want to use this.

Most probably you can't in base R, as 1st January need not be in week 1, rather in week 0 (see ?strptime), but if you can provide your objective, maybe it can be solved in some other approach.

Edit: You may be interested in this SO thread (and the linked thread in the accepted answer):

1 Like

You can simplify the creation of DF_lg a little using str_sub like this:

DF_lg <- DF %>%
gather(key="Week", value="Sale", -Year) %>%
mutate(
Week = str_sub(Week, 3, 4),
Date = as.Date(paste(Year, Week, 1, sep="-"), "%Y-%U-%u")
) %>%
select(Date, Week, Sale) %>%
arrange(Date)

But to answer your question, the lead function from dplyr will help do what you want:

DF_daily <- DF_lg %>% right_join(
tibble(Date = seq(as_date("1999-01-01"), as_date("2000-12-31"), by="day")),
by="Date") %>%
na.locf(na.rm=FALSE) %>%
mutate(Sale = lead(Sale, n=3)) %>%
select(Date, Sale)
1 Like

Thank you @robjhyndman! This is very close. There are still some problems with the last days of every year (see the new longer example below). Is there a way to chose the n parameter of the lead function automatically?

library(zoo)
library(tidyverse)
library(lubridate)

DF <- structure(list(Year = c(1998, 1999, 2000), Wk01 = c(3.39, 2.421,
3.39), Wk02 = c(2.591, 3.518, 2.591), Wk03 = c(2.211, 4.539,
2.211), Wk04 = c(3.487, 3.251, 3.487), Wk05 = c(3.908, 4.05,
3.908), Wk06 = c(3.714, 4.129, 3.714), Wk07 = c(3.481, 3.967,
3.481), Wk08 = c(3.939, 1.317, 3.939), Wk09 = c(3.77, 3.507,
3.77), Wk10 = c(3.347, 3.235, 3.347), Wk11 = c(4.607, 3.856,
4.607), Wk12 = c(4.871, 4.9, 4.871), Wk13 = c(6.704, 3.534, 6.704
), Wk14 = c(7.236, 3.673, 7.236), Wk15 = c(6.559, 3.631, 6.559
), Wk16 = c(6.037, 4.789, 6.037), Wk17 = c(7.838, 6.597, 7.838
), Wk18 = c(5.466, 4.542, 5.466), Wk19 = c(5.692, 4.4, 5.692),
Wk20 = c(5.403, 5.935, 5.403), Wk21 = c(5.56, 5.937, 5.56
), Wk22 = c(6.933, 11.755, 6.933), Wk23 = c(11.005, 11.755,
11.005), Wk24 = c(13.286, 10.484, 13.286), Wk25 = c(10.646,
12.688, 10.646), Wk26 = c(12.303, 13.811, 12.303), Wk27 = c(11.826,
29.954, 11.826), Wk28 = c(8.961, 28.48, 8.961), Wk29 = c(7.795,
37.812, 7.795), Wk30 = c(7.813, 20.841, 7.813), Wk31 = c(5.609,
13.835, 5.609), Wk32 = c(6.538, 15.241, 6.538), Wk33 = c(6.046,
20.373, 6.046), Wk34 = c(4.521, 11.184, 4.521), Wk35 = c(5.489,
9.519, 5.489), Wk36 = c(7.567, 8.578, 7.567), Wk37 = c(5.377,
7.154, 5.377), Wk38 = c(5.541, 6.025, 5.541), Wk39 = c(5.213,
6.655, 5.213), Wk40 = c(5.072, 5.827, 5.072), Wk41 = c(5.046,
5.841, 5.046), Wk42 = c(4.59, 5.823, 4.59), Wk43 = c(4.423,
5.607, 4.423), Wk44 = c(4.27, 5.394, 4.27), Wk45 = c(3.959,
5.359, 3.959), Wk46 = c(3.621, 5.177, 3.621), Wk47 = c(3.704,
5.976, 3.704), Wk48 = c(3.242, 6.358, 3.242), Wk49 = c(3.143,
5.025, 3.143), Wk50 = c(1.786, 4.339, 1.786), Wk51 = c(1.442,
4.228, 1.442), Wk52 = c(3.225, 4.872, 3.225)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"))

DF_lg <- DF %>%
gather(key = "WeekNr", value = "Sale", -Year) %>%
separate(WeekNr, into = c("dummy", "Week"),
sep = "k",
remove = FALSE) %>%
mutate(Date = as.Date(paste(Year, Week, 1, sep = "-"), "%Y-%U-%u")) %>%
arrange(Year) %>%
select(Date, Year, Week, Sale)
DF_lg
#> # A tibble: 156 x 4
#>    Date        Year Week   Sale
#>    <date>     <dbl> <chr> <dbl>
#>  1 1998-01-05  1998 01     3.39
#>  2 1998-01-12  1998 02     2.59
#>  3 1998-01-19  1998 03     2.21
#>  4 1998-01-26  1998 04     3.49
#>  5 1998-02-02  1998 05     3.91
#>  6 1998-02-09  1998 06     3.71
#>  7 1998-02-16  1998 07     3.48
#>  8 1998-02-23  1998 08     3.94
#>  9 1998-03-02  1998 09     3.77
#> 10 1998-03-09  1998 10     3.35
#> # ... with 146 more rows

DF_lg2 <- DF %>%
gather(key="Week", value="Sale", -Year) %>%
mutate(
Week = str_sub(Week, 3, 4),
Date = as.Date(paste(Year, Week, 1, sep="-"), "%Y-%U-%u")
) %>%
select(Date, Week, Sale) %>%
arrange(Date)

# create a daily date
start_date <- min(as.Date(paste(DF\$Year, "1", "1", sep = "-")))
end_date <- max(as.Date(paste(DF\$Year, "12", "31", sep = "-")))
daily_vec <- tibble(Date = seq(start_date,
end_date,
by = 'day'))

# Can we automate the number of `lead` e.g., `n=xxx`
DF_daily2 <- DF_lg %>% right_join(
daily_vec,
by="Date") %>%
na.locf(na.rm=FALSE) %>%
mutate(Sale = lead(Sale, n=4)) %>%
select(Date, Sale)
DF_daily2
#> # A tibble: 1,096 x 2
#>    Date        Sale
#>    <date>     <dbl>
#>  1 1998-01-01  3.39
#>  2 1998-01-02  3.39
#>  3 1998-01-03  3.39
#>  4 1998-01-04  3.39
#>  5 1998-01-05  3.39
#>  6 1998-01-06  3.39
#>  7 1998-01-07  3.39
#>  8 1998-01-08  2.59
#>  9 1998-01-09  2.59
#> 10 1998-01-10  2.59
#> # ... with 1,086 more rows

# Problem with the last days of the year
DF_daily2 %>%
filter(Date >= daily_vec\$Date & Date <= daily_vec\$Date)
#> # A tibble: 5 x 2
#>   Date        Sale
#>   <date>     <dbl>
#> 1 1998-12-29  3.22
#> 2 1998-12-30  3.22
#> 3 1998-12-31  2.42
#> 4 1999-01-01  2.42
#> 5 1999-01-02  2.42

DF_daily2 %>%
filter(Date >= daily_vec\$Date & Date <= daily_vec\$Date)
#> # A tibble: 5 x 2
#>   Date        Sale
#>   <date>     <dbl>
#> 1 1999-12-28  4.87
#> 2 1999-12-29  4.87
#> 3 1999-12-30  3.39
#> 4 1999-12-31  3.39
#> 5 2000-01-01  3.39

DF_daily2 %>%
tail()
#> # A tibble: 6 x 2
#>   Date        Sale
#>   <date>     <dbl>
#> 1 2000-12-26  3.22
#> 2 2000-12-27  3.22
#> 3 2000-12-28 NA
#> 4 2000-12-29 NA
#> 5 2000-12-30 NA
#> 6 2000-12-31 NA

Thank you @Yarnabrina! All I want to do is to make a daily data starting from 01/01 based on 52 weeks of data. Here I would consider week 01 starts from Jan 01 to Jan 07 and so on. This is what my boss wanted @RuReady, based on your previous data, here's a solution, with the problem that I considered the last week to be of length 8, to account for 31^{st} December, since for non-leap years, 52^{nd} week (as per your definition) will start on 24^{th} December. Is that okay? I didn't follow your problem very well, sorry!

library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#>     filter, lag
#> The following objects are masked from 'package:base':
#>
#>     intersect, setdiff, setequal, union
library(tidyr)

DF <- structure(.Data = list(Year = c(1999, 2000),
Wk01 = c(2.421, 3.39),
Wk02 = c(3.518, 2.591),
Wk03 = c(4.539, 2.211),
Wk04 = c(3.251, 3.487),
Wk05 = c(4.05, 3.908),
Wk06 = c(4.129, 3.714),
Wk07 = c(3.967, 3.481),
Wk08 = c(1.317, 3.939),
Wk09 = c(3.507, 3.77),
Wk10 = c(3.235, 3.347),
Wk11 = c(3.856, 4.607),
Wk12 = c(4.9, 4.871),
Wk13 = c(3.534, 6.704),
Wk14 = c(3.673, 7.236),
Wk15 = c(3.631, 6.559),
Wk16 = c(4.789, 6.037),
Wk17 = c(6.597, 7.838),
Wk18 = c(4.542, 5.466),
Wk19 = c(4.4, 5.692),
Wk20 = c(5.935, 5.403),
Wk21 = c(5.937, 5.56),
Wk22 = c(11.755, 6.933),
Wk23 = c(11.755, 11.005),
Wk24 = c(10.484, 13.286),
Wk25 = c(12.688, 10.646),
Wk26 = c(13.811, 12.303),
Wk27 = c(29.954, 11.826),
Wk28 = c(28.48, 8.961),
Wk29 = c(37.812, 7.795),
Wk30 = c(20.841, 7.813),
Wk31 = c(13.835, 5.609),
Wk32 = c(15.241, 6.538),
Wk33 = c(20.373, 6.046),
Wk34 = c(11.184, 4.521),
Wk35 = c(9.519, 5.489),
Wk36 = c(8.578, 7.567),
Wk37 = c(7.154, 5.377),
Wk38 = c(6.025, 5.541),
Wk39 = c(6.655, 5.213),
Wk40 = c(5.827, 5.072),
Wk41 = c(5.841, 5.046),
Wk42 = c(5.823, 4.59),
Wk43 = c(5.607, 4.423),
Wk44 = c(5.394, 4.27),
Wk45 = c(5.359, 3.959),
Wk46 = c(5.177, 3.621),
Wk47 = c(5.976, 3.704),
Wk48 = c(6.358, 3.242),
Wk49 = c(5.025, 3.143),
Wk50 = c(4.339, 1.786),
Wk51 = c(4.228, 1.442),
Wk52 = c(4.872, 3.225)),
row.names = c(NA, -2L),
class = c("tbl_df", "tbl", "data.frame"))

DF %>%
pivot_longer(cols = starts_with(match = "Wk"),
names_to = "WeekNumber",
names_prefix = "Wk0*",
names_ptypes = list(WeekNumber = integer()),
values_to = "Sale") %>%
group_by(Year) %>%
mutate(Date = as.Date(x = paste(Year, WeekNumber, 1),
format = "%Y %U %u") - (7- as.POSIXlt(x = paste0(Year, "-01-01"),
format = "%Y-%m-%d")\$wday + 1)) %>%
select(-WeekNumber) %>%
complete(Date = full_seq(x = as.Date(x = paste0(Year, c("-01-01", "-12-31")),
format = "%Y-%m-%d"),
period = 1)) %>%
fill(Sale,
.direction = "down") %>%
ungroup()
#> # A tibble: 731 x 3
#>     Year Date        Sale
#>    <dbl> <date>     <dbl>
#>  1  1999 1999-01-01  2.42
#>  2  1999 1999-01-02  2.42
#>  3  1999 1999-01-03  2.42
#>  4  1999 1999-01-04  2.42
#>  5  1999 1999-01-05  2.42
#>  6  1999 1999-01-06  2.42
#>  7  1999 1999-01-07  2.42
#>  8  1999 1999-01-08  3.52
#>  9  1999 1999-01-09  3.52
#> 10  1999 1999-01-10  3.52
#> # ... with 721 more rows

Created on 2019-09-15 by the reprex package (v0.3.0)

1 Like

Here is a more automated solution.

DF_lg <- DF %>%
gather(key = "Week", value = "Sale", -Year) %>%
mutate(
Week = str_sub(Week, 3, 4),
Date = as.Date(paste(Year, Week, 1, sep = "-"), "%Y-%U-%u")
) %>%
select(Date, Week, Sale) %>%
arrange(Date)

start_date <- as_date(paste0(min(DF\$Year), "-01-01"))
end_date <- as_date(paste0(max(DF\$Year), "-12-31"))
gap <- as.numeric(min(DF_lg\$Date) - start_date)

DF_daily <- DF_lg %>%
right_join(
tibble(Date = seq(start_date, end_date + 7, by = "day")),
by = "Date"
) %>%
na.locf(na.rm = FALSE) %>%
mutate(Sale = lead(Sale, n = gap)) %>%
select(Date, Sale) %>%
filter(Date <= end_date)
2 Likes

Thank you @robjhyndman! This still didn't solve the problem with the last few days of each year

library(zoo)
library(tidyverse)
library(lubridate)

DF <- structure(list(Year = c(1998, 1999, 2000), Wk01 = c(3.39, 2.421,
3.39), Wk02 = c(2.591, 3.518, 2.591), Wk03 = c(2.211, 4.539,
2.211), Wk04 = c(3.487, 3.251, 3.487), Wk05 = c(3.908, 4.05,
3.908), Wk06 = c(3.714, 4.129, 3.714), Wk07 = c(3.481, 3.967,
3.481), Wk08 = c(3.939, 1.317, 3.939), Wk09 = c(3.77, 3.507,
3.77), Wk10 = c(3.347, 3.235, 3.347), Wk11 = c(4.607, 3.856,
4.607), Wk12 = c(4.871, 4.9, 4.871), Wk13 = c(6.704, 3.534, 6.704
), Wk14 = c(7.236, 3.673, 7.236), Wk15 = c(6.559, 3.631, 6.559
), Wk16 = c(6.037, 4.789, 6.037), Wk17 = c(7.838, 6.597, 7.838
), Wk18 = c(5.466, 4.542, 5.466), Wk19 = c(5.692, 4.4, 5.692),
Wk20 = c(5.403, 5.935, 5.403), Wk21 = c(5.56, 5.937, 5.56
), Wk22 = c(6.933, 11.755, 6.933), Wk23 = c(11.005, 11.755,
11.005), Wk24 = c(13.286, 10.484, 13.286), Wk25 = c(10.646,
12.688, 10.646), Wk26 = c(12.303, 13.811, 12.303), Wk27 = c(11.826,
29.954, 11.826), Wk28 = c(8.961, 28.48, 8.961), Wk29 = c(7.795,
37.812, 7.795), Wk30 = c(7.813, 20.841, 7.813), Wk31 = c(5.609,
13.835, 5.609), Wk32 = c(6.538, 15.241, 6.538), Wk33 = c(6.046,
20.373, 6.046), Wk34 = c(4.521, 11.184, 4.521), Wk35 = c(5.489,
9.519, 5.489), Wk36 = c(7.567, 8.578, 7.567), Wk37 = c(5.377,
7.154, 5.377), Wk38 = c(5.541, 6.025, 5.541), Wk39 = c(5.213,
6.655, 5.213), Wk40 = c(5.072, 5.827, 5.072), Wk41 = c(5.046,
5.841, 5.046), Wk42 = c(4.59, 5.823, 4.59), Wk43 = c(4.423,
5.607, 4.423), Wk44 = c(4.27, 5.394, 4.27), Wk45 = c(3.959,
5.359, 3.959), Wk46 = c(3.621, 5.177, 3.621), Wk47 = c(3.704,
5.976, 3.704), Wk48 = c(3.242, 6.358, 3.242), Wk49 = c(3.143,
5.025, 3.143), Wk50 = c(1.786, 4.339, 1.786), Wk51 = c(1.442,
4.228, 1.442), Wk52 = c(3.225, 4.872, 3.225)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"))

DF_lg <- DF %>%
gather(key = "Week", value = "Sale", -Year) %>%
mutate(
Week = str_sub(Week, 3, 4),
Date = as.Date(paste(Year, Week, 1, sep = "-"), "%Y-%U-%u")
) %>%
select(Date, Week, Sale) %>%
arrange(Date)

start_date <- as_date(paste0(min(DF\$Year), "-01-01"))
end_date <- as_date(paste0(max(DF\$Year), "-12-31"))
daily_vec <- tibble(Date = seq(start_date,
end_date,
by = 'day'))
gap <- as.numeric(min(DF_lg\$Date) - start_date)

DF_daily <- DF_lg %>%
right_join(
tibble(Date = seq(start_date, end_date + 7, by = "day")),
by = "Date"
) %>%
na.locf(na.rm = FALSE) %>%
mutate(Sale = lead(Sale, n = gap)) %>%
select(Date, Sale) %>%
filter(Date <= end_date)

DF_daily %>%
filter(Date >= daily_vec\$Date & Date <= daily_vec\$Date)
#> # A tibble: 5 x 2
#>   Date        Sale
#>   <date>     <dbl>
#> 1 1998-12-29  3.22
#> 2 1998-12-30  3.22
#> 3 1998-12-31  2.42
#> 4 1999-01-01  2.42
#> 5 1999-01-02  2.42

DF_daily %>%
filter(Date >= daily_vec\$Date & Date <= daily_vec\$Date)
#> # A tibble: 5 x 2
#>   Date        Sale
#>   <date>     <dbl>
#> 1 1999-12-28  4.87
#> 2 1999-12-29  4.87
#> 3 1999-12-30  3.39
#> 4 1999-12-31  3.39
#> 5 2000-01-01  3.39

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.