how to plot line with multiple years of data on single plot?

I have data with a variable n recorded for each date within a time range. The time range spans multiple years, including the current calendar year to date. My goal is to compute a running sum of n within each year, plus the mean of those running sums across all years, excluding the current year to date.

How can I plot this data such that my x-axis scale is a date (like month) and not a numeric day (1-365)?

# package libraries
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.2.2
#> Warning: package 'ggplot2' was built under R version 4.2.2
#> Warning: package 'tibble' was built under R version 4.2.2
#> Warning: package 'tidyr' was built under R version 4.2.2
#> Warning: package 'readr' was built under R version 4.2.2
#> Warning: package 'purrr' was built under R version 4.2.2
#> Warning: package 'dplyr' was built under R version 4.2.2
#> Warning: package 'stringr' was built under R version 4.2.2
#> Warning: package 'forcats' was built under R version 4.2.2
library(lubridate)
#> Warning: package 'lubridate' was built under R version 4.2.2
#> Loading required package: timechange
#> Warning: package 'timechange' was built under R version 4.2.2
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union

# sample data
set.seed(1) # fixed seed so the sampled counts are reproducible

# daily count of an event happening n times each day
sample_time_series <- tibble(
  date = seq.Date(
    from = date("2000-01-01"),
    to = date("2005-04-30"),
    by = "day"
  ),
  n = sample(
    x = seq(0, 4, 1),
    # one draw per date: sized from the date column created just above
    # (instead of a hard-coded 1947) so the two cannot drift apart if the
    # date range is edited
    size = length(.data$date),
    replace = TRUE
  )
)

# some data tidy and transform steps
# Adds, for every daily row:
#   date_year / date_day : calendar year and day-of-year of `date`
#   group                : "A" = historical years, "B" = current year to date
#   n_cum                : running sum of n within each year
#   n_mean               : mean of n_cum on that day-of-year, taken over the
#                          historical years only (current year excluded)
sample_time_series <- sample_time_series %>%
  # cumsum() below relies on rows being in date order within each year
  arrange(date) %>%
  mutate(
    # create new variables for date
    date_year = year(date),
    date_day = yday(date),
    group = if_else(
      condition = date_year == 2005,
      true = "B", # year to date
      false = "A" # historical
    )
  ) %>%
  # grouped mutate() keeps one row per date; summarise() returning several
  # rows per group is deprecated as of dplyr 1.1.0
  group_by(date_year) %>%
  mutate(
    # calculate cumulative sum for each year
    n_cum = cumsum(n)
  ) %>%
  group_by(date_day) %>%
  mutate(
    # calculate mean of cumulative sum over the historical years only,
    # so the partial current year does not pull the average
    n_mean = mean(n_cum[group == "A"])
  ) %>%
  ungroup()

# plot with three layers; every layer shares the day-of-year x-axis and is
# split into one line per calendar year
ggplot(mapping = aes(x = date_day, group = as_factor(date_year))) +
  # layer 1: cumulative sum for group A
  geom_line(
    data = filter(sample_time_series, group == "A"),
    mapping = aes(y = n_cum),
    color = "#778da9", linewidth = 1, alpha = 3/5
  ) +
  # layer 2: mean for group A
  geom_line(
    data = filter(sample_time_series, group == "A"),
    mapping = aes(y = n_mean),
    color = "#415a77", linewidth = 2, alpha = 3/5
  ) +
  # layer 3: cumulative sum for group B
  geom_line(
    data = filter(sample_time_series, group == "B"),
    mapping = aes(y = n_cum),
    color = "#780000", linewidth = 2, alpha = 3/5
  ) # how does my x-scale become a date?

Created on 2023-05-15 with reprex v2.0.2

sample-plot

My approach would be to create a "dummy date" column that just sets the year for all the dates to the same year and plot that on the x-axis using scale_x_date() to display only months. It's a little funky to do inside a mutate(), but here's what my solution looks like for a simpler dataset:

library(tidyverse)
library(lubridate)

# create fake data: one Poisson count per day plus its running total
df <-
  tibble(date = seq.Date(ymd("2000-01-01"), ymd("2003-12-31"), by = "day")) |>
  mutate(x = rpois(n(), 10),
         x_cumsum = cumsum(x))

# wrangle: add the year as a factor and a "dummy date" that keeps the
# month/day of `date` but forces the year to 2000, so every year lands on
# the same x-axis range
df <-
  df |>
  mutate(
    year = factor(year(date)),
    dummy_date = {
      # work on a copy and return the copy that was modified; the local
      # name must not be `x`, which is already the count column
      same_year <- date
      # 2000 is a leap year, so Feb 29 of any source year stays valid
      year(same_year) <- 2000
      same_year
    }
  )

# one coloured line per year, with dummy_date on the x-axis
ggplot(data = df) +
  geom_line(mapping = aes(x = dummy_date, y = x_cumsum, color = year)) +
  # a break every second month, labelled with the abbreviated month name
  scale_x_date(date_breaks = "2 months", date_labels = "%b")


1 Like

I found a solution on the net at r-bloggers. I've applied that to my example problem. In summary:

  • use mutate to create new variables in the data
  • create a new object in the environment that will populate the x-axis
  • use scale_x_continuous to change the x-axis labels
# package libraries
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.2.2
#> Warning: package 'ggplot2' was built under R version 4.2.3
#> Warning: package 'tibble' was built under R version 4.2.3
#> Warning: package 'tidyr' was built under R version 4.2.2
#> Warning: package 'readr' was built under R version 4.2.2
#> Warning: package 'purrr' was built under R version 4.2.2
#> Warning: package 'dplyr' was built under R version 4.2.3
#> Warning: package 'stringr' was built under R version 4.2.2
#> Warning: package 'forcats' was built under R version 4.2.2
#> Warning: package 'lubridate' was built under R version 4.2.2
library(lubridate)

# sample data
set.seed(1) # fixed seed so the sampled counts are reproducible

# daily count of an event happening n times each day
sample_time_series <- tibble(
  date = seq.Date(
    from = date("2000-01-01"),
    to = date("2005-04-30"),
    by = "day"
  ),
  n = sample(
    x = seq(0, 4, 1),
    # one draw per date: sized from the date column created just above
    # (instead of a hard-coded 1947) so the two cannot drift apart if the
    # date range is edited
    size = length(.data$date),
    replace = TRUE
  )
)

# transform the tibble to prep the data
# Adds, for every daily row:
#   date_year / date_day : calendar year and day-of-year of `date`
#   group                : "A" = historical years, "B" = current year to date
#   n_cum                : running sum of n within each year
#   n_mean               : mean of n_cum on that day-of-year, taken over the
#                          historical years only (current year excluded)
#   month / month_day    : labelled month and day-of-month, for the x-axis
sample_time_series <- sample_time_series %>%
  # cumsum() below relies on rows being in date order within each year
  arrange(date) %>%
  mutate(
    # create new variables for date
    date_year = year(date),
    date_day = yday(date),
    group = if_else(
      condition = date_year == 2005,
      true = "B", # year to date
      false = "A" # historical
    )
  ) %>%
  # grouped mutate() keeps one row per date; summarise() returning several
  # rows per group is deprecated as of dplyr 1.1.0
  group_by(date_year) %>%
  mutate(
    # calculate cumulative sum for each year
    n_cum = cumsum(n)
  ) %>%
  group_by(date_day) %>%
  mutate(
    # calculate mean of cumulative sum over the historical years only,
    # so the partial current year does not pull the average
    n_mean = mean(n_cum[group == "A"])
  ) %>%
  ungroup() %>%
  # this section, up to the plot, will create the objects needed for the x-axis ####
  # use mutate to create new variables for month and month-day
  mutate(
    month = month(date, label = TRUE),
    month_day = mday(date)
  )

# month labels that occur in the data, kept as a standalone object
month_names <- unique(sample_time_series$month)

# axis lookup for the plot: start from the five dates that should carry a
# tick, then pull in the month label (mon) and day-of-year (doy) of the
# matching rows in the data; the join key is left implicit (shared `date`)
day_of_year <- tibble(
  date = date(c(
    "2000-02-01",
    "2000-04-01",
    "2000-06-01",
    "2000-08-01",
    "2000-10-01"
  ))
) %>%
  left_join(
    y = tibble(
      mon = month(sample_time_series$date, label = TRUE),
      doy = yday(sample_time_series$date),
      date = sample_time_series$date
    )
  )
#> Joining with `by = join_by(date)`

# plot with three layers; every layer shares the day-of-year x-axis and is
# split into one line per calendar year
ggplot(mapping = aes(x = date_day, group = as_factor(date_year))) +
  # layer 1: cumulative sum for group A
  geom_line(
    data = filter(sample_time_series, group == "A"),
    mapping = aes(y = n_cum),
    color = "#778da9", linewidth = 1, alpha = 3/5
  ) +
  # layer 2: mean for group A
  geom_line(
    data = filter(sample_time_series, group == "A"),
    mapping = aes(y = n_mean),
    color = "#415a77", linewidth = 2, alpha = 3/5
  ) +
  # layer 3: cumulative sum for group B
  geom_line(
    data = filter(sample_time_series, group == "B"),
    mapping = aes(y = n_cum),
    color = "#780000", linewidth = 2, alpha = 3/5
  ) +
  # swap the default numeric day-of-year ticks for the month labels stored
  # in the custom day_of_year object
  scale_x_continuous(
    breaks = day_of_year$doy,
    labels = day_of_year$mon
  )

Created on 2023-06-05 with reprex v2.0.2

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.