Wrangling Data to add rows for each year and month

Hi all! I am trying to manipulate a data set to add rows for each unique customers so that every date has an observation for each unique customer even if the observation is blank. Below is my original dataset:

I just do not know where to even start to have a new dataset be produced in RStudio where each customer will have a row from Jan 2019 to Dec 2021 even if they do not have an observation in the original dataset. In the final product if the original dataset had no value for a month it would just be blank.

Here is part of a screen shot of what I want the final product to look like:

Its so frustrating I don't even know where to start with this. Thanks ahead of time!

Here is the Dataset for RStudio:
Original:
structure(list(Cust = c("a", "a", "a", "a", "b", "b", "b", "b",
"b", "c", "c", "c", "c", "c", "d", "e", "e", "e", "f", "f", "f",
"f", "f", "f"), Year = c(2020, 2020, 2020, 2021, 2019, 2020,
2020, 2020, 2020, 2020, 2021, 2021, 2021, 2021, 2021, 2019, 2019,
2021, 2020, 2020, 2020, 2021, 2021, 2021), Month = c(2, 4, 7,
8, 1, 1, 2, 8, 11, 12, 1, 4, 5, 7, 12, 11, 12, 10, 7, 8, 10,
1, 2, 7), Value = c(17.1132392563811, 82.4624060030711, 7.36487637522714,
4.37340608492061, 52.7114539604453, 17.502410936385, 60.6250668385648,
13.7213856071386, 49.7832135247854, 74.5374119415543, 86.1627274135591,
34.3547763153967, 0.496440083511063, 73.9036004776085, 76.5231975529228,
16.930364987526, 84.0237708007239, 86.5041586423134, 53.5592603976753,
3.6294916413718, 7.3047081585912, 65.5391278724964, 50.3107515153293,
97.3149369209477)), row.names = c(NA, -24L), class = c("tbl_df",
"tbl", "data.frame"))

Goal:
structure(list(Year = c(2019, 2019, 2019, 2019, 2019, 2019, 2019,
2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019,
2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019,
2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019,
2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019,
2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019,
2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2021, 2021, 2021, 2021, 2021, 2021,
2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021,
2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021,
2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021,
2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021,
2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021,
2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021
), Month = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7,
7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10,
10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 1,
1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8,
8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 11,
11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5,
5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8,
9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,
11, 12, 12, 12, 12, 12, 12), Cust = c("a", "b", "c", "d", "e",
"f", "a", "b", "c", "d", "e", "f", "a", "b", "c", "d", "e", "f",
"a", "b", "c", "d", "e", "f", "a", "b", "c", "d", "e", "f", "a",
"b", "c", "d", "e", "f", "a", "b", "c", "d", "e", "f", "a", "b",
"c", "d", "e", "f", "a", "b", "c", "d", "e", "f", "a", "b", "c",
"d", "e", "f", "a", "b", "c", "d", "e", "f", "a", "b", "c", "d",
"e", "f", "a", "b", "c", "d", "e", "f", "a", "b", "c", "d", "e",
"f", "a", "b", "c", "d", "e", "f", "a", "b", "c", "d", "e", "f",
"a", "b", "c", "d", "e", "f", "a", "b", "c", "d", "e", "f", "a",
"b", "c", "d", "e", "f", "a", "b", "c", "d", "e", "f", "a", "b",
"c", "d", "e", "f", "a", "b", "c", "d", "e", "f", "a", "b", "c",
"d", "e", "f", "a", "b", "c", "d", "e", "f", "a", "b", "c", "d",
"e", "f", "a", "b", "c", "d", "e", "f", "a", "b", "c", "d", "e",
"f", "a", "b", "c", "d", "e", "f", "a", "b", "c", "d", "e", "f",
"a", "b", "c", "d", "e", "f", "a", "b", "c", "d", "e", "f", "a",
"b", "c", "d", "e", "f", "a", "b", "c", "d", "e", "f", "a", "b",
"c", "d", "e", "f", "a", "b", "c", "d", "e", "f", "a", "b", "c",
"d", "e", "f"), Values = c(NA, 52.7114539604453, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 16.930364987526,
NA, NA, NA, NA, NA, 84.0237708007239, NA, NA, 17.502410936385,
NA, NA, NA, NA, 17.1132392563811, 60.6250668385648, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 82.4624060030711, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7.36487637522714,
NA, NA, NA, NA, 53.5592603976753, NA, 13.7213856071386, NA, NA,
NA, 3.6294916413718, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 7.3047081585912, NA, 49.7832135247854, NA, NA, NA, NA, NA,
NA, 74.5374119415543, NA, NA, NA, NA, NA, 86.1627274135591, NA,
NA, 65.5391278724964, NA, NA, NA, NA, NA, 50.3107515153293, NA,
NA, NA, NA, NA, NA, NA, NA, 34.3547763153967, NA, NA, NA, NA,
NA, 0.496440083511063, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 73.9036004776085, NA, NA, 97.3149369209477, 4.37340608492061,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 86.5041586423134,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 76.5231975529228, NA,
NA)), row.names = c(NA, -216L), class = c("tbl_df", "tbl", "data.frame"
))

Use expand.grid to make all combinations, add to original table and summarize :

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

df <-structure(list(Cust = c("a", "a", "a", "a", "b", "b", "b", "b",
"b", "c", "c", "c", "c", "c", "d", "e", "e", "e", "f", "f", "f",
"f", "f", "f"), Year = c(2020, 2020, 2020, 2021, 2019, 2020,
2020, 2020, 2020, 2020, 2021, 2021, 2021, 2021, 2021, 2019, 2019,
2021, 2020, 2020, 2020, 2021, 2021, 2021), Month = c(2, 4, 7,
8, 1, 1, 2, 8, 11, 12, 1, 4, 5, 7, 12, 11, 12, 10, 7, 8, 10,
1, 2, 7), Value = c(17.1132392563811, 82.4624060030711, 7.36487637522714,
4.37340608492061, 52.7114539604453, 17.502410936385, 60.6250668385648,
13.7213856071386, 49.7832135247854, 74.5374119415543, 86.1627274135591,
34.3547763153967, 0.496440083511063, 73.9036004776085, 76.5231975529228,
16.930364987526, 84.0237708007239, 86.5041586423134, 53.5592603976753,
3.6294916413718, 7.3047081585912, 65.5391278724964, 50.3107515153293,
97.3149369209477)), row.names = c(NA, -24L), class = c("tbl_df",
"tbl", "data.frame"))

allcombs = expand.grid(
  Cust = unique(df$Cust),
  Year = 2019:2021,
  Month = 1:12,
  Value = 0,
  stringsAsFactors = F
)

df2 <- rbind(allcombs,df) %>%
  group_by(Cust,Year,Month) %>%
  summarize(Value = sum(Value)) %>%
  ungroup()
#> `summarise()` has grouped output by 'Cust', 'Year'. You can override using the `.groups` argument.

head(df2,20)
#> # A tibble: 20 x 4
#>    Cust   Year Month Value
#>    <chr> <dbl> <dbl> <dbl>
#>  1 a      2019     1  0   
#>  2 a      2019     2  0   
#>  3 a      2019     3  0   
#>  4 a      2019     4  0   
#>  5 a      2019     5  0   
#>  6 a      2019     6  0   
#>  7 a      2019     7  0   
#>  8 a      2019     8  0   
#>  9 a      2019     9  0   
#> 10 a      2019    10  0   
#> 11 a      2019    11  0   
#> 12 a      2019    12  0   
#> 13 a      2020     1  0   
#> 14 a      2020     2 17.1 
#> 15 a      2020     3  0   
#> 16 a      2020     4 82.5 
#> 17 a      2020     5  0   
#> 18 a      2020     6  0   
#> 19 a      2020     7  7.36
#> 20 a      2020     8  0
Created on 2021-09-02 by the reprex package (v2.0.0)

The complete function fro tidyr may also do what you need.

library(tidyr)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
DF <- structure(list(Cust = c("a", "a", "a", "a", "b", "b", "b", "b",
                              "b", "c", "c", "c", "c", "c", "d", "e", "e", "e", "f", "f", "f",
                              "f", "f", "f"), 
                     Year = c(2020, 2020, 2020, 2021, 2019, 2020,
                              2020, 2020, 2020, 2020, 2021, 2021, 2021, 2021, 2021, 2019, 2019,
                               2021, 2020, 2020, 2020, 2021, 2021, 2021), 
                     Month = c(2, 4, 7,8, 1, 1, 2, 8, 11, 12, 1, 4, 5, 7, 12, 11, 12, 10, 7, 8, 10,1, 2, 7), 
                     Value = c(17.1132392563811, 82.4624060030711, 7.36487637522714,4.37340608492061, 52.7114539604453, 17.502410936385, 60.6250668385648,
                               13.7213856071386, 49.7832135247854, 74.5374119415543, 86.1627274135591,34.3547763153967, 0.496440083511063, 73.9036004776085, 76.5231975529228,
                              16.930364987526, 84.0237708007239, 86.5041586423134, 53.5592603976753,3.6294916413718, 7.3047081585912, 65.5391278724964, 50.3107515153293,
                              97.3149369209477)), 
                row.names = c(NA, -24L), class = c("tbl_df",
                                                                                                                                                                                       "tbl", "data.frame"))
DF <- DF %>% mutate(Month = factor(Month, levels = 1:12))
DFcomplete <- DF %>% complete(Cust, Year, Month)

Created on 2021-09-02 by the reprex package (v0.3.0)

2 Likes

Thank you! This worked!

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.