Mutate a field that calculates average time between purchases

dplyr

#1

I am trying to calculate the best average result for a return purchase by product and then also by customer


library(readxl)
library(tidyverse)
library(lubridate)
library(reprex)

df <- read_excel("reprex.xlsx")

df$Date_Purchased <- as.Date(df$Date_Purchased)



df <- df %>%
  filter(!is.na(Product_ID)) %>% 
  select(Product_ID, Customer_ID, Date_Purchased) %>%
  group_by(Product_ID, Customer_ID) %>%
  arrange(Date_Purchased, .by_group = TRUE) %>% 
  mutate(previousPurchaseDate = lag(Date_Purchased)) %>% 
  mutate(previousPurchaseDate = case_when(
    is.na(previousPurchaseDate) ~ Date_Purchased,
    TRUE ~ lag(Date_Purchased))) %>% 
  mutate(difference = Date_Purchased - previousPurchaseDate) %>% 
  mutate(difference = as.numeric(difference)) 
head(df, n=10)

df_average <- df %>% 
  summarise(average = mean(difference, rm=TRUE)) 

head(df_average, n=10)

# A tibble: 10 x 5
# Groups:   Product_ID, Customer_ID [5]
   Product_ID Customer_ID Date_Purchased previousPurchaseDate difference
   <chr>            <dbl> <date>         <date>                    <dbl>
 1 Apple                1 2018-10-16     2018-10-16                    0
 2 Apple                1 2018-10-26     2018-10-16                   10
 3 Apple                2 2018-10-18     2018-10-18                    0
 4 Apple                2 2018-10-28     2018-10-18                   10
 5 Apple                3 2018-10-22     2018-10-22                    0
 6 Apple                3 2018-10-24     2018-10-22                    2
 7 Apple                3 2018-10-29     2018-10-24                    5
 8 Banana               1 2018-10-17     2018-10-17                    0
 9 Banana               1 2018-10-26     2018-10-17                    9
10 Banana               2 2018-10-18     2018-10-18                    0
> 
> df_average <- df %>% 
+   summarise(average = mean(difference, rm=TRUE)) 
> 
> head(df_average, n=10)
# A tibble: 9 x 3
# Groups:   Product_ID [3]
  Product_ID Customer_ID average
  <chr>            <dbl>   <dbl>
1 Apple                1    5   
2 Apple                2    5   
3 Apple                3    2.33
4 Banana               1    4.5 
5 Banana               2    5.5 
6 Banana               3    0   
7 Orange               1    1   
8 Orange               2    5.5 
9 Orange               3    6.5 



I think the geometric mean would be the best option, but I've been struggling to get it to work with lots of weird results.

#2

In the first snippet, I made all the difference == 0 into 1s

library(EnvStats)
> df_average <- df %>% group_by(Product_ID) %>% summarize(geo_mean = geoMean(difference))
> df_average
# A tibble: 2 x 2
  Product_ID geo_mean
  <fct>         <dbl>
1 Apple          2.68
2 Banana         2.08

Here's the original data with some zero difference values

> df0 <- as.tibble(read.csv("foo.csv"))
> df_average <- df0 %>% group_by(Product_ID) %>% summarize(geo_mean = geoMean(difference))
Warning messages:
1: In geoMean(difference) : Non-positive values in 'x'
2: In geoMean(difference) : Non-positive values in 'x'
> 

I think you'll need to censor the 0 values to be able to do geometric means.


#3

Thank you so much, I filtered out the 0 values which then only kept in those purchases that were repeat purchases and then the geometric mean calculation worked.
Really appreciate the help :slight_smile:


#4

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.