Is this close to what you want. Note that the first row has NAs becasue its DateTime is smaller than any eventStart in step2.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(readr)
library(fuzzyjoin)
id <- "1IGcTYs_gSMxyiwPAQ4dP8UHoey4agTc3"
df1=read_csv(paste0("https://docs.google.com/uc?id=",id,"&export=download"),
col_names = TRUE)
#> Parsed with column specification:
#> cols(
#> datetime = col_datetime(format = ""),
#> VP = col_double()
#> )
colnames(df1)=c("DateTime","rain")
step1 <- df1 %>%
mutate(rainflag = ifelse(rain > 0, 1, 0),
rainlength = rep(rle(rainflag)$lengths, rle(rainflag)$lengths),
eventflag = ifelse(rainflag == 1,1, ifelse(rainflag == 0 & rainlength < 25, 1,0 )),
eventflag = ifelse(row_number() < 25 & rainflag == 0, 0, eventflag),
eventid = rep(seq(1,length(rle(eventflag)$lengths)), rle(eventflag)$lengths))
step2 <- step1 %>%
filter(eventflag == 1) %>%
group_by(eventid) %>%
summarize(
temp_tot = sum(rain),
eventStart = first(DateTime),
eventEnd = last(DateTime)) %>%
mutate(dur = as.numeric(difftime(eventEnd,eventStart, units = 'hour'))+0.25,)
#> `summarise()` ungrouping output (override with `.groups` argument)
step2=step2 %>%
mutate(eventid=seq(1:nrow(step2)))
dfSmall <- df1[1:100, ]
dfNew <- fuzzy_left_join(dfSmall, step2, by = c("DateTime" = "eventStart",
"DateTime" = "eventEnd"),
match_fun = list(`>=`, `<=`))
head(dfNew)
#> # A tibble: 6 x 7
#> DateTime rain eventid temp_tot eventStart
#> <dttm> <dbl> <int> <dbl> <dttm>
#> 1 1984-01-10 01:30:00 0 NA NA NA
#> 2 1984-01-10 01:45:00 0.254 1 14.5 1984-01-10 01:45:00
#> 3 1984-01-10 02:00:00 0.254 1 14.5 1984-01-10 01:45:00
#> 4 1984-01-10 02:15:00 0.254 1 14.5 1984-01-10 01:45:00
#> 5 1984-01-10 02:30:00 0.254 1 14.5 1984-01-10 01:45:00
#> 6 1984-01-10 02:45:00 0.762 1 14.5 1984-01-10 01:45:00
#> # ... with 2 more variables: eventEnd <dttm>, dur <dbl>
Created on 2021-03-16 by the reprex package (v0.3.0)
Here is cleaner output of the relevant parts of dfNew
> head(dfNew[,c(1,2,3,4,7)])
# A tibble: 6 x 5
DateTime rain eventid temp_tot dur
<dttm> <dbl> <int> <dbl> <dbl>
1 1984-01-10 01:30:00 0 NA NA NA
2 1984-01-10 01:45:00 0.254 1 14.5 5.25
3 1984-01-10 02:00:00 0.254 1 14.5 5.25
4 1984-01-10 02:15:00 0.254 1 14.5 5.25
5 1984-01-10 02:30:00 0.254 1 14.5 5.25
6 1984-01-10 02:45:00 0.762 1 14.5 5.25