Add columns using specific rows

How can I add columns to a dataframe (df1) from another dataframe (step2) in such a way that:

  1. Each DateTime in df1 that falls between the eventStart and eventEnd of a row in step2 is given that row's eventid, temp_tot and dur values.
library(readr)
library(tidyverse)

id <- "1IGcTYs_gSMxyiwPAQ4dP8UHoey4agTc3"
df1=read_csv(paste0("https://docs.google.com/uc?id=",id,"&export=download"),
              col_names = TRUE)
colnames(df1)=c("DateTime","rain")
step1 <- df1 %>% 
  mutate(rainflag = ifelse(rain > 0, 1, 0),
         rainlength = rep(rle(rainflag)$lengths, rle(rainflag)$lengths),
         eventflag = ifelse(rainflag == 1,1, ifelse(rainflag == 0 & rainlength < 25, 1,0 )),
         eventflag = ifelse(row_number() < 25 & rainflag == 0, 0, eventflag),
         eventid = rep(seq(1,length(rle(eventflag)$lengths)), rle(eventflag)$lengths))

step2 <- step1 %>% 
  filter(eventflag == 1) %>% 
  group_by(eventid) %>% 
  summarize(
    temp_tot = sum(rain),
    eventStart = first(DateTime),
    eventEnd = last(DateTime)) %>% 
  mutate(dur = as.numeric(difftime(eventEnd, eventStart, units = 'hours')) + 0.25)
step2 <- step2 %>%
  mutate(eventid = seq_len(nrow(step2)))
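
For context on step1: rle() encodes a vector as runs of equal values, and rep(rle(x)$lengths, rle(x)$lengths) stamps every element with the length of the run it sits in; the eventid line then numbers those runs consecutively. A minimal illustration of the idiom:

x <- c(0, 1, 1, 0, 0, 0)
rle(x)$lengths
#> [1] 1 2 3
rep(rle(x)$lengths, rle(x)$lengths)
#> [1] 1 2 2 3 3 3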

Expected outcome:

 DateTime             rain eventid temp_tot   dur
  <dttm>              <dbl>   <dbl>    <dbl> <dbl>
1 1984-01-10 01:30:00 0           1     14.5  5.25
2 1984-01-10 01:45:00 0.254       1     14.5  5.25
3 1984-01-10 02:00:00 0.254       1     14.5  5.25
4 1984-01-10 02:15:00 0.254       1     14.5  5.25
5 1984-01-10 02:30:00 0.254       1     14.5  5.25
6 1984-01-10 02:45:00 0.762       1     14.5  5.25

Is this close to what you want? Note that the first row has NAs because its DateTime is earlier than any eventStart in step2.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(readr)
library(fuzzyjoin)
id <- "1IGcTYs_gSMxyiwPAQ4dP8UHoey4agTc3"
df1=read_csv(paste0("https://docs.google.com/uc?id=",id,"&export=download"),
             col_names = TRUE)
#> Parsed with column specification:
#> cols(
#>   datetime = col_datetime(format = ""),
#>   VP = col_double()
#> )
colnames(df1)=c("DateTime","rain")
step1 <- df1 %>% 
  mutate(rainflag = ifelse(rain > 0, 1, 0),
         rainlength = rep(rle(rainflag)$lengths, rle(rainflag)$lengths),
         eventflag = ifelse(rainflag == 1,1, ifelse(rainflag == 0 & rainlength < 25, 1,0 )),
         eventflag = ifelse(row_number() < 25 & rainflag == 0, 0, eventflag),
         eventid = rep(seq(1,length(rle(eventflag)$lengths)), rle(eventflag)$lengths))

step2 <- step1 %>% 
  filter(eventflag == 1) %>% 
  group_by(eventid) %>% 
  summarize(
    temp_tot = sum(rain),
    eventStart = first(DateTime),
    eventEnd = last(DateTime)) %>% 
  mutate(dur = as.numeric(difftime(eventEnd, eventStart, units = 'hours')) + 0.25)
#> `summarise()` ungrouping output (override with `.groups` argument)
step2 <- step2 %>%
  mutate(eventid = seq_len(nrow(step2)))

dfSmall <- df1[1:100, ]
dfNew <- fuzzy_left_join(dfSmall, step2,
                         by = c("DateTime" = "eventStart",
                                "DateTime" = "eventEnd"),
                         match_fun = list(`>=`, `<=`))
head(dfNew)
#> # A tibble: 6 x 7
#>   DateTime             rain eventid temp_tot eventStart         
#>   <dttm>              <dbl>   <int>    <dbl> <dttm>             
#> 1 1984-01-10 01:30:00 0          NA     NA   NA                 
#> 2 1984-01-10 01:45:00 0.254       1     14.5 1984-01-10 01:45:00
#> 3 1984-01-10 02:00:00 0.254       1     14.5 1984-01-10 01:45:00
#> 4 1984-01-10 02:15:00 0.254       1     14.5 1984-01-10 01:45:00
#> 5 1984-01-10 02:30:00 0.254       1     14.5 1984-01-10 01:45:00
#> 6 1984-01-10 02:45:00 0.762       1     14.5 1984-01-10 01:45:00
#> # ... with 2 more variables: eventEnd <dttm>, dur <dbl>

Created on 2021-03-16 by the reprex package (v0.3.0)
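
As an aside: with dplyr >= 1.1.0 (newer than the versions used in this reprex), the same interval match can be written as a built-in non-equi join via join_by(). That form generally needs far less memory than fuzzy_left_join, because it avoids evaluating the match condition across all pairs of rows:

left_join(dfSmall, step2,
          by = join_by(DateTime >= eventStart, DateTime <= eventEnd))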

Here is a cleaner view of the relevant columns of dfNew:

> head(dfNew[,c(1,2,3,4,7)])
# A tibble: 6 x 5
  DateTime             rain eventid temp_tot   dur
  <dttm>              <dbl>   <int>    <dbl> <dbl>
1 1984-01-10 01:30:00 0          NA     NA   NA   
2 1984-01-10 01:45:00 0.254       1     14.5  5.25
3 1984-01-10 02:00:00 0.254       1     14.5  5.25
4 1984-01-10 02:15:00 0.254       1     14.5  5.25
5 1984-01-10 02:30:00 0.254       1     14.5  5.25
6 1984-01-10 02:45:00 0.762       1     14.5  5.25
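
If, as in your expected outcome above, a row that precedes the first eventStart should inherit the values of the event that follows it rather than stay NA, one option is to fill upwards after the join (a sketch using tidyr; note that this would also fill any NA rows sitting between events, so it may need a guard if that is not wanted):

library(tidyr)
dfNew <- dfNew %>%
  fill(eventid, temp_tot, dur, .direction = "up")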

Thanks, @FJCC, for your reply.
When I run the left join on the whole dataset, however, it fails with a memory error, which seems odd:
[Error: cannot allocate vector of size 13.0 Gb]
Could you check it with the whole dataset?

I see the same error. I tried taking 1/75 of df1 (13981 rows) and doing the fuzzy_join between that and step2; that took about 10 minutes to run. If I instead use the first 13981 rows of df1 and filter step2 to contain only eventStart dates within the range of DateTimes in that reduced df1, the join runs in about 10 seconds. That suggests a solution: run the join on subsets of the data and then recombine the results afterwards.
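
A sketch of that idea, assuming the df1 and step2 objects from the reprex above (chunk_size is an arbitrary choice here): split df1 into chunks, restrict step2 to the events that can overlap each chunk, join each piece, and bind the results back together:

library(dplyr)
library(purrr)
library(fuzzyjoin)

chunk_size <- 14000
chunks <- split(df1, ceiling(seq_len(nrow(df1)) / chunk_size))

dfNew <- map_dfr(chunks, function(chunk) {
  # Keep only events whose interval can overlap this chunk's time range,
  # so the pairwise comparison that fuzzy_left_join performs stays small
  step2_sub <- filter(step2,
                      eventEnd >= min(chunk$DateTime),
                      eventStart <= max(chunk$DateTime))
  if (nrow(step2_sub) == 0) return(chunk)  # no events touch this chunk
  fuzzy_left_join(chunk, step2_sub,
                  by = c("DateTime" = "eventStart",
                         "DateTime" = "eventEnd"),
                  match_fun = list(`>=`, `<=`))
})

bind_rows (which map_dfr uses under the hood) fills the event columns with NA for chunks that matched no events, so the recombined result should line up with what the single big join would have produced.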
