Faster merging of date/time data

I have a data set with 733k rows and am merging data as per:
irus_data$date_time <- as.POSIXct(paste(irus_data$Date, irus_data$Time))

But this currently takes about a minute or so (have to work from home and my computer is not high performance!)

Any suggestions on speeding it up?

as.POSIXct() tends to be slow. To provide advice it would help if you showed some sample data.

Also consider whether you need to convert it or whether you can keep it as a character.

I'm sorry to be so useless but I am v new to this - how do I get a sample of my data from RStudio into here?

Copy the output from this:

dput(head(irus_data))

Like this?

structure(list(Date = structure(c(18281, 18281, 18281, 18281,
18281, 18281), class = "Date"), Time = structure(c(0, 3600, 7200,
10800, 14400, 18000), class = c("hms", "difftime"), units = "secs"),
Site = c("WARNEFORD", "WARNEFORD", "WARNEFORD", "WARNEFORD",
"WARNEFORD", "WARNEFORD"), Address = c(111, 111, 111, 111,
111, 111), Name = c("A01-A", "A01-A", "A01-A", "A01-A", "A01-A",
"A01-A"), Type = c("Room Heater", "Room Heater", "Room Heater",
"Room Heater", "Room Heater", "Room Heater"), Setpoint = c(21,
21, 20.1, 19, 19, 19), Temp Air = c(21, 21, 20.7, 19.5,
19, 19.2), L2On = c(0.6, 0.5, 0.3, 0, 0, 0.1), L2kWh (Est) = c(0.93,
0.71, 0.39, 0, 0, 0.18), site_room = c("WARNEFORDA01-A",
"WARNEFORDA01-A", "WARNEFORDA01-A", "WARNEFORDA01-A", "WARNEFORDA01-A",
"WARNEFORDA01-A"), date_time = structure(c(1579478400, 1579482000,
1579485600, 1579489200, 1579492800, 1579496400), tzone = "", class = c("POSIXct",
"POSIXt")), avg_setpoint_before = c(19.6544642857143, 19.6544642857143,
19.6544642857143, 19.6544642857143, 19.6544642857143, 19.6544642857143
), avg_setpoint_after = c(19.8759615384615, 19.8759615384615,
19.8759615384615, 19.8759615384615, 19.8759615384615, 19.8759615384615
), sub19_before = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), sub19_after = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), avg_sp_before_excdflt = c(20.2888888888889,
20.2888888888889, 20.2888888888889, 20.2888888888889, 20.2888888888889,
20.2888888888889), avg_sp_after_excdflt = c(20.5075, 20.5075,
20.5075, 20.5075, 20.5075, 20.5075), thermo_change = c(0.221497252747252,
0.221497252747252, 0.221497252747252, 0.221497252747252,
0.221497252747252, 0.221497252747252), sub19_change = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), thermo_change_excdflt = c(0.218611111111112,
0.218611111111112, 0.218611111111112, 0.218611111111112,
0.218611111111112, 0.218611111111112)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))

The above data included spaces in variable names and lacked appropriate backticks to be functional. Here is cleaned up for anyone that wants to use it

structure(list(Date = structure(c(18281, 18281, 18281, 18281, 18281, 18281), class = "Date"), 
 Time = structure(c(0, 3600, 7200,  10800, 14400, 18000), class = c("hms", "difftime"), units = "secs"),
 Site = c("WARNEFORD", "WARNEFORD", "WARNEFORD", "WARNEFORD", "WARNEFORD", "WARNEFORD"),
 Address = c(111, 111, 111, 111,111, 111),
 Name = c("A01-A", "A01-A", "A01-A", "A01-A", "A01-A","A01-A"), 
 Type = c("Room Heater", "Room Heater", "Room Heater","Room Heater", "Room Heater", "Room Heater"),
 Setpoint = c(21, 21, 20.1, 19, 19, 19), 
 `Temp Air` = c(21, 21, 20.7, 19.5, 19, 19.2),
 L2On = c(0.6, 0.5, 0.3, 0, 0, 0.1),
 `L2kWh (Est)` = c(0.93, 0.71, 0.39, 0, 0, 0.18),
 site_room = c("WARNEFORDA01-A", "WARNEFORDA01-A", "WARNEFORDA01-A", "WARNEFORDA01-A", "WARNEFORDA01-A", "WARNEFORDA01-A"),
 date_time = structure(c(1579478400, 1579482000, 1579485600, 1579489200, 1579492800, 1579496400), tzone = "", class = c("POSIXct", "POSIXt")),
 avg_setpoint_before = c(19.6544642857143, 19.6544642857143, 19.6544642857143, 19.6544642857143, 19.6544642857143, 19.6544642857143 ),
 avg_setpoint_after = c(19.8759615384615, 19.8759615384615, 19.8759615384615, 19.8759615384615, 19.8759615384615, 19.8759615384615 ),
 sub19_before = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
 sub19_after = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
 avg_sp_before_excdflt = c(20.2888888888889, 20.2888888888889, 20.2888888888889, 20.2888888888889, 20.2888888888889, 20.2888888888889),
 avg_sp_after_excdflt = c(20.5075, 20.5075, 20.5075, 20.5075, 20.5075, 20.5075),
 thermo_change = c(0.221497252747252, 0.221497252747252, 0.221497252747252, 0.221497252747252, 0.221497252747252, 0.221497252747252),
 sub19_change = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
 thermo_change_excdflt = c(0.218611111111112, 0.218611111111112, 0.218611111111112, 0.218611111111112, 0.218611111111112, 0.218611111111112)), row.names = c(NA,
 -6L), class = c("tbl_df", "tbl", "data.frame"))
1 Like

I tried it a few ways, the as.Datetime and + method worked best.

structure(list(Date = structure(c(
  18281, 18281, 18281, 18281,
  18281, 18281
), class = "Date"), Time = structure(c(
  0, 3600, 7200,
  10800, 14400, 18000
), class = c("hms", "difftime"), units = "secs")), row.names = c(
  NA,
  -6L
), class = c("tbl_df", "tbl", "data.frame")) -> irus_data


library(tidyverse)
library(lubridate)

irus_data1 <- irus_data 
irus_data2 <- irus_data 
irus_data3 <- irus_data 
 
irus_data1$date_time <- as.POSIXct(paste(irus_data1$Date, irus_data1$Time))

irus_data2$date_time <- lubridate::make_datetime(
  year = year(irus_data2$Date),
  month = month(irus_data2$Date),
  day = day(irus_data2$Date),
  hour = hour(irus_data2$Time),
  min = min(irus_data2$Time),
  sec = second(irus_data2$Time)
)

all.equal(irus_data1, irus_data2)
# TRUE
irus_data3$date_time <- as_datetime(irus_data3$Date) + irus_data3$Time

all.equal(irus_data1, irus_data3)
# TRUE
library(microbenchmark)
microbenchmark(
  paster = irus_data1$date_time <- as.POSIXct(paste(irus_data1$Date, irus_data1$Time)),
  lubri_make = irus_data2$date_time <- lubridate::make_datetime(
    year = year(irus_data2$Date),
    month = month(irus_data2$Date),
    day = day(irus_data2$Date),
    hour = hour(irus_data2$Time),
    min = min(irus_data2$Time),
    sec = second(irus_data2$Time)
  ),
  lubri_dir = irus_data3$date_time <- as_datetime(irus_data3$Date) + irus_data3$Time,
  times = 1000L, unit = "us"
)

Unit: microseconds
expr             min     lq     mean median     uq    max neval cld
paster          545.0 558.50 662.9310 582.05 669.50 5081.7  1000   c
lubri_make      364.6 384.85 455.8287 399.50 455.25 6251.2  1000  b 
lubri_dir       140.7 152.40 176.8740 159.50 178.80  490.1  1000 a
1 Like

You can also consider this package for some use cases:
https://cran.r-project.org/web/packages/fasttime/index.html

1 Like

the fastPOSIXct function knocked it down from over a minute to about 15seconds, which is great and less code so I'll use that, thank you so much !

Yes, I've had dramatic improvements in the past, too.

The reason I didn't propose this initially is because it needs the dates and times to be formatted properly first, i.e. ymd_hms format, which is frequently not the case when the data is in character strings. Luckily your data was already in date and difftime formats.

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.