In my experience data.table is usually faster than dplyr, and in many cases much faster. Even in the first example data.table was 2 to 10 times faster, depending on which variation of dplyr you compare it to. And if it isn't obvious in advance which variations of dplyr are slow, you can't be sure you aren't using a slow one in your own work (it would be a pain to try every dplyr syntax for each problem and then settle on the least slow variation). Another source of speed variation in dplyr is grouped filtering.
Here is an example where data.table is routinely 10x faster over a wide range of problem sizes: http://www.win-vector.com/blog/2018/06/rqdatatable-rquery-powered-by-data-table/
And the new rqdatatable package lets you use a piped, Codd-style syntax if that is what you are used to.
suppressPackageStartupMessages(library("tidyverse"))
library("microbenchmark")
library("data.table")
#>
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:dplyr':
#>
#> between, first, last
#> The following object is masked from 'package:purrr':
#>
#> transpose
library("rqdatatable")
#> Loading required package: rquery
# Build the benchmark data set: one 5500-row tibble for every
# (day, time, mbr) combination -- 10 * 12 * 50 = 6000 pieces -- then stack
# them into a single tibble with bind_rows().
#
# The list is allocated at its final length up front, so the loop fills
# existing slots instead of growing the list one element at a time.
#
# NOTE: no seed is set before rnorm(), so the `fcst` values differ from run
# to run.
counter <- 0
df <- vector("list", 10 * 12 * 50)
for (i in 1:10) {
  for (j in 1:12) {
    for (k in 1:50) {
      # `counter` is the running position of this piece in the list.
      counter <- counter + 1
      df[[counter]] <- tibble(
        name = "name",
        id = seq(1, 5500),
        day = i,
        time = j,
        mbr = k,
        fcst = rnorm(5500)
      )
    }
  }
}
df <- bind_rows(df)
# dplyr aggregation with columns referenced by bare name:
# mean of `fcst` within each (name, id, day, time) group.
no_pronoun <- function(data) {
  by_key <- group_by(data, name, id, day, time)
  summarise(by_key, fcst = mean(fcst))
}
# Same aggregation as no_pronoun(), but every column is reached through the
# `.data` pronoun. Kept as-is on purpose: this is the variant whose cost the
# benchmark is meant to expose.
with_pronoun <- function(data) {
  by_key <- group_by(data, .data$name, .data$id, .data$day, .data$time)
  summarise(by_key, fcst = mean(.data$fcst))
}
# data.table aggregation: convert the input with as.data.table(), then take
# the mean of `fcst` per (name, id, day, time) group. The conversion happens
# inside the function, so it is part of what gets timed.
with_data.table <- function(data) {
  as.data.table(data)[
    ,
    list(fcst = mean(fcst)),
    by = c("name", "id", "day", "time")
  ]
}
# rquery/rqdatatable aggregation: describe `data` as a local table, build a
# grouped projection computing mean(fcst) per (name, id, day, time), then
# execute that operator tree with ex_data_table().
with_rqdatatable <- function(data) {
  table_description <- local_td(data)
  grouped_mean <- project_nse(
    table_description,
    groupby = c("name", "id", "day", "time"),
    fcst = mean(fcst)
  )
  ex_data_table(grouped_mean)
}
# Time the four implementations on the same `df`, with `times = 5` runs
# requested per expression. The expressions are left unnamed so the `expr`
# column of the printed results shows each call verbatim.
microbenchmark(no_pronoun(df),
with_pronoun(df),
with_data.table(df),
with_rqdatatable(df),
times = 5)
#> Unit: seconds
#> expr min lq mean median uq
#> no_pronoun(df) 6.948234 7.856568 8.616359 8.514090 9.569410
#> with_pronoun(df) 39.785763 40.689116 41.957031 41.584065 41.850011
#> with_data.table(df) 3.442202 3.703897 4.000662 3.790576 4.043888
#> with_rqdatatable(df) 3.045504 3.314732 3.997600 3.982759 4.755187
#> max neval
#> 10.193493 5
#> 45.876202 5
#> 5.022747 5
#> 4.889819 5