Hello everyone, thank you for all your replies.
rray is the winner! It works really well. I am very happy with performance. It's within a factor of 2 of Julia after replacing naive "manually expand matrices and multiply" approach.
Here is a MWE showing an example of how it's used
library(rray)
mwe = function() {
n = 10
k = 5
e = array(1:100, dim = c(n, n, 1, 1))
p = array(1:50, dim = c(1, 1, n, k))
# need output of dimension n n n k
full_dims = c(n, n, n, k)
# Current approach
e_full = array(NA, dim = full_dims)
p_full = array(NA, dim = full_dims)
for(ni in 1:n){
for(ki in 1:k) {
e_full[ , , ni, ki] = e
}
}
for(ni in 1:n){
for(nj in 1:n) {
p_full[ni, nj, , ] = p
}
}
correct_out = p_full + e_full
t = sapply(e, function(x) x + p)
# This is likely performant, but gives the wrong answer! Not clear
# how to fil in the array the right way.
fast_out = array(t, dim = c(n, n, n, k))
e_b = rray_broadcast(e, dim = full_dims)
p_b = rray_broadcast(p, dim = full_dims)
rray_out = e_b + p_b
}