# Fix the RNG state so the simulated values are reproducible.
set.seed(1982)

# Extents of the three array dimensions.
N <- 100
M <- 5
K <- 3

# Fill an N x M x K array with random normal draws from rnorm().
example_3d_array <- array(data = rnorm(n = N * M * K), dim = c(N, M, K))
I want to create a data frame with 4 columns (N, M, K, Value) and 1500 rows (N × M × K = 100 × 5 × 3). I could do this with 3 nested for-loops, but it would be slow. Is there a better way of doing this?
This is how I would do it with Rcpp; I am not sure whether there is a better way:
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]
using namespace Rcpp;
// Flatten a 3-D cube into a long-format numeric matrix.
//
// Each element of `my_cube` becomes one output row holding its 1-based
// slice, column and row index followed by the element value. Rows are
// emitted with the row index varying fastest, then column, then slice.
//
// my_cube: cube with N rows, M columns and K slices.
// Returns: an (N*M*K) x 4 matrix with columns "K", "M", "N", "val".
// [[Rcpp::export]]
NumericMatrix cube_to_matrix(arma::cube my_cube) {
  // Read the extents directly from the cube's dimension members.
  const int N = static_cast<int>(my_cube.n_rows);
  const int M = static_cast<int>(my_cube.n_cols);
  const int K = static_cast<int>(my_cube.n_slices);

  NumericMatrix A(N * M * K, 4);

  int row = 0;
  for (int k = 0; k < K; k++) {
    for (int m = 0; m < M; m++) {
      for (int n = 0; n < N; n++) {
        // Indices are reported 1-based.
        A(row, 0) = k + 1;
        A(row, 1) = m + 1;
        A(row, 2) = n + 1;
        // Use operator()(row, col, slice). The previous `my_cube[n,n,k]`
        // applied the comma operator and reduced to the linear index
        // `my_cube[k]`, so it never read element (n, m, k).
        A(row, 3) = my_cube(n, m, k);
        row++;
      }
    }
  }

  colnames(A) = CharacterVector::create("K", "M", "N", "val");
  return A;
}
I've had success converting large 3D arrays into data frames with dplyr::as.tbl_cube. The only gotcha is that the dimensions must be named; otherwise, it will fail with an error message.
# Seed the random number generator so the example is reproducible.
set.seed(1982)

# Size of each of the three dimensions.
N <- 100
M <- 5
K <- 3

# Draw N * M * K random normal values and shape them into an N x M x K array.
example_3d_array <- array(data = rnorm(n = N * M * K), dim = c(N, M, K))
library("dplyr")
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Time the whole conversion: naming the dimensions, building the cube and
# flattening it into a tibble.
system.time(expr = {
  # as.tbl_cube() needs named dimensions, so label each axis and its levels.
  # seq_len() replaces 1:n so that a zero-length axis would yield no labels
  # instead of the two bogus labels produced by 1:0.
  dimnames(example_3d_array) <- list(
    N = sprintf("N%d", seq_len(N)),
    M = sprintf("M%d", seq_len(M)),
    K = sprintf("K%d", seq_len(K))
  )
  # NOTE(review): recent dplyr releases moved as.tbl_cube() to the cubelyr
  # package -- confirm which package provides it in your environment.
  example_3d_cube <- as.tbl_cube(example_3d_array)
  # Flatten the cube into a tibble of dimension labels plus the cell value.
  example_2d_df <- as_tibble(example_3d_cube)
})
#> user system elapsed
#> 0.003 0.000 0.003
head(example_2d_df)
#> # A tibble: 6 x 4
#> N M K example_3d_array
#> <chr> <chr> <chr> <dbl>
#> 1 N1 M1 K1 0.685
#> 2 N2 M1 K1 -0.00555
#> 3 N3 M1 K1 -0.778
#> 4 N4 M1 K1 1.88
#> 5 N5 M1 K1 -0.377
#> 6 N6 M1 K1 -0.455