It might be that your analysis is too ambitious to execute in a 'reasonable' time given the compute you have access to (could you pay to run the analysis on a cloud service?).
Is the ARF of every token in your corpus of equal value to you, or are you mainly interested in the lowest or highest scores? If the latter, you can perhaps get further with a hierarchical approach where you bootstrap-sample smaller subsets of the corpus before narrowing your focus for the full analysis, as sketched below.
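A minimal sketch of that idea, assuming you have some per-word ARF routine to plug in; `arf_of_word` here is a hypothetical placeholder and not part of the code further down:

library(purrr)

# score a random subset of unique words cheaply first, then keep only the
# extremes for the expensive full analysis
subset_then_focus <- function(words, arf_of_word, subset_size = 100, keep = 20) {
  uw <- unique(words)
  candidates <- sample(uw, min(subset_size, length(uw)))
  rough <- map_dbl(candidates, arf_of_word) # cheap first pass
  names(rough) <- candidates
  # highest-scoring words, to be refined with the full computation
  sort(rough, decreasing = TRUE)[seq_len(min(keep, length(rough)))]
}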
Anyway, here is a parallel approach with logging and batching; it was an interesting problem to work on.
library(purrr) # used for iteration
library(tibble)
library(dplyr)
library(tictoc) # only used for benchmarking the time taken for the code to run
# devtools::install_github("gadenbuie/lorem")
# use lorem ipsum words for example text
set.seed(42)
ordered_words_of_text <- lorem::ipsum_words(3000, collapse = FALSE)
num_words <- length(ordered_words_of_text)
unique_words <- length(unique(ordered_words_of_text))
# count how often each unique word occurs, in order of first appearance
freq_of_unique_words <- map_int(
  unique(ordered_words_of_text),
  ~ length(ordered_words_of_text[ordered_words_of_text == .])
)
names(freq_of_unique_words) <- unique(ordered_words_of_text)
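For what it's worth, base R's table() computes the same frequencies in one call; a quick sanity check (my addition, the rest of the script does not depend on it):

stopifnot(all(
  freq_of_unique_words ==
    table(ordered_words_of_text)[names(freq_of_unique_words)]
))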
# positions at which each unique word occurs
word_positions_list <- map(
  unique(ordered_words_of_text),
  ~ which(ordered_words_of_text == .)
)
# build a logical occurrence vector: TRUE at the positions where a word occurs
new_vec <- function(vlength, vtrue_pos_list) {
  nv <- vector(length = vlength) # logical vector, all FALSE by default
  nv[vtrue_pos_list] <- TRUE
  nv
}
# one full-length logical occurrence vector per unique word
realised_word_pos_list <- map(
  word_positions_list,
  ~ new_vec(num_words, .)
)
# average reduced frequency: shift the segment boundaries one position at a
# time, count the segments in which the word occurs at least once (the max
# of a logical segment), and average the counts over all offsets
avg_red_freq <- function(wordvec, num_words, split_length, split_into) {
  map_dbl(
    0:(split_length - 1),
    ~ aggregate(wordvec, by = list((ceiling((1:num_words + .) / split_length) %% split_into) + 1), max) %>%
      colSums() %>%
      tail(n = 1)
  ) %>% sum() / split_length
}
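Purely as a usage illustration (my addition), here is the ARF of the first unique word in the example text, using the same frequency-derived segment parameters the batched runs use further down:

first_word <- unique(ordered_words_of_text)[1]
avg_red_freq(
  wordvec = realised_word_pos_list[[1]],
  num_words = num_words,
  split_length = floor(num_words / freq_of_unique_words[[first_word]]),
  split_into = freq_of_unique_words[[first_word]]
)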
# make an empty tibble to contain results;
# every batch run will accumulate into this
final_store <- tibble(
  word = character(),
  arf = double()
)
library(doParallel)
library(foreach)
writeLines(c(""), "log.txt") # clear out any old log
run_as_parallel_with_batch <- function(final_store,
                                       batch_label,
                                       batch_start,
                                       batch_end,
                                       realised_word_pos_list,
                                       ordered_words_of_text,
                                       freq_of_unique_words,
                                       num_words) {
  batch_of_word_pos_list <- realised_word_pos_list[batch_start:batch_end]
  batch_of_unique_words <- unique(ordered_words_of_text)[batch_start:batch_end]
  batch_freq_of_unique_words <- freq_of_unique_words[batch_start:batch_end]

  ## do it in parallel
  tictoc::tic(msg = "start 4-core parallel algorithm")
  sink("log.txt", append = TRUE)
  cat("\n START batch ", batch_label, " at ", format(Sys.time(), "%X %Y-%m-%d"))
  sink()
  cl <- makeCluster(4)
  registerDoParallel(cl)

  # re-defined inside the function so the foreach workers can see it
  avg_red_freq <- function(wordvec, num_words, split_length, split_into) {
    map_dbl(
      0:(split_length - 1),
      ~ aggregate(wordvec, by = list((ceiling((1:num_words + .) / split_length) %% split_into) + 1), max) %>%
        colSums() %>%
        tail(n = 1)
    ) %>% sum() / split_length
  }

  par_res <- foreach(
    i = seq_along(batch_of_word_pos_list),
    .packages = "purrr"
  ) %dopar% {
    outstring <- paste0(
      "\nprocessing ", i, " of ", length(batch_of_word_pos_list),
      " from batch:", batch_label, " ", batch_start, "-", batch_end, " ", batch_of_unique_words[[i]]
    )
    sink("log.txt", append = TRUE)
    cat(outstring)
    sink()
    avg_red_freq(
      wordvec = batch_of_word_pos_list[[i]],
      num_words = num_words,
      split_length = floor(num_words / batch_freq_of_unique_words[[i]]),
      split_into = batch_freq_of_unique_words[[i]]
    )
  }
  stopCluster(cl)
  tictoc::toc()

  sink("log.txt", append = TRUE)
  cat("\n END batch ", batch_label, " at ", format(Sys.time(), "%X %Y-%m-%d"))
  cat("\n")
  sink()
  txt_of_log <- readLines(con = "log.txt")
  walk(txt_of_log, print)

  res2 <- tibble(
    word = batch_of_unique_words,
    arf = par_res %>% unlist()
  ) %>% arrange(desc(arf))

  # union() de-duplicates rows, so re-running a batch will not double-count;
  # return the accumulated store
  union(final_store, res2)
}
## EXAMPLE OF RUNNING AN ARBITRARY BATCH, i.e. you could repeatedly run this manually with different batch start and end parameters
# final_store <- run_as_parallel_with_batch(
#   final_store = final_store,
#   batch_label = 2,
#   batch_start = 1,
#   batch_end = 20,
#   realised_word_pos_list = realised_word_pos_list,
#   ordered_words_of_text = ordered_words_of_text,
#   freq_of_unique_words = freq_of_unique_words,
#   num_words = num_words
# )
# final_store
# EXAMPLE OF SETTING UP purrr::pwalk TO COVER A SEQUENCE OF BATCHES
# my example data of 176 unique words divides into 8 batches of length 22
num_tot_batches <- 8
batch_length <- 22
batch_labels <- 1:num_tot_batches
batch_starts <- (batch_labels - 1) * batch_length + 1
batch_ends <- (batch_labels - 1) * batch_length + batch_length
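One thing worth asserting before launching the runs (my addition): the batch grid should cover the unique words exactly, because a batch_end beyond the number of unique words would index past the end of the lists.

n_unique <- length(unique(ordered_words_of_text))
stopifnot(max(batch_ends) == n_unique) # 8 * 22 == 176 in this example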
## quick demo that the batch parameters line up as expected
pwalk(
  .l = list(
    batch_labels,
    batch_starts,
    batch_ends
  ),
  .f = ~ cat("\n", ..1, " ", ..2, " ", ..3)
)
# run all batches; `<<-` writes the accumulated results back into the
# global final_store after each batch
pwalk(
  .l = list(
    batch_labels,
    batch_starts,
    batch_ends
  ),
  .f = ~ {
    final_store <<- run_as_parallel_with_batch(
      final_store = final_store,
      batch_label = ..1,
      batch_start = ..2,
      batch_end = ..3,
      realised_word_pos_list = realised_word_pos_list,
      ordered_words_of_text = ordered_words_of_text,
      freq_of_unique_words = freq_of_unique_words,
      num_words = num_words
    )
  }
)
final_store %>% arrange(desc(arf))
# A tibble: 176 x 2
   word        arf
   <chr>     <dbl>
 1 auctor     18.5
 2 fusce      18.3
 3 ultrices   18
 4 ut         16.7
 5 accumsan   16.6
 6 hendrerit  16.0
 7 tincidunt  16.0
 8 magna      15.9
 9 dignissim  15.7
10 vehicula   15.5
# ... with 166 more rows