Hi,
Below is reprex.
Example 1 works as expected, but why does Example 2 fail?
The goal of Example #2 is to use the same model (stored in model
list-columns) repeatedly for varying M
(as defined in CalcProbCoherence()
).
Look forward to insights.
Thanks!
#Example 1
library(tidyverse)
TestFun <- function(.a, .b) {
.a + .b
}
TestFun(1, 10)
#> [1] 11
df <- tibble(a = 1:10)
df %>%
crossing(b = 11:20) %>%
nest(b) %>%
mutate(test = map2(a, data, ~TestFun(.x, .y)))
#> # A tibble: 10 x 3
#> a data test
#> <int> <list> <list>
#> 1 1 <tibble [10 x 1]> <data.frame [10 x 1]>
#> 2 2 <tibble [10 x 1]> <data.frame [10 x 1]>
#> 3 3 <tibble [10 x 1]> <data.frame [10 x 1]>
#> 4 4 <tibble [10 x 1]> <data.frame [10 x 1]>
#> 5 5 <tibble [10 x 1]> <data.frame [10 x 1]>
#> 6 6 <tibble [10 x 1]> <data.frame [10 x 1]>
#> 7 7 <tibble [10 x 1]> <data.frame [10 x 1]>
#> 8 8 <tibble [10 x 1]> <data.frame [10 x 1]>
#> 9 9 <tibble [10 x 1]> <data.frame [10 x 1]>
#> 10 10 <tibble [10 x 1]> <data.frame [10 x 1]>
#Example 2
library(textmineR)
#> Loading required package: Matrix
#>
#> Attaching package: 'Matrix'
#> The following object is masked from 'package:tidyr':
#>
#> expand
# taken from
# https://github.com/TommyJones/textmineR/blob/88451a25f0bebb6d7176a872cf9df5baa95fb8c2/R/evaluation_metrics.R
CalcProbCoherence <- function(phi, dtm, M = 5) {
# phi is a numeric matrix or numeric vector?
if (!is.numeric(phi)) {
stop(
"phi must be a numeric matrix whose rows index topics and columns\n",
" index terms or phi must be a numeric vector whose entries index terms."
)
}
# is dtm a matrix we can work with?
if (!is.matrix(dtm) &
!class(dtm) %in% c("dgCMatrix", "dgTMatrix", "dgeMatrix", "dgRMatrix")) {
stop(
"dtm must be a matrix. This can be a standard R dense matrix or a\n",
" matrix of class dgCMatrix, dgTMatrix, dgRMatrix, or dgeMatrix"
)
}
# is M numeric? If it is not an integer, give a warning.
if (!is.numeric(M) | M < 1) {
stop("M must be an integer in 1:ncol(phi) or 1:length(phi)")
}
if (length(M) != 1) {
warning("M is a vector when scalar is expected. Taking only the first value")
M <- M[ 1 ]
}
if (floor(M) != M) {
warning("M is expected to be an integer. floor(M) is being used.")
M <- floor(M)
}
# dtm has colnames?
if (is.null(colnames(dtm))) {
stop("dtm must have colnames")
}
# Names of phi in colnames(dtm)
if (!is.matrix(phi)) {
if (sum(names(phi)[ 1:M ] %in% colnames(dtm)) != length(1:M)) {
stop("names(phi)[ 1:M ] are not in colnames(dtm)")
}
} else if (sum(colnames(phi)[ 1:M ] %in% colnames(dtm)) != length(1:M)) {
stop("colnames(phi)[ 1:M ] are not in colnames(dtm)")
}
# Declare a function to get probabilistic coherence on one topic
pcoh <- function(topic, dtm, M) {
terms <- names(topic)[order(topic, decreasing = TRUE)][1:M]
dtm.t <- dtm[, terms]
dtm.t[dtm.t > 0] <- 1
count.mat <- Matrix::t(dtm.t) %*% dtm.t
num.docs <- nrow(dtm)
p.mat <- count.mat / num.docs
# result <- sapply(1:(ncol(count.mat) - 1), function(x) {
# mean(p.mat[x, (x + 1):ncol(p.mat)]/p.mat[x, x] - Matrix::diag(p.mat)[(x +
# 1):ncol(p.mat)], na.rm = TRUE)
# })
# mean(result, na.rm = TRUE)
result <- sapply(1:(ncol(count.mat) - 1), function(x) {
p.mat[x, (x + 1):ncol(p.mat)] / p.mat[x, x] -
Matrix::diag(p.mat)[(x + 1):ncol(p.mat)]
})
mean(unlist(result), na.rm = TRUE)
}
# if phi is a single topic vector get that one coherence
if (!is.matrix(phi)) {
return(pcoh(topic = phi, dtm = dtm, M = M))
}
# Otherwise, do it for all the topics
apply(phi, 1, function(x) {
pcoh(topic = x, dtm = dtm, M = M)
})
}
set.seed(12345)
doc_topic_prior <- 0.1
topic_word_prior <- 0.01
tic()
#> Error in tic(): could not find function "tic"
lda_model_tR <- FitLdaModel(
dtm = nih_sample_dtm,
k = 4,
iterations = 100, # I usually recommend at least 500 iterations or more
burnin = 20,
alpha = doc_topic_prior,
beta = topic_word_prior,
optimize_alpha = T,
calc_likelihood = T,
calc_coherence = T,
calc_r2 = T,
cpus = 2
)
toc()
#> Error in toc(): could not find function "toc"
tibble(model_num = 1) %>%
crossing(b = 10:20) %>%
nest(b) %>%
mutate(model = list(lda_model_tR)) %>%
mutate(test = map2(model, data, ~CalcProbCoherence(.x$phi, dtm = nih_sample_dtm, M = .y)))
#> Warning in if (!is.numeric(M) | M < 1) {: the condition has length > 1 and
#> only the first element will be used
#> Error in mutate_impl(.data, dots): Evaluation error: M must be an integer in 1:ncol(phi) or 1:length(phi).
Created on 2019-01-03 by the reprex package (v0.2.1)