I am using the following packages:
library(readxl)
library(readr)
library(tensorflow)
library(keras)
library(text2vec)
My data set are some tweets I extracted via the API and cleaned with the tm package in a previous step.
The code looks the following:
tokens = word_tokenizer(model_data$clean_text)
v = create_vocabulary(itoken(tokens))
v = prune_vocabulary(v, term_count_min = 5, doc_proportion_max = 0.5)
it = itoken(tokens)
vectorizer = vocab_vectorizer(v)
word_index = tokens$word_index
dtm = create_dtm(it, vectorizer)
tcm = create_tcm(it, vectorizer, skip_grams_window = 5)
glove_model = GloVe$new(rank = 50, x_max = 10)
wv_main = glove_model$fit_transform(tcm, n_iter = 5)
word_vectors <- wv_main + t(glove_model$components)
rwmd_model = RelaxedWordMoversDistance$new(dtm, word_vectors)
rwms = rwmd_model$sim2(dtm[1:10, ])
head(sort(rwms[1, ], decreasing = T))
training_id <- sample.int(nrow(model_data), size=nrow(model_data)*0.8)
training <- model_data[training_id,]
testing <- model_data[-training_id,]
embeddings_index <- new.env(hash = TRUE, parent = emptyenv())
embedding_dim <- 100
embedding_matrix <- array(0, c(num_words, embedding_dim))
for (word in names(word_index)) {
index <- word_index[[word]]
if (index < max_words) {
embedding_vector <- embeddings_index[[word]]
if (!is.null(embedding_vector))
# Words not found in the embedding index will be all zeros.
embedding_matrix[index+1,] <- embedding_vector
}
}
model <- keras_model_sequential() %>%
layer_embedding(input_dim = num_words, output_dim = embedding_dim,
input_length = max_length) %>%
layer_flatten() %>%
layer_dense(units = 32, activation = "relu") %>%
layer_dense(units = 1, activation = "sigmoid")
get_layer(model, index = 1) %>%
set_weights(list(embedding_matrix)) %>%
freeze_weights()
model %>% compile(
optimizer = 'adam',
loss = 'binary_crossentropy',
metrics = list('accuracy')
)
history <- model %>% fit(
training$clean_text,
training$hate,
epochs = 100,
batch_size = 32,
validation_split = 0.2,
verbose = 2
)
results <- model %>% evaluate(testing$clean_text, testing$hate, verbose = 0)
results
Everything works fine except for the last two steps.