# Goal: count how many times a given bigram (e.g. "information security")
# occurs in EACH PDF document separately, rather than one total across the
# whole corpus. The fix: keep a per-document id column through tokenization
# and group the final count by it.
library(pdftools)
library(tm)
library(dplyr)
library(tidytext)
library(tidyr)
# Find every PDF in the working directory and read each one, collapsing
# its pages into a single string per document.
# Why: pdf_text() returns one character element per PAGE; passing the raw
# list of page vectors to VectorSource() deparses multi-page documents
# into literal 'c("page 1", "page 2")' strings, corrupting the text.
files <- list.files(pattern = "pdf$")
files
all <- vapply(
  files,
  function(f) paste(pdf_text(f), collapse = " "),
  character(1)
)

# Build the corpus (one element per PDF file) and normalize the text.
document <- Corpus(VectorSource(all))
document <- tm_map(document, content_transformer(tolower))
document <- tm_map(document, removeNumbers)
# NOTE(review): stopwords are removed BEFORE punctuation, so contractions
# like "don't" are not matched by removeWords; swap the order if needed.
document <- tm_map(document, removeWords, stopwords("english"))
document <- tm_map(document, removePunctuation)
# One row per document, keeping the source file name as an id column.
# Why: the original data frame had only `text`, so document identity was
# lost and any count could only be a corpus-wide total.
PDFDataframe <- data.frame(
  document = files,
  text = sapply(document, as.character),
  stringsAsFactors = FALSE
)

# Tokenize into bigrams; unnest_tokens() carries the `document` column
# through, so every bigram row remembers which file it came from.
New_bigrams <- PDFDataframe %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

# Split each bigram so the two words can be tested independently.
bigrams_separated <- New_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Drop bigrams in which either word is a stop word.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# Count occurrences of "information security" PER DOCUMENT.
# The original `count(word2 == "security")` grouped by a logical flag,
# producing corpus-wide TRUE/FALSE totals; here we filter to the exact
# bigram first and then count grouped by the document id.
bigrams_filtered %>%
  filter(word1 == "information", word2 == "security") %>%
  count(document, sort = TRUE)