I am trying to tokenize several documents in RStudio, but because the documents are very large, the result gets messy when every tokenized word ends up in its own row. Is there a way to keep all the tokenized words of a document in a single row? I first built a corpus and then converted it to the tidy text format.
My code:
# Load every PDF found in the working directory.
# The pattern is anchored on the extension so that only "*.pdf" files match
# (the unanchored "pdf$" would also match a file called e.g. "notapdf").
all_files <- list.files(pattern = "\\.pdf$")
# `pdf_text()` returns one string per page. Each PDF is kept as its own list
# element, named after its file. `lapply()` is used instead of `sapply()`
# because `sapply()` simplifies the result to a character matrix whenever all
# PDFs have the same number of pages, and `VectorSource()` would then treat
# every single page as a separate document.
all_pdf <- setNames(lapply(all_files, pdf_text), all_files)
# Build the corpus (one document per element of `all_pdf`) and clean it in a
# single pipeline. The steps run top to bottom: collapse whitespace,
# lower-case, drop numbers, drop Dutch stop words, drop punctuation.
documents <- VCorpus(VectorSource(all_pdf)) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords("nl")) %>%
  tm_map(removePunctuation)
# From corpus to tidy format: one row per document.
tidyformat <- documents %>% tidy()
# Keep only the document id and its text. The columns are selected by name
# rather than by position, so the result stays correct even if the number or
# order of the metadata columns differs.
tidyidtext <- tidyformat[c("id", "text")]
# Municipality name of each document.
# The names are attached by position, so they must be listed in the same order
# as the documents in the corpus (`list.files()` returns file names sorted
# alphabetically).
# NOTE(review): confirm that the PDF file names sort in exactly this order.
# NOTE(review): "Achtkarpselen" is probably a typo for "Achtkarspelen"; it is
# left unchanged here in case other code matches on this exact spelling.
municipality <- c(
  "Achtkarpselen", "Amsterdam", "Apeldoorn", "Dalfsen", "Delft",
  "Diemen", "Eemnes", "Goeree-Overflakkee", "Gorinchem", "Gouda",
  "Hattem", "Heiloo", "Hilvarenbeek", "Hoeksche Waard", "Leiderdorp",
  "Meppel", "Noordoostpolder", "Oirschot", "Oldambt", "Rheden", "Roosendaal",
  "Scherpenzeel", "Sliedrecht", "Tiel", "Twenterand", "Vaals", "Venray",
  "Vlissingen", "Winterswijk", "Zeist"
)
# Fail early with a clear message when the number of names does not match the
# number of documents, instead of relying on the assignment below to complain.
if (length(municipality) != nrow(tidyidtext)) {
  stop(
    "Expected one municipality name per document: got ",
    length(municipality), " names for ", nrow(tidyidtext), " documents.",
    call. = FALSE
  )
}
# New column with the municipality name.
tidyidtext$municipality <- municipality
# Reorder to id, municipality, text.
tidyidmuntext <- tidyidtext[c("id", "municipality", "text")]
# Drop the id column: municipality and text remain.
tidymuntext <- tidyidmuntext[c("municipality", "text")]
# Tokenize the tidy way: `unnest_tokens()` splits the `text` column into
# tokens and stores them in a new `word` column, one token per row.
# (The stray trailing backtick after the call was removed; it made the script
# unparseable.)
# Hint: to get back to one row per municipality afterwards, the tokens can be
# collected into a list-column, e.g. `tidyr::nest(tidymunword, words = word)`.
tidymunword <- tidymuntext %>%
  unnest_tokens(word, text)
Thank you!