btw
Feel free to copy this code. It's only a loop, but once you have it, the only value you need to change is the name of the Wikipedia page (and, if desired, the colors, the words to remove, etc.).
I honestly think this is one of the coolest functions, combining two not-so-straightforward packages.
# Build and print a word cloud from the <p> paragraphs of an English
# Wikipedia page, then print the word-frequency table.
# Edit the settings at the top of the loop body to change the page, colours,
# shape, frequency threshold and the characters/words to drop.
# NOTE(review): the unqualified str_*(), group_by(), summarise(), arrange(),
# filter() and %>% are presumably stringr/dplyr/magrittr; they must already
# be attached. Fetching the page requires network access.
for (i in 1:1) {
  # ---- settings ----
  page <- "RStudio"
  bg_color <- "white"
  word_color <- c("#75aadb", "#4c4c4c")
  shape <- "star"
  min_freq <- 5 # only words with a count strictly above this are drawn
  remove_pattern <- c("[.?]")
  remove_words <- c(
    "the", "off", "is", "an", "by", "and", "in", "a", "for", "of", "on", "to"
  )
  web_link <- paste0("https://en.wikipedia.org/wiki/", page)

  # ---- download the page and keep the paragraph text ----
  raw_text <- rvest::read_html(web_link) %>%
    rvest::html_nodes("p") %>%
    rvest::html_text()

  # ---- split on spaces, strip unwanted characters, lower-case ----
  tokens <- unlist(str_split(raw_text, pattern = " "))
  clean_text <- str_to_lower(str_remove_all(tokens, pattern = remove_pattern))

  # ---- count occurrences; drop stop words and empty tokens ----
  # Tokens that consisted only of removed characters become "" and would
  # otherwise be counted (and drawn) as a word.
  words_freq <- data.frame(words = clean_text, freq = 1) %>%
    group_by(words) %>%
    summarise(f = sum(freq)) %>%
    arrange(desc(f)) %>%
    filter(!words %in% remove_words, words != "")

  # ---- draw the cloud ----
  frequent <- words_freq[words_freq$f > min_freq, ]
  # replace = TRUE is required: there are normally far more words than
  # colours, and sampling without replacement would raise an error.
  word_col <- sample(word_color, nrow(frequent), replace = TRUE)
  cloud <- wordcloud2::wordcloud2(
    frequent,
    color = word_col, backgroundColor = bg_color, shape = shape
  )

  print(cloud)
  print(words_freq)

  # Clean up only what this block created; rm(list = ls()) would also wipe
  # every unrelated object in the workspace.
  rm(
    i, page, bg_color, word_color, shape, min_freq, remove_pattern,
    remove_words, web_link, raw_text, tokens, clean_text, words_freq,
    frequent, word_col, cloud
  )
}
I just kept modifying the function (loop), and now you can also control the language of the page
and the minimum frequency.
Also, in this example you can see how easy it is to just copy a list of words from Google Translate or something similar (using unlist + str_split, there is no need to worry about quotes).
# Build and print a word cloud from the <p> paragraphs of a Wikipedia page in
# a chosen language edition, then print the table of words that were drawn.
# Edit the settings at the top of the loop body to change the page, language,
# colours, shape, frequency threshold and the characters/words to drop.
# NOTE(review): the unqualified str_*(), group_by(), summarise(), arrange(),
# filter() and %>% are presumably stringr/dplyr/magrittr; they must already
# be attached. Fetching the page requires network access.
for (i in 1:1) {
  # ---- settings ----
  page <- "Porn"
  lang <- "es" # language subdomain used to build the URL
  bg_color <- "white"
  word_color <- c("red", "black", "deeppink")
  shape <- "triangle"
  min_freq <- 0 # only words with a count strictly above this are drawn
  remove_pattern <- c("[.'?,]")
  # A single space-separated string is split into the stop-word vector, so
  # the list can be pasted in without quoting every word.
  remove_words <- unlist(str_split(
    "de la los a lo son por y el que en es un con las del",
    pattern = " "
  ))
  web_link <- paste0("https://", lang, ".wikipedia.org/wiki/", page)

  # ---- download the page and keep the paragraph text ----
  raw_text <- rvest::read_html(web_link) %>%
    rvest::html_nodes("p") %>%
    rvest::html_text()

  # ---- split on spaces, strip unwanted characters, lower-case ----
  tokens <- unlist(str_split(raw_text, pattern = " "))
  clean_text <- str_to_lower(str_remove_all(tokens, pattern = remove_pattern))

  # ---- count occurrences; drop stop words and empty tokens ----
  # Tokens that consisted only of removed characters become "" and would
  # otherwise be counted (and drawn) as a word.
  words_freq <- data.frame(words = clean_text, freq = 1) %>%
    group_by(words) %>%
    summarise(f = sum(freq)) %>%
    arrange(desc(f)) %>%
    filter(!words %in% remove_words, words != "")

  # ---- draw the cloud ----
  frequent <- words_freq[words_freq$f > min_freq, ]
  # One colour is drawn at random per word; TRUE is spelled out because T is
  # an ordinary variable that can be reassigned.
  word_col <- sample(word_color, nrow(frequent), replace = TRUE)
  cloud <- wordcloud2::wordcloud2(
    frequent,
    color = word_col, backgroundColor = bg_color, shape = shape
  )

  print(cloud)
  print(frequent)

  # Clean up only what this block created; rm(list = ls()) would also wipe
  # every unrelated object in the workspace.
  rm(
    i, page, lang, bg_color, word_color, shape, min_freq, remove_pattern,
    remove_words, web_link, raw_text, tokens, clean_text, words_freq,
    frequent, word_col, cloud
  )
}
English Wikipedia, "Porn":
English, minimum frequency of a word is 2
By modifying the lang argument (and editing the words to remove), I printed the Spanish outcome