This is the code that I am running:
library(tidyverse)
library(tidyselect)
set.seed(07242021)
library(tidytext)
#install.packages("quanteda")
#library(quanteda)
library(textclean)
#library(SnowballC)
#install.packages('hunspell')
library(hunspell)
#install.packages("stopwords")
library(stopwords)
#install.packages("widyr")
library(widyr)
Corpus <-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# create the directory if it does not already exist, then read in the data
dir <- getwd() # base directory for the download
dir.create(file.path(dir, "Coursera-SwiftKey"), showWarnings = TRUE) # creates the dir if it does not already exist
setwd(file.path(dir, "Coursera-SwiftKey")) # sets the created dir as the current working dir
temp <- tempfile() # makes a temp file to hold the zipped SwiftKey data
download.file(Corpus, temp) # downloads the zip archive to the temp file
unzip(temp) # extracts the archive into the current working directory
file_names <- list.files("final/en_US") # lists the extracted English-language files
blogData    <- readLines("final/en_US/en_US.blogs.txt",   encoding = "UTF-8")
newsData    <- readLines("final/en_US/en_US.news.txt",    encoding = "UTF-8")
twitterData <- readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE) # the twitter file contains embedded nuls
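One caveat with this dataset: the news file also contains embedded nul/control characters, and on Windows readLines() can stop early at them when the file is opened in text mode. If newsData looks truncated, a binary-mode connection is the usual workaround; a minimal sketch:

con <- file("final/en_US/en_US.news.txt", open = "rb") # binary mode avoids early termination at nul bytes
newsData <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)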
##### Using all of the data is not possible because the allocation size
##### needed is greater than R allows for a vector in subsequent steps,
##### so we take a 10% sample of each source.
blogDatasample    <- sample(blogData,    length(blogData)    * 0.1)
newsDatasample    <- sample(newsData,    length(newsData)    * 0.1)
twitterDatasample <- sample(twitterData, length(twitterData) * 0.1)
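Since the full files only need to be read once, caching the samples to disk lets later sessions skip the expensive readLines() calls (the file names below are my own, not part of the dataset):

writeLines(blogDatasample,    "blogs_sample.txt")    # cache the 10% samples for reruns
writeLines(newsDatasample,    "news_sample.txt")
writeLines(twitterDatasample, "twitter_sample.txt")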
blogData_c    <- tibble(word = blogDatasample)
newsData_c    <- tibble(word = newsDatasample)
twitterData_c <- tibble(word = twitterDatasample)
AllData <- bind_rows(mutate(twitterData_c, source_type = "Twitter"),
                     mutate(blogData_c,    source_type = "Blog"),
                     mutate(newsData_c,    source_type = "News"))
AllData$word <- gsub("[^[:alnum:]]", " ", AllData$word) # replaces punctuation with spaces ("don't" -> "don t")
AllData$word <- gsub("\\d", " ", AllData$word)          # removes digits
AllData$word <- gsub("\\bt\\b",  "not",  AllData$word)  # restores contraction fragments: "can t" -> "can not"
AllData$word <- gsub("\\bll\\b", "will", AllData$word)  # "we ll" -> "we will"
AllData$word <- gsub("\\bve\\b", "have", AllData$word)  # "we ve" -> "we have"
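Side note: textclean is loaded above but never used. Its replace_contraction() expands contractions while the apostrophes are still intact, which would avoid the fragment-patching above entirely; a sketch, to be run before the punctuation strip:

AllData$word <- replace_contraction(AllData$word) # "don't" -> "do not", before apostrophes are removed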
rm( newsData_c, twitterData_c, newsData, twitterData, blogData, blogDatasample, newsDatasample, twitterDatasample, blogData_c)
gc()
AllData1gram <- AllData %>%
  unnest_tokens(word, word) %>%
  count(word, sort = TRUE) %>%
  mutate(p = n / sum(n)) # unigram relative frequencies
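Given the memory pressure later on, pruning words seen only once at this point shrinks every downstream join, usually at little cost to the model; a sketch:

AllData1gram <- AllData1gram %>%
  filter(n > 1) %>%      # drop hapax legomena (words seen only once)
  mutate(p = n / sum(n)) # re-normalize after pruning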
AllData_trigram <- AllData %>%
  unnest_tokens(trigram, word, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE) %>%
  mutate(p = n / sum(n)) %>% # trigram relative frequencies
  filter(word1 != word2)     # drops trigrams that open with a repeated word
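As a sanity check that the trigram table supports next-word lookup, a quick query sketch (the prefix words are arbitrary examples of my own):

AllData_trigram %>%
  filter(word1 == "happy", word2 == "birthday") %>% # example two-word prefix
  slice_max(n, n = 3)                               # three most frequent continuations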
AllDatahexagram <- AllData %>%
  unnest_tokens(ngram, word, token = "ngrams", n = 6) %>%
  mutate(ngramID = row_number()) %>%          # one ID per six-word window
  unite(skipgramID, source_type, ngramID) %>%
  unnest_tokens(word, ngram)                  # back to one word per row, tagged by window
rm(AllData)
gc()
skipgram_probs <- AllDatahexagram %>%
  pairwise_count(word, skipgramID, diag = TRUE, sort = TRUE) %>%
  mutate(p = n / sum(n)) # co-occurrence probabilities within the six-word windows
normalized_prob_tri <- skipgram_probs %>%
  rename(word1 = item1, word2 = item2) %>%
  left_join(AllData1gram %>%
              select(word1 = word, p1 = p),
            by = "word1") %>%
  left_join(AllData1gram %>%
              select(word2 = word, p2 = p),
            by = "word2") %>%
  mutate(p_together = p / p1 / p2) %>% # ratio of joint to independent probability (the PMI argument)
  filter(word1 != word2)
AllData_trigram <- AllData_trigram %>%
  rename(p3 = p) %>%
  select(word2, word3, p3)
# memory.limit() is Windows-only (and defunct in R >= 4.2); the value is in MB,
# so this line only postpones the allocation problem rather than solving it
memory.limit(9999999999)
# NOTE: joining on word2 alone is many-to-many, so every pair sharing a middle
# word is multiplied out; this join is the most likely source of the failures
normalized_prob_tri <- normalized_prob_tri %>%
  left_join(AllData_trigram, by = "word2") %>%
  mutate(p_together = p / p1 / p2 / p3)
gc()
pmi_matrix <- normalized_prob_tri %>%
  mutate(pmi = log10(p_together)) %>% # pointwise mutual information (base-10 log here; base 2 or e are also common)
  cast_sparse(word1, word2, pmi)
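If the goal of the PMI matrix is word vectors, the usual next step in this widyr/cast_sparse workflow is a truncated SVD. A sketch assuming the irlba package; note the unmatched joins above leave NAs that need handling first:

library(irlba)
pmi_matrix@x[is.na(pmi_matrix@x)] <- 0             # zero out NAs left by unmatched joins
pmi_svd <- irlba(pmi_matrix, nv = 50, maxit = 500) # 50-dimensional truncated SVD
word_vectors <- pmi_svd$u                          # one embedding row per word1
rownames(word_vectors) <- rownames(pmi_matrix)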
This is the RStudio setup I am using, shown in the upload. Am I limited by processing power for this, or can the code be restructured to fit in memory?
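On the processing-power question: the pipeline above holds every intermediate object in RAM at once, so raising memory.limit() mostly delays the failure. Counting in chunks keeps peak memory roughly constant. A minimal sketch for the unigram counts (the 50,000-line chunk size is arbitrary); the same pattern extends to the n-gram tables:

con <- file("final/en_US/en_US.blogs.txt", open = "rb")
unigram_counts <- tibble(word = character(), n = integer())
repeat {
  chunk <- readLines(con, n = 50000, encoding = "UTF-8", skipNul = TRUE)
  if (length(chunk) == 0) break # stop at end of file
  unigram_counts <- tibble(word = chunk) %>%
    unnest_tokens(word, word) %>%
    count(word) %>%
    bind_rows(unigram_counts) %>%
    count(word, wt = n) # merge the partial counts into running totals
}
close(con)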