hello im a newbie and im trying to get to work with Rstudio for my thesis and i got some error while doing stemming, stopword, delete space, and save to csv on pre-processing. here is the code
setwd("F:/SKRIPSI/R pre-processing/data_preprocessing_test1")
rm(list = ls())
#install.packages("remotes")
#remotes::install_github("nurandi/katadasaR")
library(tm)
library(NLP)
library(stringr)
library(caret)
library(dplyr)
library(tau)
library(parallel)
library(readxl)
library(katadasaR)
library(tokenizers)
#memcaca dokumen excel
dok <- read_excel(path="data_preprocessing_test1.xlsx" ,
sheet="Sheet1",
col_names= TRUE)
View(dok)
#merubah file excel kedalam corpus dan select text, tulisan sesudah $ disesuaikan dengan nama tabel di data yang digunakan
corpusdok <- Corpus(VectorSource(dok$text))
inspect(corpusdok[1:10])
#mengubah semua huruf kapital menjadi huruf kecil pada dokumen sebelumnya yaitu corpusdok dengan memanfaatkan bantuan package tm dan tm_map.
dok_casefolding <- tm_map(corpusdok, content_transformer(tolower))
inspect(dok_casefolding[1:10])
#menghapus url pada dokumen sebelumnya yaitu dok_casefolding
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
dok_URL <- tm_map(dok_casefolding, content_transformer(removeURL))
inspect(dok_URL[1:10])
#menghapus mention pada dokumen sebelumnya yaitu dok_URL
remove.mention <- function(x) gsub("@\\S+", "", x)
dok_mention <- tm_map(dok_URL, remove.mention)
inspect(dok_mention[1:10])
#menghapus hastag
remove.hashtag <- function(x) gsub("#\\S+", "", x)
dok_hashtag <- tm_map(dok_mention, remove.hashtag)
inspect(dok_hashtag[1:10])
#menghapus tanda baca
dok_punctuation<-tm_map(dok_hashtag,content_transformer(removePunctuation))
inspect(dok_punctuation[1:10])
#menghapus angka
dok_nonumber<-tm_map(dok_punctuation, content_transformer(removeNumbers))
inspect(dok_nonumber[1:10])
#normalisasi perbaikan kata ejaan atau slang
slang <- read.csv("slangword_list.csv", header=T)
old_slang <- as.character(slang$old)
new_slang <- as.character(slang$new)
slangword<-function(x)Reduce(function(x,r)gsub(slang$old[r],slang$new[r],x,fixed=T),seq_len(nrow(slang)),x)
dok_slangword <- tm_map(dok_nonumber,slangword)
inspect(dok_slangword[1:10])
#stemming penguraian kata dasar
stem_text<-function(text,mc.cores=1)
{
stem_string<-function(str)
{
str<-tokenize(x=str)
str<-sapply(str,katadasaR)
str<-paste(str,collapse = "")
return(str)
}
x<-mclapply(X=text,FUN=stem_string,mc.cores=mc.cores)
return(unlist(x))
}
dok_stemming<-tm_map(dok_slangword,stem_text)
inspect(dok_stemming[1:10])
#filtering atau stopword penghapusan kata tidak berpengaruh
cStopwordID<-readLines("stopwords.csv")
dok_stopword <- tm_map(dok_stemming, removeWords, cStopwordID)
inspect(dok_stopword[1:10])
#menghapus spasi berlebihan
dok_whitespace <- tm_map(dok_stopword,stripWhitespace)
inspect(dok_whitespace[1:10])
#menyimpan file ke csv
databersih <- data.frame(text=unlist(sapply(dok_whitespace,`[`)), tringsAsFactors=F)
write.csv(databersih,file="datasesudah.csv")
and the following error
#stemming penguraian kata dasar
> stem_text<-function(text,mc.cores=1)
+ {
+ stem_string<-function(str)
+ {
+ str<-tokenize(x=str)
+ str<-sapply(str,katadasaR)
+ str<-paste(str,collapse = "")
+ return(str)
+ }
+ x<-mclapply(X=text,FUN=stem_string,mc.cores=mc.cores)
+ return(unlist(x))
+ }
> dok_stemming<-tm_map(dok_slangword,stem_text)
Error in tokenize(x = str) : unused argument (x = str) > inspect(dok_stemming[1:10])
Error in inspect(dok_stemming[1:10]) : object 'dok_stemming' not found
>
> #filtering atau stopword penghapusan kata tidak berpengaruh
> cStopwordID<-readLines("stopwords.csv")
> dok_stopword <- tm_map(dok_stemming, removeWords, cStopwordID)
Error in tm_map(dok_stemming, removeWords, cStopwordID) :
object 'dok_stemming' not found
> inspect(dok_stopword[1:10])
Error in inspect(dok_stopword[1:10]) : object 'dok_stopword' not found
>
> #menghapus spasi berlebihan
> dok_whitespace <- tm_map(dok_stopword,stripWhitespace)
Error in tm_map(dok_stopword, stripWhitespace) :
object 'dok_stopword' not found
> inspect(dok_whitespace[1:10])
Error in inspect(dok_whitespace[1:10]) :
object 'dok_whitespace' not found
>
> #menyimpan file ke csv
> databersih <- data.frame(text=unlist(sapply(dok_whitespace,`[`)), tringsAsFactors=F)
Error in lapply(X = X, FUN = FUN, ...) :
object 'dok_whitespace' not found
> write.csv(databersih,file="datasesudah.csv")
Error in is.data.frame(x) : object 'databersih' not found
i hope everybody can help, thank you in advance!