I need help with a text classification problem. I have a section of R code which works fine until I get to the caretList part, where I am trying to create an ensemble of classifiers for the text feature set. The code works up until the very last line where it fails. Here is the error that I get:
"Error in model.frame.default(form, data) : object is not a matrix"
Can someone please tell me what I am not doing right? I am still somewhat new to R. Thanks.
Here is my full code:
reviews = read.csv('movies.csv', stringsAsFactors = FALSE)
library(tidytext)
library(tm)
library(pander)
myCorpus <- Corpus(VectorSource(reviews$text))
myCorpus <- tm_map(myCorpus, tolower)
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myStopwords <- c(stopwords('english'), "the", "ages", "is", "and", "have","off","why")
myCorpus <- tm_map(myCorpus, content_transformer(gsub), pattern = "ages", replacement = " ")
myCorpus <- tm_map(myCorpus, content_transformer(gsub), pattern = "the", replacement = " ")
myCorpus <- tm_map(myCorpus, content_transformer(gsub), pattern = "dont", replacement = " ")
myCorpus <- tm_map(myCorpus, content_transformer(gsub), pattern = "and", replacement = " ")
myCorpus <- tm_map(myCorpus, content_transformer(gsub), pattern = "ive", replacement = " ")
myCorpus <- tm_map(myCorpus, content_transformer(gsub), pattern = "ano", replacement = " ")
idx <- which(myStopwords == "r")
myStopwords <- myStopwords[-idx]
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
dictCorpus <- myCorpus
myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)
dtm <- DocumentTermMatrix(myCorpus)
features <- findFreqTerms(dtm, 10)
dtm2 <- DocumentTermMatrix(myCorpus, list(global = c(2, Inf),
dictionary = features))
removeSparseTerms(dtm2, .98)
inspect(dtm2)
N <- 10
findFreqTerms(dtm2, N)
library(caret)
train_idx <- createDataPartition(reviews[[1]], p=0.70, list=FALSE)
train1 <- reviews[train_idx,]
test1 <- reviews[-train_idx,]
train2 <- myCorpus[train_idx]
test2 <- myCorpus[-train_idx]
dict2 <- findFreqTerms(dtm2, lowfreq=10)
sms_train <- DocumentTermMatrix(train2, list(dictionary=dict2))
sms_test <- DocumentTermMatrix(test2, list(dictionary=dict2))
convert_counts <- function(x) {
x <- ifelse(x > 0, 1, 0)
# x <- factor(x, levels = c(0, 1), labels = c("Absent", "Present"))
}
library(magrittr)
sms_train <- sms_train %>% apply(MARGIN=2, FUN=convert_counts)
sms_test <- sms_test %>% apply(MARGIN=2, FUN=convert_counts)
sms_train <- as.data.frame(sms_train)
sms_test <- as.data.frame(sms_test)
str(sms_train)
sms_train1 <- cbind(cat=factor(train1$class), sms_train)
sms_test1 <- cbind(cat=factor(test1$class), sms_test)
sms_train1<-as.data.frame(sms_train1)
sms_test1<-as.data.frame(sms_test1)
set.seed(12345)
library("data.table")
#install.packages("curl")
library("curl")
#install.packages("caretEnsemble")
library("caretEnsemble")
#install.packages("e1071")
library("e1071")
#install.packages("rpart")
library("rpart")
#install.packages("ISLR")
library("ISLR")
#install.packages("randomForest")
library("randomForest")
#install.packages("gbm")
library("gbm")
#install.packages("kernlab")
library("kernlab")
#install.packages("neuralnet")
library("neuralnet")
#install.packages("MASS")
library("MASS")
models=caretList(cat ~., data=sms_train1,
trControl=trainControl(method="cv",number=10,savePredictions=TRUE, classProbs=TRUE),
methodList=c('knn', 'lda', 'rpart'))