Text classification problem with caretensemble

jdude48 · August 8, 2018, 10:55pm

I need help with a text classification problem. I have a section of R code which works fine until I get to the caretList part, where I am trying to create an ensemble of classifiers for the text feature set. The code works up until the very last line where it fails. Here is the error that I get:
"Error in model.frame.default(form, data) : object is not a matrix"

Can someone please tell me what I am not doing right? I am still somewhat new to R. Thanks.
Here is my full code:

reviews = read.csv('movies.csv', stringsAsFactors = FALSE)

library(tidytext)
library(tm)
library(pander)

myCorpus <- Corpus(VectorSource(reviews$text))

myCorpus <- tm_map(myCorpus, tolower)
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myStopwords <- c(stopwords('english'), "the", "ages", "is", "and", "have","off","why")
myCorpus <- tm_map(myCorpus, content_transformer(gsub), pattern = "ages", replacement = " ")
myCorpus <- tm_map(myCorpus, content_transformer(gsub), pattern = "the", replacement = " ")
myCorpus <- tm_map(myCorpus, content_transformer(gsub), pattern = "dont", replacement = " ")
myCorpus <- tm_map(myCorpus, content_transformer(gsub), pattern = "and", replacement = " ")
myCorpus <- tm_map(myCorpus, content_transformer(gsub), pattern = "ive", replacement = " ")
myCorpus <- tm_map(myCorpus, content_transformer(gsub), pattern = "ano", replacement = " ")

idx <- which(myStopwords == "r")
myStopwords <- myStopwords[-idx]
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
dictCorpus <- myCorpus
myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)

dtm <- DocumentTermMatrix(myCorpus)
features <- findFreqTerms(dtm, 10)
dtm2 <- DocumentTermMatrix(myCorpus, list(global = c(2, Inf),
                                          dictionary = features))

removeSparseTerms(dtm2, .98)

inspect(dtm2)
N <- 10
findFreqTerms(dtm2, N)

library(caret)

train_idx <- createDataPartition(reviews[[1]], p=0.70, list=FALSE)

train1 <- reviews[train_idx,]
test1 <- reviews[-train_idx,]

train2 <- myCorpus[train_idx]
test2 <- myCorpus[-train_idx]


dict2 <- findFreqTerms(dtm2, lowfreq=10)

sms_train <- DocumentTermMatrix(train2, list(dictionary=dict2))
sms_test <- DocumentTermMatrix(test2, list(dictionary=dict2))

convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  # x <- factor(x, levels = c(0, 1), labels = c("Absent", "Present"))
}

library(magrittr)
sms_train <- sms_train %>% apply(MARGIN=2, FUN=convert_counts)
sms_test <- sms_test %>% apply(MARGIN=2, FUN=convert_counts)

sms_train <- as.data.frame(sms_train)
sms_test <- as.data.frame(sms_test)

str(sms_train)

sms_train1 <- cbind(cat=factor(train1$class), sms_train)
sms_test1 <- cbind(cat=factor(test1$class), sms_test)

sms_train1<-as.data.frame(sms_train1)
sms_test1<-as.data.frame(sms_test1)



set.seed(12345)

library("data.table")
#install.packages("curl")
library("curl")
#install.packages("caretEnsemble")
library("caretEnsemble")
#install.packages("e1071")
library("e1071")
#install.packages("rpart")
library("rpart")
#install.packages("ISLR")
library("ISLR")
#install.packages("randomForest")
library("randomForest")
#install.packages("gbm")
library("gbm")
#install.packages("kernlab")
library("kernlab")
#install.packages("neuralnet")
library("neuralnet")
#install.packages("MASS")
library("MASS")

models=caretList(cat ~., data=sms_train1, 
                 trControl=trainControl(method="cv",number=10,savePredictions=TRUE, classProbs=TRUE),
                 methodList=c('knn', 'lda', 'rpart'))

Max · August 9, 2018, 3:05pm

Can you send the results of sessioninfo::session_info() and show the output when you run these commands?

We generally need a small reproducible example to help.

jdude48 · August 9, 2018, 3:46pm

Here is the output that I get when I run the session_info() command

> session_info()

Session info ---------------------------------------------------------------------------------------------

setting  value                      

version  R version 3.5.1 (2018-07-02)

system   x86_64, mingw32            

ui       RStudio (1.1.383)          

language (EN)                        

collate  English_United States.1252  

tz       America/New_York            

date     2018-08-09                  

Packages -------------------------------------------------------------------------------------------------

package       * version    date       source        

abind           1.4-5      2016-07-21 CRAN (R 3.5.0)

assertthat      0.2.0      2017-04-11 CRAN (R 3.5.1)

backports       1.1.2      2017-12-13 CRAN (R 3.5.0)

base          * 3.5.1      2018-07-02 local        

bindr           0.1.1      2018-03-13 CRAN (R 3.5.1)

bindrcpp        0.2.2      2018-03-29 CRAN (R 3.5.1)

broom           0.5.0      2018-07-17 CRAN (R 3.5.1)

caret         * 6.0-80     2018-05-26 CRAN (R 3.5.1)

caretEnsemble * 2.0.0      2016-02-07 CRAN (R 3.5.1)

class           7.3-14     2015-08-30 CRAN (R 3.5.1)

codetools       0.2-15     2016-10-05 CRAN (R 3.5.1)

colorspace      1.3-2      2016-12-14 CRAN (R 3.5.1)

compiler        3.5.1      2018-07-02 local        

crayon          1.3.4      2017-09-16 CRAN (R 3.5.1)

curl          * 3.2        2018-03-28 CRAN (R 3.5.1)

CVST            0.2-2      2018-05-26 CRAN (R 3.5.1)

data.table    * 1.11.4     2018-05-27 CRAN (R 3.5.1)

datasets      * 3.5.1      2018-07-02 local        

ddalpha         1.3.4      2018-06-23 CRAN (R 3.5.1)

DEoptimR        1.0-8      2016-11-19 CRAN (R 3.5.0)

devtools      * 1.13.6     2018-06-27 CRAN (R 3.5.1)

digest          0.6.15     2018-01-28 CRAN (R 3.5.1)

dimRed          0.1.0      2017-05-04 CRAN (R 3.5.1)

dplyr           0.7.6      2018-06-29 CRAN (R 3.5.1)

DRR             0.0.3      2018-01-06 CRAN (R 3.5.1)

e1071         * 1.7-0      2018-07-28 CRAN (R 3.5.1)

foreach         1.4.4      2017-12-12 CRAN (R 3.5.1)

gbm           * 2.1.3      2017-03-21 CRAN (R 3.5.1)

geometry        0.3-6      2015-09-09 CRAN (R 3.5.1)

ggplot2       * 3.0.0      2018-07-03 CRAN (R 3.5.1)

glue            1.3.0      2018-07-17 CRAN (R 3.5.1)

gower           0.1.2      2017-02-23 CRAN (R 3.5.1)

graphics      * 3.5.1      2018-07-02 local        

grDevices     * 3.5.1      2018-07-02 local        

grid            3.5.1      2018-07-02 local        

gridExtra       2.3        2017-09-09 CRAN (R 3.5.1)

gtable          0.2.0      2016-02-26 CRAN (R 3.5.1)

ipred           0.9-6      2017-03-01 CRAN (R 3.5.1)

ISLR          * 1.2        2017-10-20 CRAN (R 3.5.1)

iterators       1.0.10     2018-07-13 CRAN (R 3.5.1)

janeaustenr     0.1.5      2017-06-10 CRAN (R 3.5.1)

kernlab       * 0.9-26     2018-04-30 CRAN (R 3.5.0)

lattice       * 0.20-35    2017-03-25 CRAN (R 3.5.1)

lava            1.6.2      2018-07-02 CRAN (R 3.5.1)

lazyeval        0.2.1      2017-10-29 CRAN (R 3.5.1)

lubridate       1.7.4      2018-04-11 CRAN (R 3.5.1)

magic           1.5-8      2018-01-26 CRAN (R 3.5.0)

magrittr      * 1.5        2014-11-22 CRAN (R 3.5.1)

MASS          * 7.3-50     2018-04-30 CRAN (R 3.5.1)

Matrix          1.2-14     2018-04-13 CRAN (R 3.5.1)

memoise         1.1.0      2017-04-21 CRAN (R 3.5.1)

methods       * 3.5.1      2018-07-02 local        

ModelMetrics    1.1.0      2016-08-26 CRAN (R 3.5.1)

munsell         0.5.0      2018-06-12 CRAN (R 3.5.1)

neuralnet     * 1.33       2016-08-16 CRAN (R 3.5.1)

nlme            3.1-137    2018-04-07 CRAN (R 3.5.1)

NLP           * 0.1-11     2017-08-15 CRAN (R 3.5.0)

nnet            7.3-12     2016-02-02 CRAN (R 3.5.1)

pander        * 0.6.2      2018-07-08 CRAN (R 3.5.1)

parallel      * 3.5.1      2018-07-02 local        

pbapply         1.3-4      2018-01-10 CRAN (R 3.5.0)

pillar          1.3.0      2018-07-14 CRAN (R 3.5.1)

pkgconfig       2.0.1      2017-03-21 CRAN (R 3.5.1)

pls             2.6-0      2016-12-18 CRAN (R 3.5.1)

plyr            1.8.4      2016-06-08 CRAN (R 3.5.1)

prodlim         2018.04.18 2018-04-18 CRAN (R 3.5.1)

purrr           0.2.5      2018-05-29 CRAN (R 3.5.1)

R6              2.2.2      2017-06-17 CRAN (R 3.5.1)

randomForest  * 4.6-14     2018-03-25 CRAN (R 3.5.1)

Rcpp            0.12.18    2018-07-23 CRAN (R 3.5.1)

RcppRoll        0.3.0      2018-06-05 CRAN (R 3.5.1)

recipes         0.1.3      2018-06-16 CRAN (R 3.5.1)

reshape2        1.4.3      2017-12-11 CRAN (R 3.5.1)

rlang           0.2.1      2018-05-30 CRAN (R 3.5.1)

robustbase      0.93-2     2018-07-27 CRAN (R 3.5.1)

rpart         * 4.1-13     2018-02-23 CRAN (R 3.5.1)

scales          0.5.0      2017-08-24 CRAN (R 3.5.1)

sfsmisc         1.1-2      2018-03-05 CRAN (R 3.5.1)

slam            0.1-43     2018-04-23 CRAN (R 3.5.0)

SnowballC       0.5.1      2014-08-09 CRAN (R 3.5.0)

splines       * 3.5.1      2018-07-02 local        

stats         * 3.5.1      2018-07-02 local        

stats4          3.5.1      2018-07-02 local        

stringi         1.1.7      2018-03-12 CRAN (R 3.5.0)

stringr         1.3.1      2018-05-10 CRAN (R 3.5.1)

survival      * 2.42-3     2018-04-16 CRAN (R 3.5.1)

tibble          1.4.2      2018-01-22 CRAN (R 3.5.1)

tidyr           0.8.1      2018-05-18 CRAN (R 3.5.1)

tidyselect      0.2.4      2018-02-26 CRAN (R 3.5.1)

tidytext      * 0.1.9      2018-05-29 CRAN (R 3.5.1)

timeDate        3043.102   2018-02-21 CRAN (R 3.5.1)

tm            * 0.7-5      2018-07-29 CRAN (R 3.5.1)

tokenizers      0.2.1      2018-03-29 CRAN (R 3.5.1)

tools           3.5.1      2018-07-02 local        

utils         * 3.5.1      2018-07-02 local        

withr           2.1.2      2018-03-15 CRAN (R 3.5.1)

 xml2            1.2.0      2018-01-24 CRAN (R 3.5.1)