Simple example of text categorization

Hi all.
I can't classify text using a simple data set and example.
Here is the data:

животное	животное лиса волк заяц
растение	растение выращивание пшеница агрохолдинг
люди	люди ресторан еда кинотеатр айфон портфель бизнес
археология	археология раскопки черепок ваза кости динозавр
математика	математика плюс делить произведение частное формула
отдых	отдых море ресторан еда пляж арктика эверест лыжи

Here is the code for training:


# Switch every locale category to Russian / CP1251 before reading the data.
Sys.setlocale("LC_ALL", "ru_RU.CP1251")

# Read the tab-separated training data; the CATEGORY and WORDS columns are
# used further down.
# `fileEncoding` declares the on-disk encoding so the input is re-encoded
# properly; the `encoding` argument only marks strings as Latin-1/UTF-8 and
# does not convert anything. Text columns are kept as character vectors.
# NOTE(review): assumes data.txt starts with a header row naming the
# CATEGORY and WORDS columns - confirm.
mydata <- read.delim(
  "data.txt",
  header = TRUE,
  sep = "\t",
  fileEncoding = "CP1251",
  stringsAsFactors = FALSE
)


# Make a vector source from the first three documents only
mydata_vector <- VectorSource(mydata$WORDS[1:3])

# Make a volatile corpus
mydata_corpus <- VCorpus(mydata_vector)

# Print out mydata_corpus

# Print data on the 15th row in mydata_corpus

# Print the content of the 15th row in mydata_corpus

# Alter the function code to match the instructions
# Clean a tm corpus: strip bracketed text, lower-case, normalise a few
# characters, drop accounting boilerplate, punctuation, numbers, stop words
# and very short tokens, then collapse whitespace.
#
# corpus: a tm corpus (e.g. the result of VCorpus()).
# Returns the transformed corpus.
clean_corpus <- function(corpus) {
  # Wrap a single gsub() substitution as a tm content transformer.
  substitute_text <- function(pattern, replacement) {
    force(pattern)
    force(replacement)
    content_transformer(function(x) gsub(pattern, replacement, x))
  }

  corpus <- tm_map(corpus, content_transformer(bracketX))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, substitute_text("ё", "е"))
  # For some reason this symbol is not removed via the stop-word list,
  # so it is replaced explicitly here.
  corpus <- tm_map(corpus, substitute_text("№", " "))
  # Drop references of the form "<document word> ... от <token> ".
  corpus <- tm_map(
    corpus,
    substitute_text(
      '(сч.|счет|дог.|договор|документ|счф)(а|у)? [^о]+от [^ ]+ ',
      ""
    )
  )
  corpus <- tm_map(corpus, substitute_text("сумм(е|а)", ""))
  # Turn dots, slashes and hyphens into spaces before punctuation removal so
  # the words they separate are not glued together.
  corpus <- tm_map(corpus, substitute_text("[./-]", " "))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(
    corpus,
    removeWords,
    c(stopwords("ru"), "№", "т ч", "том числе", "включая", "ндс", "rub","руб","коп","по счетам","по счету","счет", "счф", "дог", "договору", "оплата","этапу","года")
  )
  # Remove all words of length two or less.
  corpus <- tm_map(corpus, substitute_text("\\b\\S{1,2}\\b", ""))
  corpus <- tm_map(corpus, stripWhitespace)

  # Return the cleaned corpus to the caller.
  corpus
}

# Run the cleaning pipeline over the raw corpus.
clean_corp <- clean_corpus(mydata_corpus)

# Print out a cleaned up text

# Print out the same text in original form

# The RWeka package is already loaded
# Define a unigram tokenizer (min = max = 1). It is not used at the moment
# because the `tokenize` control option below is commented out.
tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))

# Create the document-term matrix from the cleaned corpus and convert it to a
# plain matrix. The tf-idf weighting option is commented out, so despite the
# variable names no tf-idf weighting is requested here.
#  control = list(weighting = weightTfIdf, tokenize = tokenizer)
# NOTE(review): `stopwords = TRUE` and `stemming = TRUE` presumably follow
# the documents' language metadata rather than Russian - confirm.
tf_idf_dtm <- DocumentTermMatrix(
  clean_corp,
  # control = list(tokenize = tokenizer)
  control = list(
    removePunctuation = TRUE,
    stopwords = TRUE,
    stemming = TRUE,
    removeNumbers = TRUE
  )
)
tf_idf_dtm_m <- as.matrix(tf_idf_dtm)


#tf_idf_dtm_m.sorted=tf_idf_dtm_m[order(tf_idf_dtm_m[,2], decreasing = TRUE),]

# Print the dimensions of the matrix

# Preview up to the first 3 documents and 15 terms; the bounds are clamped so
# a smaller matrix does not raise "subscript out of bounds".
preview_rows <- seq_len(min(3L, nrow(tf_idf_dtm_m)))
preview_cols <- seq_len(min(15L, ncol(tf_idf_dtm_m)))
tf_idf_dtm_m[preview_rows, preview_cols, drop = FALSE]

#tf_idf_dtm_m.sorted[1:20, 1:6]

CATEGORY.factor <- as.factor(mydata$CATEGORY)

# Append a numeric class label per document: row i gets label i - 1, i.e.
# 0, 1, 2 for three documents, derived from the row count instead of being
# hard-coded.
tf_idf_dtm_m2 <- cbind(tf_idf_dtm_m, seq_len(nrow(tf_idf_dtm_m)) - 1)
colnames(tf_idf_dtm_m2)[ncol(tf_idf_dtm_m2)] <- "CATEGORY"

# The formula interface used for training needs a data frame whose outcome
# column is a factor.
tf_idf_dtm_m3 <- as.data.frame(tf_idf_dtm_m2)
tf_idf_dtm_m3$CATEGORY <- as.factor(tf_idf_dtm_m3$CATEGORY)


# Train.
# list of all available methods: names(getModelInfo())
# Bayesian Generalized Linear Model
# Resampling is switched off: with a single row per class, a bootstrap sample
# can contain every row, which leaves an empty hold-out set and makes the
# hold-out prediction fail. The model fitted on the full data is unaffected
# because this method has no tuning parameters.
# NOTE(review): bayesglm appears to be a two-class model in caret - confirm
# it is appropriate for more than two categories.
fit <- train(
  CATEGORY ~ .,
  data = tf_idf_dtm_m3,
  method = "bayesglm",
  trControl = trainControl(method = "none")
)

This code raises warnings:

Warning messages:
1: predictions failed for Resample03: parameter=none Error in family(object)$linkinv(pred) :
Argument eta must be a nonempty numeric vector

2: predictions failed for Resample10: parameter=none Error in family(object)$linkinv(pred) :
Argument eta must be a nonempty numeric vector

3: predictions failed for Resample15: parameter=none Error in family(object)$linkinv(pred) :
Argument eta must be a nonempty numeric vector

4: In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
There were missing values in resampled performance measures.

and it can't classify words correctly:

# Test data: a single one-word document.
test_data <- c("заяц")
corpus <- VCorpus(VectorSource(test_data))

# Build the test matrix over the training vocabulary so its columns line up
# with the terms the model was fitted on.
dtm_control <- list(
  dictionary = Terms(tf_idf_dtm),
  removePunctuation = TRUE,
  stopwords = TRUE,
  stemming = TRUE,
  removeNumbers = TRUE
)
test_dtm <- DocumentTermMatrix(corpus, control = dtm_control)
test_matrix <- as.matrix(test_dtm)

# Predict the class of the test document.
result <- predict(fit, newdata = test_matrix)

# Look up the category by indexing with the predicted factor, then report it.
predicted_category <- mydata$CATEGORY[result]
paste0("type of '", test_data, "' word is: '", predicted_category, "'")


  1. Why do the warnings appear, and what is wrong with the code?
  2. Why does the prediction return an incorrect result? The word "заяц" appears in only one category ("животные"), but the code returns the "растения" category.

