simple example for text categorization

shredder · October 10, 2019, 2:11pm

Hi all.
I cant classify text using simple data and example.
Here data:

CATEGORY	WORDS
животное	животное лиса волк заяц
растение	растение выращивание пшеница агрохолдинг
люди	люди ресторан еда кинотеатр айфон портфель бизнес
археология	археология раскопки черепок ваза кости динозавр
математика	математика плюс делить произведение частное формула
отдых	отдых море ресторан еда пляж арктика эверест лыжи

here code for training:

library(tm)
library(qdap)

Sys.setlocale("LC_ALL", 'ru_RU.CP1251')

mydata = read.delim("data.txt",header=TRUE,"\t",encoding="CP1251")

mydata$WORDS[1]

# Make a vector source
mydata_vector <- VectorSource(mydata$WORDS[1:3])

# Make a volatile corpus
mydata_corpus <- VCorpus(mydata_vector)

# Print out mydata_corpus
mydata_corpus

# Print data on the 15th row in mydata_corpus
mydata_corpus[[1]]

# Print the content of the 15th row in mydata_corpus
mydata_corpus[[1]]$content

# Alter the function code to match the instructions
clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, content_transformer(bracketX))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, content_transformer(function(x) gsub(x, pattern = "ё", replacement = "е")))
  #через стоп-слова почему-то не удаляется этот символ
  corpus <- tm_map(corpus, content_transformer(function(x) gsub(x, pattern = "№", replacement = " ")))
  corpus <- tm_map(corpus, content_transformer(function(x) gsub(x, pattern = '(сч.|счет|дог.|договор|документ|счф)(а|у)? [^о]+от [^ ]+ ', replacement = "")))
  corpus <- tm_map(corpus, content_transformer(function(x) gsub(x, pattern = "сумм(е|а)", replacement = "")))
  corpus <- tm_map(corpus, content_transformer(function(x) gsub(x, pattern = "[./-]", replacement = " ")))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, c(stopwords("ru"), "№", "т ч", "том числе", "включая", "ндс", "rub","руб","коп","по счетам","по счету","счет", "счф", "дог", "договору", "оплата","этапу","года"))
  #все слова длиной два и менее
  corpus <- tm_map(corpus, content_transformer(function(x) gsub(x, pattern = "\\b\\S{1,2}\\b", replacement = "")))
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
}

clean_corp <- clean_corpus(mydata_corpus)

# Print out a cleaned up text
clean_corp[[1]][1]

# Print out the same text in original form
mydata_corpus[[1]][1]

# The RWeka package is already loaded
library(RWeka)
# Define bigram tokenizer
tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))

# Create tdm and matrix with tf-idf weighting
#  control = list(weighting = weightTfIdf, tokenize = tokenizer)
tf_idf_dtm <- DocumentTermMatrix(
  clean_corp, 
  #control = list(tokenize = tokenizer)
  list(removePunctuation = TRUE, stopwords = TRUE, stemming = TRUE, removeNumbers = TRUE)
  )
  
tf_idf_dtm_m <- as.matrix(tf_idf_dtm)

#tf_idf_dtm_m[order(tf_idf_dtm_m$1),]

#tf_idf_dtm_m.sorted=tf_idf_dtm_m[order(tf_idf_dtm_m[,2], decreasing = TRUE),]

# Print the dimensions of the matrix
dim(tf_idf_dtm_m)

tf_idf_dtm_m[1:3, 1:15]

#tf_idf_dtm_m.sorted[1:20, 1:6]

CATEGORY.factor = as.factor(mydata$CATEGORY)

tf_idf_dtm_m2 <- cbind(tf_idf_dtm_m, c(0, 1, 2) )
colnames(tf_idf_dtm_m2)[ncol(tf_idf_dtm_m2)] <- 'CATEGORY'
tf_idf_dtm_m3 <- as.data.frame(tf_idf_dtm_m2)
tf_idf_dtm_m3$CATEGORY <- as.factor(tf_idf_dtm_m3$CATEGORY)

library(caret)

# Train.
# list of all available methods: names(getModelInfo())
# http://topepo.github.io/caret/train-models-by-tag.html
# Bayesian Generalized Linear Model
fit <- train(CATEGORY ~ ., data = tf_idf_dtm_m3, method = 'bayesglm')

this code raise warnings:

Warning messages:
1: predictions failed for Resample03: parameter=none Error in family(object)$linkinv(pred) :
Argument eta must be a nonempty numeric vector

2: predictions failed for Resample10: parameter=none Error in family(object)$linkinv(pred) :
Argument eta must be a nonempty numeric vector

3: predictions failed for Resample15: parameter=none Error in family(object)$linkinv(pred) :
Argument eta must be a nonempty numeric vector

4: In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
There were missing values in resampled performance measures.

and cant classify words correctly:

# Test data.
test_data <- c('заяц')
corpus <- VCorpus(VectorSource(test_data))
test_dtm <- DocumentTermMatrix(corpus, control = list(dictionary = Terms(tf_idf_dtm), removePunctuation = TRUE, stopwords = TRUE, stemming = TRUE, removeNumbers = TRUE))
test_matrix <- as.matrix(test_dtm)

# Check accuracy on test.
result = predict(fit, newdata = test_matrix)
#result
#mydata$CATEGORY[result]

paste("type of '",test_data,"' word is: '",mydata$CATEGORY[result],"'", sep="")

Questions:

Why warnings appears, whats wrong with code?
Why prediction returns incorrect result (because "заяц" word is included only in one category = "животные", but code returns "растения" category )?

system · October 31, 2019, 2:11pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.