I am trying my hand on categorising sentences into Product category using a RF model. The dataset (US consumer complaints; smaller datasets for reprex here) is in my instance provided separately for train and test set, precluding the use of rsample::initial_split().
Clumsily or not, I managed to successfully tokenize and manipulate the training data into a document-term matrix for fitting in tidymodels. I repeated the steps on my test set before using predict(). But because the content of the complaint sentences are different, the tokenized words and dtm columns are different, resulting in this error:
Error: The following required columns are missing: 'bought', 'breach', 'cfpb', 'counti', 'default', ...
How can I go about it?
library(tidyverse)
library(tidymodels)
library(tidytext)
library(tm)
# Import data
trainsmall_tbl <- readr::read_csv("complaints_train.csv")
test_org <- readr::read_csv("complaints_test.csv")
# Take only Product and Consumer.complaint.narrative
trainsmall_tbl <- trainsmall_tbl %>%
select(Product, Consumer.complaint.narrative) %>%
mutate(complaint_id = row_number())
# Add numeric labels for Product
product_numlevels <- tibble(
product_label = c(1:4),
Product = c("Mortgage", "Student loan",
"Credit card or prepaid card",
"Vehicle loan or lease") )
trainsmall_tbl <- trainsmall_tbl %>% left_join(product_numlevels)
#head(trainsmall_tbl)
# Add labels to product_label
trainsmall_tbl$product_label <- factor(trainsmall_tbl$product_label,
levels = c(1, 2, 3, 4),
labels = c("Mortgage",
"Student loan",
"Credit card or prepaid card",
"Vehicle loan or lease"))
#### Create tidytext data frame -----------------------
## Tokenize and clean --------
train_tokens <- trainsmall_tbl %>%
unnest_tokens(output = word, input = Consumer.complaint.narrative) %>%
filter(!str_detect(word, "[[:punct:]]")) %>% # remove punc
filter(!str_detect(word, "^[0-9]*$")) %>% # remove numbers
filter(!str_detect(word, "xx")) %>% # remove contains "xx"
anti_join(tidytext::stop_words) %>% # remove stop words
mutate(word = SnowballC::wordStem(word)) %>% # stem the words
mutate_at(.vars = c("word"), .funs = funs(str_squish)) %>% # remove whitespace
mutate_at(.vars = c("word"), .funs = funs(tolower)) # to lowercase
#head(train_tokens)
#### Create document-term matrix (dtm) ---------------
train_dtm <- train_tokens %>%
count(complaint_id, word) %>%
tidytext::cast_dtm(document = complaint_id, term = word, value = n)
## Remove sparse terms
train_dtm <- tm::removeSparseTerms(train_dtm, sparse = .99)
## Convert dtm to matrix, then df/tibble
train_mat <- as.matrix(train_dtm)
train_df <- as.data.frame(cbind(trainsmall_tbl$product_label, train_mat))
train_tbl <- as_tibble(train_df)
## Convert product labels to factor again -----
train_tbl$V1 <- as.factor(train_tbl$V1)
train_tbl$V1 <- factor(train_tbl$V1,
levels = c(1, 2, 3, 4),
labels = c("Mortgage",
"Student loan",
"Credit card or prepaid card",
"Vehicle loan or lease"))
# Convert `V1` (`product_label`) to `dv`
names(train_tbl)[1] <- "dv"
head(train_tbl)
# A tibble: 6 x 877
dv ago appli attach bought breach cfpb charg check claim complaint continu contract
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Mort~ 1 1 3 1 1 1 2 2 1 2 1 1
2 Cred~ 0 0 0 0 0 0 0 0 0 0 0 0
3 Stud~ 0 0 0 0 0 0 0 1 0 0 0 2
4 Stud~ 0 0 0 0 0 0 0 0 0 0 0 0
5 Mort~ 0 0 0 0 0 0 0 0 0 0 0 0
6 Mort~ 0 0 0 0 0 0 0 0 0 0 0 0
tidymodels
## Make a recipe
simple_rec <- train_tbl %>% recipes::recipe(dv ~ .)
## Specifying the model
# Specified hyperparameters
rf_model <- parsnip::rand_forest(mtry = 65, min_n = 30) %>%
parsnip::set_engine("randomForest") %>%
parsnip::set_mode("classification")
## Create workflow
rf_wflow <- workflows::workflow() %>%
workflows::add_recipe(simple_rec) %>%
workflows::add_model(rf_model)
## Fit data to model --------
rf_wflow_fit <- parsnip::fit(rf_wflow, data = train_tbl)
Prepare test set for predict()?
# Take only Consumer.complaint.narrative (Product not available)
test_tbl <- test_org %>%
select(Consumer.complaint.narrative) %>%
mutate(complaint_id = row_number())
# Tokenizing and cleaning of tokens
test_tokens <- test_tbl %>%
unnest_tokens(output = word, input = Consumer.complaint.narrative) %>%
filter(!str_detect(word, "[[:punct:]]")) %>%
filter(!str_detect(word, "^[0-9]*$")) %>%
filter(!str_detect(word, "xx")) %>%
anti_join(tidytext::stop_words) %>%
mutate(word = SnowballC::wordStem(word)) %>%
mutate_at(.vars = c("word"), .funs = funs(str_squish)) %>%
mutate_at(.vars = c("word"), .funs = funs(tolower))
## Create document matrix
test_dtm <- test_tokens %>%
count(complaint_id, word) %>%
tidytext::cast_dtm(document = complaint_id, term = word, value = n)
## Remove sparse terms
test_dtm <- tm::removeSparseTerms(test_dtm, sparse = .99)
## Convert dtm to matrix, then to tibble
test_mat <- as.matrix(test_dtm)
test_tbl <- as_tibble(test_mat)
names(test_tbl)[1:5] # [1] "contact" "credit" "defer" "due" "ecmc"
Predict on test set?
predict(final_rf_wflow_fit, test_tbl)
Error: The following required columns are missing: 'bought', 'breach', 'cfpb', 'counti', 'default', ...