Predicting on a test set document-term matrix with different columns

bayesian · December 26, 2020, 11:37am

I am trying my hand on categorising sentences into Product category using a RF model. The dataset (US consumer complaints; smaller datasets for reprex here) is in my instance provided separately for train and test set, precluding the use of rsample::initial_split().

Clumsily or not, I managed to successfully tokenize and manipulate the training data into a document-term matrix for fitting in tidymodels. I repeated the steps on my test set before using predict(). But because the content of the complaint sentences are different, the tokenized words and dtm columns are different, resulting in this error:

Error: The following required columns are missing: 'bought', 'breach', 'cfpb', 'counti', 'default', ...

How can I go about it?

library(tidyverse)
library(tidymodels)
library(tidytext)
library(tm) 

# Import data
trainsmall_tbl <- readr::read_csv("complaints_train.csv")
test_org <- readr::read_csv("complaints_test.csv")

# Take only Product and Consumer.complaint.narrative
trainsmall_tbl <- trainsmall_tbl %>% 
  select(Product, Consumer.complaint.narrative) %>% 
  mutate(complaint_id = row_number()) 

# Add numeric labels for Product
product_numlevels <- tibble(
                      product_label = c(1:4), 
                      Product = c("Mortgage", "Student loan", 
                                  "Credit card or prepaid card", 
                                  "Vehicle loan or lease")   )

trainsmall_tbl <- trainsmall_tbl %>% left_join(product_numlevels)

#head(trainsmall_tbl)

# Add labels to product_label
trainsmall_tbl$product_label <- factor(trainsmall_tbl$product_label,
                                       levels = c(1, 2, 3, 4), 
                                       labels = c("Mortgage",
                                                  "Student loan",
                                                  "Credit card or prepaid card",
                                                  "Vehicle loan or lease"))

#### Create tidytext data frame -----------------------
## Tokenize and clean --------
train_tokens <- trainsmall_tbl %>% 
  unnest_tokens(output = word, input = Consumer.complaint.narrative) %>% 
  filter(!str_detect(word, "[[:punct:]]")) %>%                  # remove punc 
  filter(!str_detect(word, "^[0-9]*$")) %>%                   # remove numbers
  filter(!str_detect(word, "xx")) %>%                               # remove contains "xx"
  anti_join(tidytext::stop_words) %>%                           # remove stop words
  mutate(word = SnowballC::wordStem(word)) %>%        # stem the words
  mutate_at(.vars = c("word"), .funs = funs(str_squish)) %>% # remove whitespace
  mutate_at(.vars = c("word"), .funs = funs(tolower))                # to lowercase

#head(train_tokens)

#### Create document-term matrix (dtm) ---------------
train_dtm <- train_tokens %>% 
  count(complaint_id, word) %>% 
  tidytext::cast_dtm(document = complaint_id, term = word, value = n)

## Remove sparse terms 
train_dtm <- tm::removeSparseTerms(train_dtm, sparse = .99)

## Convert dtm to matrix, then df/tibble
train_mat <- as.matrix(train_dtm)
train_df <- as.data.frame(cbind(trainsmall_tbl$product_label, train_mat))
train_tbl <- as_tibble(train_df)

## Convert product labels to factor again -----
train_tbl$V1 <- as.factor(train_tbl$V1)

train_tbl$V1 <- factor(train_tbl$V1,
                      levels = c(1, 2, 3, 4), 
                      labels = c("Mortgage",
                                 "Student loan",
                                 "Credit card or prepaid card",
                                 "Vehicle loan or lease"))

# Convert `V1` (`product_label`) to `dv`
names(train_tbl)[1] <- "dv"

head(train_tbl)

# A tibble: 6 x 877
  dv      ago appli attach bought breach  cfpb charg check claim complaint continu contract
  <fct> <dbl> <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>     <dbl>   <dbl>    <dbl>
1 Mort~     1     1      3      1      1     1     2     2     1         2       1        1
2 Cred~     0     0      0      0      0     0     0     0     0         0       0        0
3 Stud~     0     0      0      0      0     0     0     1     0         0       0        2
4 Stud~     0     0      0      0      0     0     0     0     0         0       0        0
5 Mort~     0     0      0      0      0     0     0     0     0         0       0        0
6 Mort~     0     0      0      0      0     0     0     0     0         0       0        0

tidymodels

## Make a recipe
simple_rec <- train_tbl %>% recipes::recipe(dv ~ .)

## Specifying the model
# Specified hyperparameters
rf_model <- parsnip::rand_forest(mtry = 65, min_n = 30) %>% 
            parsnip::set_engine("randomForest") %>% 
            parsnip::set_mode("classification")

## Create workflow
rf_wflow <- workflows::workflow() %>% 
            workflows::add_recipe(simple_rec) %>% 
            workflows::add_model(rf_model)

## Fit data to model --------
rf_wflow_fit <- parsnip::fit(rf_wflow, data = train_tbl)

Prepare test set for predict()?

# Take only Consumer.complaint.narrative (Product not available)
test_tbl <- test_org %>% 
  select(Consumer.complaint.narrative) %>%  
  mutate(complaint_id = row_number())  

# Tokenizing and cleaning of tokens
test_tokens <- test_tbl %>% 
  unnest_tokens(output = word, input = Consumer.complaint.narrative) %>% 
  filter(!str_detect(word, "[[:punct:]]")) %>%               
  filter(!str_detect(word, "^[0-9]*$")) %>%                
  filter(!str_detect(word, "xx")) %>%                      
  anti_join(tidytext::stop_words) %>%                   
  mutate(word = SnowballC::wordStem(word)) %>%       
  mutate_at(.vars = c("word"), .funs = funs(str_squish)) %>% 
  mutate_at(.vars = c("word"), .funs = funs(tolower))   

## Create document matrix
test_dtm <- test_tokens %>% 
  count(complaint_id, word) %>% 
  tidytext::cast_dtm(document = complaint_id, term = word, value = n)

## Remove sparse terms
test_dtm <- tm::removeSparseTerms(test_dtm, sparse = .99)

## Convert dtm to matrix, then to tibble
test_mat <- as.matrix(test_dtm)
test_tbl <- as_tibble(test_mat) 

names(test_tbl)[1:5] # [1]  "contact" "credit"  "defer"   "due"     "ecmc"

Predict on test set?

predict(final_rf_wflow_fit, test_tbl)

Error: The following required columns are missing: 'bought', 'breach', 'cfpb', 'counti', 'default', ...

nirgrahamuk · December 26, 2020, 8:30pm

there might be a more parsnip/tidymodels way of making a pipeline to do this more elegantly, but I havent studied that, so I rely on simple use of the training model words stored as a factor, and applied as a mask over the tokens found in test, so as to impose commonality.

library(tidyverse)
library(tidymodels)
library(tidytext)
library(tm) 

# Import data
trainsmall_tbl <- readr::read_csv("complaints_train.csv")
test_org <- readr::read_csv("complaints_test.csv")

# Take only Product and Consumer.complaint.narrative
trainsmall_tbl <- trainsmall_tbl %>% 
  select(Product, Consumer.complaint.narrative) %>% 
  mutate(complaint_id = row_number()) 

# Add numeric labels for Product
product_numlevels <- tibble(
  product_label = c(1:4), 
  Product = c("Mortgage", "Student loan", 
              "Credit card or prepaid card", 
              "Vehicle loan or lease")   )

trainsmall_tbl <- trainsmall_tbl %>% left_join(product_numlevels)

#head(trainsmall_tbl)

# Add labels to product_label
trainsmall_tbl$product_label <- factor(trainsmall_tbl$product_label,
                                       levels = c(1, 2, 3, 4), 
                                       labels = c("Mortgage",
                                                  "Student loan",
                                                  "Credit card or prepaid card",
                                                  "Vehicle loan or lease"))

#### Create tidytext data frame -----------------------
## Tokenize and clean --------
train_tokens <- trainsmall_tbl %>% 
  unnest_tokens(output = word, input = Consumer.complaint.narrative) %>% 
  filter(!str_detect(word, "[[:punct:]]")) %>%                  # remove punc 
  filter(!str_detect(word, "^[0-9]*$")) %>%                   # remove numbers
  filter(!str_detect(word, "xx")) %>%                               # remove contains "xx"
  anti_join(tidytext::stop_words) %>%                           # remove stop words
  mutate(word = SnowballC::wordStem(word)) %>%        # stem the words
  mutate_at(.vars = c("word"), .funs = funs(str_squish)) %>% # remove whitespace
  mutate_at(.vars = c("word"), .funs = funs(tolower))                # to lowercase

#head(train_tokens)

#### Create document-term matrix (dtm) ---------------
train_dtm <- train_tokens %>% 
  count(complaint_id, word) %>% 
  tidytext::cast_dtm(document = complaint_id, term = word, value = n)

## Remove sparse terms 
(train_dtm <- tm::removeSparseTerms(train_dtm, sparse = .95))

## Convert dtm to matrix, then df/tibble
train_mat <- as.matrix(train_dtm)
train_df <- as.data.frame(cbind(trainsmall_tbl$product_label, train_mat))
train_tbl <- as_tibble(train_df)

## Convert product labels to factor again -----
train_tbl$V1 <- as.factor(train_tbl$V1)

train_tbl$V1 <- factor(train_tbl$V1,
                       levels = c(1, 2, 3, 4), 
                       labels = c("Mortgage",
                                  "Student loan",
                                  "Credit card or prepaid card",
                                  "Vehicle loan or lease"))

# Convert `V1` (`product_label`) to `dv`
names(train_tbl)[1] <- "dv"

head(train_tbl)

model_words <- names(train_tbl) %>% as_factor()

## Make a recipe
simple_rec <- train_tbl %>% recipes::recipe(dv ~ .)

## Specifying the model
# Specified hyperparameters
rf_model <- parsnip::rand_forest( min_n = 10) %>% 
  parsnip::set_engine("randomForest") %>% 
  parsnip::set_mode("classification")

## Create workflow
rf_wflow <- workflows::workflow() %>% 
  workflows::add_recipe(simple_rec) %>% 
  workflows::add_model(rf_model)

## Fit data to model --------
rf_wflow_fit <- parsnip::fit(rf_wflow, data = train_tbl)


# Take only Consumer.complaint.narrative (Product not available)
test_tbl <- test_org %>% 
  select(Consumer.complaint.narrative) %>%  
  mutate(complaint_id = row_number())  

# Tokenizing and cleaning of tokens
test_tokens <- test_tbl %>% 
  unnest_tokens(output = word, input = Consumer.complaint.narrative) %>% 
  filter(!str_detect(word, "[[:punct:]]")) %>%               
  filter(!str_detect(word, "^[0-9]*$")) %>%                
  filter(!str_detect(word, "xx")) %>%                      
  anti_join(tidytext::stop_words) %>%                   
  mutate(word = SnowballC::wordStem(word)) %>%       
  mutate_at(.vars = c("word"), .funs = funs(str_squish)) %>% 
  mutate_at(.vars = c("word"), .funs = funs(tolower))   

test_tokens$word <- factor(test_tokens$word,
                      levels=levels(model_words))
## Create document matrix
test_dtm <- test_tokens %>% 
  count(complaint_id, word,.drop = FALSE) %>% filter(!is.na(word)) %>% 
  tidytext::cast_dtm(document = complaint_id, term = word, value = n)

## Convert dtm to matrix, then to tibble
test_mat <- as.matrix(test_dtm)
test_tbl <- as_tibble(test_mat) 

predict(rf_wflow_fit, test_tbl)

bayesian · December 27, 2020, 3:33pm

Interesting method! This does the job. Thanks and happy holidays

system · January 3, 2021, 3:33pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.