I was trying to build a two-category text classification model using random forests, with author
being resposne and various word usage as predictors.
library(tidyverse)
library(tidymodels)
library(gutenbergr)
library(tidytext)
titles <- c(
"Wuthering Heights",
"Jane Eyre: An Autobiography"
)
books <- gutenberg_works() %>%
filter(title %in% titles) %>%
gutenberg_download(meta_fields = "title") %>%
transmute(author = if_else(title == "Jane Eyre: An Autobiography",
"Charlotte Brontë",
"Emily Brontë") %>% factor(),
line_index = row_number(),
text)
tidy_books <- books %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
filter(!str_detect(word, "^\\d+$")) %>%
mutate(word = str_remove_all(word, "_")) %>%
group_by(word) %>%
filter(n() > 40) %>%
ungroup() %>%
mutate(word = str_remove(word, "'s"))
model_df <- tidy_books %>%
count(line_index, word) %>%
pivot_wider(names_from = word, values_from = n,
values_fill = list(n = 0)) %>%
left_join(books) %>%
select(-text)
book_split <- initial_split(model_df)
book_train <- training(book_split)
book_test <- testing(book_split)
book_folds <- vfold_cv(book_train)
rf_spec <- rand_forest(mtry = tune(), trees = 500, min_n = 10) %>%
set_engine("ranger") %>%
set_mode("classification")
rf_rec <- recipe(author ~ ., data = book_train) %>%
update_role(line_index, new_role = "ID")
rf_wf_tune <- workflow() %>%
add_model(rf_spec) %>%
add_recipe(rf_rec)
mtry_grid <- grid_regular(finalize(mtry(), book_train),
levels = 10)
rf_results <- tune_grid(rf_wf_tune,
resamples = book_folds,
grid = mtry_grid)
#> All models failed in tune_grid(). See the .notes column
The .notes
column contains message like 'model 1/10: Error in parse.formula(formula, data, env = parent.frame()): Error: Illegal column names in formula interface. Fix column names or use alternative interface in ranger.'
I suspect this is becuase I have some illegal column names? But I found no such in colnames(book_train)
colnames(book_train) %>%
str_subset("^(\\d|[:punct:])")
#> character(0)