How to get a random forest model by group of variables in R?


The dataset contains key variables (year, month and country), and the predictors are many (in the real dataset).
Let´s say that a and b are part of the group of climate variables and c and d the violent[enter image description here][1] group. The aim is to get metrics by "climate" and "violent"
I want to get the RMSE by a group of variables but not by each variable. Still, I am keeping getting results by variable...

Source of the picture: article

countries <- data.frame(
  expand.grid(country = c("Angola", "South Sudan", "Namibia"), year = 2006:2019, month = 1:12),
  deaths = round(runif(9, 1000, 20000), 0),
  a = c(6, 7, 4),
  b = c(5, 8, 9),
  c = c(2, 20, 80),
  d = c(100, 300, 500)
)

#Let´s say that "a" and "b" are part of the group of climate variables
# and "c" and "d" the violent group

#My Key ids are country, year and month


##Open libraries
library(tidymodels)
library(parsnip)
library(forcats)
library(ranger)
library(baguette)
library(lubridate)
library(ranger)
library(DALEX)
library(rlang)
library(future)
###########################################################
set.seed(123)
#split this single dataset into two: a training set and a testing set
data_split <- initial_split(countries)
# Create data frames for the two sets:
train_data <- training(data_split)
test_data  <- testing(data_split)

# resample the data with 10-fold cross-validation (10-fold by default)
cv <- vfold_cv(train_data, v=3)
###########################################################

##Produce the recipe

#getting errors with "rec"
rec <- recipe(deaths~ ., data = countries) %>% 
  step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>% # remove variables with zero variances
  step_novel(all_nominal()) %>% # prepares test data to handle previously unseen factor levels 
  step_impute_median()(all_numeric(), -all_outcomes(), -has_role("id vars"))  %>% # replaces missing numeric observations with the median
  step_dummy(all_nominal(), -has_role("id vars")) # dummy codes categorical variables

#OR?


my_recipe <- train_data%>%
  recipe(deaths~ .) %>%
  add_role(a,b, new_role = "climate") %>%
  add_role(c, d, new_role = "conflict") %>%
  #step_rm(iso3c, year, month) %>%
  #step_dummy(all_nominal()) %>% 
  step_novel(all_nominal()) %>% 
  step_impute_median((all_numeric())) %>%
  step_string2factor(all_nominal())%>%
  prep()

my_recipe

###################################################################################


###################################################
##Random forests
###################################################

mod_rf <-rand_forest(trees = 1e3) %>%
  set_engine("ranger",
             num.threads = parallel::detectCores(), 
             importance = "permutation", 
             verbose = TRUE) %>% 
  set_mode("regression") 

##Create Workflow

wflow_rf <- workflow() %>% 
  add_model(mod_rf) %>% 
  add_recipe(my_recipe)

##Fit the model

plan(multisession)

fit_rf<-fit_resamples(
  wflow_rf,
  cv,
  metrics = metric_set(rmse, rsq),
  control = control_resamples(save_pred = TRUE,
                              extract = function(x) extract_model(x)))


# extract roots
rf_tree_roots <- function(x){
  map_chr(1:1000, 
          ~ranger::treeInfo(x, tree = .)[1, "splitvarName"])
}

rf_roots <- function(x){
  x %>% 
    select(.extracts) %>% 
    unnest(cols = c(.extracts)) %>% 
    mutate(oob_rmse = map_dbl(.extracts,
                              ~sqrt(.x$prediction.error)),
           roots = map(.extracts, 
                       ~rf_tree_roots(.))
    ) %>% 
    dplyr::select(roots) %>% 
    unnest(cols = c(roots))
}

#

# plot
rf_roots(fit_rf) %>% 
  group_by(roots) %>% 
  count() %>% 
  dplyr::arrange(desc(n)) %>% 
  dplyr::filter(n > 75) %>% 
  ggplot(aes(fct_reorder(roots, n), n)) +
  geom_col() + 
  coord_flip() + 
  labs(x = "root", y = "count")

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.