Hi All,
I'm in the early stages of my machine learning journey and need some help with a model I'm working on. I am trying to predict hourly bike rentals using a dataset from the UCI Repository. The feature I am trying to predict is the "rented_count". I am testing a GLMNET model, and the best MAE from my cross-validation is 371.92. I know there are more powerful models to explore; however, I wanted to see whether there are ways to improve my current model. Are there additional preprocessing steps I can do to improve my current model?
Below is my code. Here are the preprocessing steps I performed:
- Create a "month" feature from the "date"
- Create a "day of the week" feature from the "date"
- Create a "weekend/weekday" feature from the "date"
I noticed several features are skewed in the dataset (rented_count, dew_point, windspeed, etc). I tried including a "step_YeoJohnson" which I understand can be used to transform skewed data, however I was not sure how to interpret the MAE metric as it was also in the transformed format.
Please see below for my code. Any help will be appreciated.
# Load libraries
library(tidyverse)
library(tidymodels)
library(lubridate)
# Load dataset
# The CSV's own header row is skipped (skip = 1) and replaced with the short
# snake_case names defined below.
seoul_bikes_data_path <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv"
colnames <- c(
  "date", "rented_count", "hour", "temp", "humidity", "windspeed",
  "visibility", "dew_point", "solar_rad", "rainfall", "snowfall",
  "season", "holiday", "functional_day"
)
seoul_bikes_tbl <- as_tibble(
  read.csv(seoul_bikes_data_path, header = FALSE, skip = 1)
)
names(seoul_bikes_tbl) <- colnames
# PREPROCESSING STEPS
# Derives three calendar features from `date` (month-year label, weekday label,
# weekend flag), turns `hour` and every character column into a factor, and
# drops the raw `date` column.
#
# Month and weekday labels are built from fixed English abbreviations rather
# than `month(label = TRUE)` / `wday(label = TRUE)`: those return labels in the
# session's locale, which would silently break the "Sat"/"Sun" comparison and
# the "Dec - 2017" ... "Nov - 2018" releveling on a non-English system.
seoul_bikes_tbl2 <- seoul_bikes_tbl %>%
  # Change date from chr to date format (day-month-year)
  mutate(date = dmy(date)) %>%
  mutate(
    # Month feature such as "Dec - 2017"
    month = paste(month.abb[month(date)], year(date), sep = " - "),
    # Weekday abbreviation; week_start = 1 makes wday() return 1 for Monday,
    # matching the order of the lookup vector
    day = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[
      wday(date, week_start = 1)
    ],
    # Weekend / weekday flag derived from the weekday abbreviation
    weekend = if_else(day %in% c("Sat", "Sun"), "Weekend", "Weekday"),
    hour = as.factor(hour)
  ) %>%
  # Remove unwanted columns
  select(-date) %>%
  # Convert chr features (including the new month, day and weekend) to factors
  mutate(across(where(is.character), as.factor)) %>%
  # Put the month levels in chronological rather than alphabetical order
  mutate(
    month = fct_relevel(
      month,
      "Dec - 2017", "Jan - 2018", "Feb - 2018", "Mar - 2018",
      "Apr - 2018", "May - 2018", "Jun - 2018", "Jul - 2018",
      "Aug - 2018", "Sep - 2018", "Oct - 2018", "Nov - 2018"
    )
  )
# MODELLING
# Train/test split: 80% of the rows go to training; the seed makes the split
# reproducible
set.seed(100)
seoul_initial_split <- initial_split(seoul_bikes_tbl2, prop = 0.80)
seoul_train_tbl <- seoul_initial_split %>% training()
seoul_test_tbl <- seoul_initial_split %>% testing()
# Cross Validation Specs: 10 folds, repeated twice, on the training set only
set.seed(101)
seoul_folds <- seoul_train_tbl %>%
  vfold_cv(v = 10, repeats = 2)
# Recipe
# Deliberately left untrained (no prep()): the recipe is handed to a workflow,
# which estimates the steps itself when it is fitted or tuned, and
# add_recipe() does not accept an already-trained recipe.
seoul_recipe <- recipe(rented_count ~ ., data = seoul_train_tbl) %>%
  # Drop predictors that contain only a single value
  step_zv(all_predictors()) %>%
  # Replace factor columns with numeric indicator (dummy) columns
  step_dummy(all_nominal())
# Glmnet Model Spec
# Linear regression fitted with the glmnet engine; penalty and mixture are
# tune() placeholders to be filled in by the grid search.
seoul_glmnet_model <- set_engine(
  linear_reg(mode = "regression", penalty = tune(), mixture = tune()),
  "glmnet"
)
# Workflow
# Bundle the preprocessing recipe and the model spec into a single object
seoul_glmnet_wf <- workflow() %>%
  add_recipe(seoul_recipe) %>%
  add_model(seoul_glmnet_model)
# Create tuning grid
# 20 candidate (penalty, mixture) pairs from a maximum-entropy design; the seed
# is set immediately before the grid is generated.
seoul_glmnet_params <- parameters(penalty(), mixture())
set.seed(103)
seoul_glmnet_tune_grid <- seoul_glmnet_params %>%
  grid_max_entropy(size = 20)
# Tune Hyper-parameters
# Evaluate every grid candidate on every resample and record four metrics.
# NOTE(review): mape is undefined for rows where rented_count is 0 — confirm
# whether the data contains such rows before relying on it.
seoul_glmnet_tune_results <- seoul_glmnet_wf %>%
  tune_grid(
    resamples = seoul_folds,
    grid = seoul_glmnet_tune_grid,
    metrics = metric_set(mae, mape, rmse, rsq),
    control = control_grid(verbose = TRUE)
  )
# Show Best MAE
# The metric is passed by name and `maximize` is omitted: show_best() takes the
# optimisation direction from the metric itself (lower is better for MAE), and
# current tune versions reject the positional metric and the deprecated
# `maximize` argument.
seoul_glmnet_tune_results %>%
  show_best(metric = "mae", n = 10)