There is a new collection of packages in the tidyverse ecosystem for modeling: tidymodels. Like the tidyverse, tidymodels is a collection of packages that follow the same basic underlying principles, but their central topic is modeling. Since it hasn't been around for long, I decided to give it a go on my own datasets. As a rough guideline I followed the great posts on Julia Silge's blog.
Load the packages needed for this post:
library(stravamodel)
library(tidyverse)
library(knitr)
library(drake)
library(here)
library(fs)
theme_set(theme_light())
Get all bike rides from my private GitHub repository. Select the relevant columns and parse start_date_local into a timestamp (start_timestamp_local). For some old rides there is no gear_id, because I didn't maintain this information back then. In these cases add 'Endurace' as gear. Finally, deselect the no longer needed gear_id and start_date_local columns.
bike_rides <- function() {
  # Get the raw activity data from the pin board on GitHub
  pins::board_register_github(repo = "duju211/strava_data")
  df_act_raw <- pins::pin_get("strava_activities", board = "github")
  pins::board_disconnect("github")

  rel_vars <- c(
    "id", "distance", "elapsed_time", "moving_time", "total_elevation_gain",
    "start_date_local", "gear_id", "average_speed", "max_speed",
    "average_temp", "elev_high", "elev_low", "pr_count")

  # Keep only bike rides, parse the start timestamp and translate gear_id
  # into a readable gear name ('Endurace' for old rides without a gear_id)
  df_act_bike <- df_act_raw %>%
    dplyr::filter(type == "Ride") %>%
    dplyr::select(dplyr::one_of(rel_vars)) %>%
    dplyr::mutate(
      start_timestamp_local = lubridate::ymd_hms(start_date_local),
      gear = dplyr::case_when(
        gear_id == "b6601295" ~ "Aeroad",
        gear_id == "b4797231" ~ "Endurace",
        TRUE ~ "Endurace")) %>%
    dplyr::select(-c(start_date_local, gear_id))
}
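This post builds its objects with drake (hence the function-per-step pattern); outside of a plan, the data frame in the next step would be created simply by calling:
df_bike_rides <- bike_rides()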
Save the output of the function in the df_bike_rides data frame and take a quick look at the scatter plot matrix produced with the GGally package:
vis_bike_rides_eda <- function(df_bike_rides) {
  df_bike_rides %>%
    dplyr::select(-c(id, pr_count)) %>%
    GGally::ggscatmat(color = "gear", alpha = 0.6) +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90))
}
Create resamples of the data set:
init_split <- rsample::initial_split(df_bike_rides)
df_train <- rsample::training(init_split)
df_test <- rsample::testing(init_split)
df_resamples <- rsample::bootstraps(df_train)
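By default, initial_split() reserves a quarter of the rows for the test set and bootstraps() draws 25 bootstrap resamples. Printing the objects is a quick way to confirm the sizes:
init_split
df_resamples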
Define the random forest model. Set the mode of the model to 'regression' and define mtry and min_n as tuning parameters.
rand_forest_model <- function() {
  cores <- parallel::detectCores()

  # Mark mtry and min_n as placeholders to be filled in by the tuning step
  rf_mod <- parsnip::rand_forest(
    trees = 1000, mtry = tune::tune(), min_n = tune::tune()) %>%
    parsnip::set_engine(
      "ranger", num.threads = cores, importance = "impurity") %>%
    parsnip::set_mode("regression")
}
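To check how parsnip maps this specification onto the underlying ranger::ranger() call, the spec can be translated (a quick sketch):
rand_forest_model() %>% parsnip::translate()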
Define the preprocessing steps for the random forest model:
rand_forest_rec <- function(df_train) {
  # Model average_speed; treat id as an identifier only, drop the raw
  # timestamp and median-impute the columns containing missing values
  strava_rec <- recipes::recipe(average_speed ~ ., data = df_train) %>%
    recipes::update_role(id, new_role = "ID") %>%
    recipes::step_rm(start_timestamp_local) %>%
    recipes::step_medianimpute(average_temp, elev_high, elev_low)
}
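To see what the recipe actually does to the training data, it can be prepped and juiced (a sketch, assuming df_train from above):
rec_prep <- recipes::prep(rand_forest_rec(df_train), training = df_train)
recipes::juice(rec_prep) %>% dplyr::glimpse()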
Combine the model and the preprocessing definition into one 'workflow':
rand_forest_workflow <- function(rf_rec, rf_model) {
  workflows::workflow() %>%
    workflows::add_recipe(rf_rec) %>%
    workflows::add_model(rf_model)
}
Tune the workflow:
rand_forest_tune <- function(rf_workflow, df_resamples) {
  tune::tune_grid(
    rf_workflow, df_resamples,
    control = tune::control_grid(save_pred = TRUE))
}
Take a look at the result of the tuning process:
plot_tuning_results <- function(tune_res) {
  tune_res %>%
    tune::collect_metrics() %>%
    tidyr::pivot_longer(
      cols = c(mtry, min_n), names_to = "parameter", values_to = "value") %>%
    ggplot2::ggplot(ggplot2::aes(x = value, y = mean, color = .metric)) +
    ggplot2::geom_point() +
    ggplot2::facet_wrap(~ parameter, scales = "free_x")
}
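The same information is also available as a ranking via show_best(); the tuning result is stored as rf_tune_res in the next step:
tune::show_best(rf_tune_res, metric = "rmse")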
Select the best combination of tuning parameters and finalise the workflow with these parameters:
best_rmse <- tune::select_best(rf_tune_res, metric = "rmse")
final_rf <- tune::finalize_workflow(rf_workflow, best_rmse)
The final workflow looks like this:
## == Workflow ==========================================================================================================================================
## Preprocessor: Recipe
## Model: rand_forest()
##
## -- Preprocessor --------------------------------------------------------------------------------------------------------------------------------------
## 2 Recipe Steps
##
## * step_rm()
## * step_medianimpute()
##
## -- Model ---------------------------------------------------------------------------------------------------------------------------------------------
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 3
## trees = 1000
## min_n = 3
##
## Engine-Specific Arguments:
## num.threads = cores
## importance = impurity
##
## Computational engine: ranger
Visualise the variable importance:
vis_vip <- function(final_rf, df_train) {
  final_rf %>%
    parsnip::fit(data = df_train) %>%
    workflows::pull_workflow_fit() %>%
    vip::vip(geom = "point")
}
readd(gg_vip, path = path_to_drake)
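The test set from the initial split has not been used so far. A natural last step, not part of the plan above, would be to fit the finalised workflow on the full training data and evaluate it once on the held-out rides (a sketch):
final_res <- tune::last_fit(final_rf, init_split)
tune::collect_metrics(final_res)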