Predicting the Speed of my Bike Rides

There is a new package in the tidyverse ecosystem for modeling. Like the tidyverse, the new tidymodels package is a collection of packages. It follows the same basic underlying principles of the tidyverse package, but the central topic of this collection of packages is modeling. Since it’s not around for long, I decided to give it a go on my own datasets. As a rough guideline I followed the great blog posts from Julia Silge’s blog.

Load the packages needed for this post:

library(stravamodel)
library(tidyverse)
library(knitr)
library(drake)
library(here)
library(fs)

theme_set(theme_light())

Data

Get all bike rides from my private Github Repository. Select relevant columns and parse start_date_local into a timestamp (start_timestamp_local). For some rides there is no gear_id. This is because for some old rides, I didn’t maintain this information. For these cases add ‘Endurace’ as gear. Deselect the no longer needed gear_id and start_date_local columns.

bike_rides <- function() {
  pins::board_register_github(repo = "duju211/strava_data")

  df_act_raw <- pins::pin_get("strava_activities", board = "github")

  pins::board_disconnect("github")

  rel_vars <- c(
    "id", "distance", "elapsed_time", "moving_time", "total_elevation_gain",
    "start_date_local", "gear_id", "average_speed", "max_speed",
    "average_temp", "elev_high", "elev_low", "pr_count")

  df_act_bike <- df_act_raw %>%
    dplyr::filter(type == "Ride") %>%
    dplyr::select(dplyr::one_of(rel_vars)) %>%
    dplyr::mutate(
      start_timestamp_local = lubridate::ymd_hms(start_date_local),
      gear = dplyr::case_when(
        gear_id == "b6601295" ~ "Aeroad",
        gear_id == "b4797231" ~ "Endurace",
        TRUE ~ "Endurace")) %>%
    dplyr::select(-c(start_date_local, gear_id))
}

Save the output of the function in the df_bike_rides data frame and take a quick look at the scatter plot matrix produced with the GGally package:

vis_bike_rides_eda <- function(df_bike_rides) {
  df_bike_rides %>%
    dplyr::select(-c(id, pr_count)) %>%
    GGally::ggscatmat(color = "gear", alpha = 0.6) +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90))
}

Modelling

Create resamples of the data set:

init_split <- rsample::initial_split(df_bike_rides)

df_train <- rsample::training(init_split)
df_test <- rsample::testing(init_split)

df_resamples <- rsample::bootstraps(df_train)

Define the random forest model. Set the mode of the model to ‘regression’ and define mtry and min_n as tuning parameters.

rand_forest_model <- function() {
  cores <- parallel::detectCores()

  rf_mod <- parsnip::rand_forest(trees = 1000, mtry = tune(),
                                 min_n = tune()) %>%
    parsnip::set_engine(
      "ranger", num.threads = cores, importance = "impurity") %>%
    parsnip::set_mode("regression")
}

Define the preprocessing steps for the random forest model:

rand_forest_rec <- function(df_train) {
  strava_rec <- recipes::recipe(average_speed ~ ., data = df_train) %>%
    recipes::update_role(id, new_role = "ID") %>%
    recipes::step_rm(start_timestamp_local) %>%
    recipes::step_medianimpute(average_temp, elev_high, elev_low)
}

Combine the model and the preprocessing definition into one ‘workflow’

rand_forest_workflow <- function(rf_rec, rf_model) {
  workflows::workflow() %>%
    workflows::add_recipe(rf_rec) %>%
    workflows::add_model(rf_model)
}

Tune the workflow:

rand_forest_tune <- function(rf_workflow, df_resamples) {
  tune::tune_grid(
    rf_workflow, df_resamples,
    control = tune::control_grid(save_pred = TRUE))
}

Take a look at the result of the tuning process:

plot_tuning_results <- function(tune_res) {
  tune_res %>%
    tune::collect_metrics() %>%
    tidyr::pivot_longer(
      cols = c(mtry, min_n), names_to = "parameter", values_to = "value") %>%
    ggplot2::ggplot(ggplot2::aes(x = value, y = mean, color = .metric)) +
    ggplot2::geom_point() +
    ggplot2::facet_wrap(~ parameter, scales = "free_x")
}

Select the best combination of tuning parameters and finalise the workflow with these parameters:

best_rmse <- tune::select_best(rf_tune_res, metric = "rmse")
final_rf <- tune::finalize_workflow(rf_workflow, best_rmse)

The final workflow looks like this:

## == Workflow ==========================================================================================================================================
## Preprocessor: Recipe
## Model: rand_forest()
## 
## -- Preprocessor --------------------------------------------------------------------------------------------------------------------------------------
## 2 Recipe Steps
## 
## * step_rm()
## * step_medianimpute()
## 
## -- Model ---------------------------------------------------------------------------------------------------------------------------------------------
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   mtry = 3
##   trees = 1000
##   min_n = 3
## 
## Engine-Specific Arguments:
##   num.threads = cores
##   importance = impurity
## 
## Computational engine: ranger

Visualise the variable importance:

vis_vip <- function(final_rf, df_train) {
  final_rf %>%
    parsnip::fit(data = df_train) %>%
    workflows::pull_workflow_fit() %>%
    vip::vip(geom = "point")
}
readd(gg_vip, path = path_to_drake)