Hyperparameter Tuning in R

Prediction for a binary classification model (penguin sex)

R
model
Author

Tony Duan

Published

March 10, 2024

1 Load packages

Load Packages & Set Options
library(tidyverse)      # data wrangling and plotting
library(tidymodels)     # modelling framework (recipes, tune, workflows)
library(palmerpenguins) # penguin dataset
library(gt)             # better tables
library(bonsai)         # tree-based models
library(conflicted)     # function conflicts
tidymodels_prefer()     # handle conflicts
conflict_prefer("penguins", "palmerpenguins") # use palmerpenguins::penguins

2 data clean and split

Code
# Explore the data: flipper length vs bill length, coloured by sex,
# point size mapped to body mass, one panel per species.
penguins |>
  filter(!is.na(sex)) |>
  ggplot(aes(flipper_length_mm, bill_length_mm,
             color = sex, size = body_mass_g)) +
  geom_point(alpha = 0.5) +
  facet_wrap(vars(species))

Prepare & Split Data
# Keep only penguins with a recorded sex; year and island are not
# used as predictors, so drop them up front.
penguins_df <- penguins |>
  filter(!is.na(sex)) |>
  select(-c(year, island))

# Fix the RNG seed so the split and the folds are reproducible.
set.seed(1234)

# Train/test split (initial_split's default proportion), stratified on
# the outcome so both sets keep the same sex balance.
penguin_split <- initial_split(penguins_df, strata = sex)
penguin_train <- training(penguin_split)
penguin_test <- testing(penguin_split)

# 10-fold cross-validation on the training data, also stratified on sex.
penguin_folds <- vfold_cv(penguin_train, v = 10, strata = sex)

3 Create Recipe

Code
# Recipe: predict sex from every remaining column of the training data.
# step_dummy(all_nominal_predictors()) generalizes the original
# step_dummy(species): after dropping island, species is the only
# nominal predictor, so the result is identical here — but the recipe
# keeps working if more factor columns are added to the data later.
penguin_rec <-
  recipe(sex ~ ., data = penguin_train) |>
  step_dummy(all_nominal_predictors())

4 Specify Model

Code
# Make sure the lightgbm engine package is available before specifying it.
rlang::check_installed("lightgbm")

# Boosted-tree classifier: 100 trees, with the learning rate, early
# stopping, and the engine-specific leaf count left open for tuning.
bt_bonsai_spec <-
  boost_tree(trees      = 100,
             learn_rate = tune(),
             stop_iter  = tune()) |>
  set_mode("classification") |>
  set_engine("lightgbm", num_leaves = tune())

5 Inspect Tuning Parameters

Code
# List the three parameters flagged with tune() in the spec.
extract_parameter_set_dials(bt_bonsai_spec)
Collection of 3 parameters for tuning

 identifier       type    object
 learn_rate learn_rate nparam[+]
  stop_iter  stop_iter nparam[+]
 num_leaves num_leaves nparam[+]
Code
# default dials range for the learning rate (log-10 scale: [-10, -1])
learn_rate()
Learning Rate (quantitative)
Transformer: log-10 [1e-100, Inf]
Range (transformed scale): [-10, -1]
Code
# default dials range for early-stopping iterations ([3, 20])
stop_iter()
# Iterations Before Stopping (quantitative)
Range: [3, 20]
Code
# default dials range for the number of leaves ([5, 100])
num_leaves()
Number of Leaves (quantitative)
Range: [5, 100]

6 Build Grid for Tuning

Code
# Sample 50 candidate hyperparameter combinations with a
# space-filling latin hypercube design.
grid_tune <- grid_latin_hypercube(
  extract_parameter_set_dials(bt_bonsai_spec),
  size = 50
)
Code
# peek at the sampled candidate values
grid_tune |> glimpse(width = 50)
Rows: 50
Columns: 3
$ learn_rate <dbl> 4.499336e-02, 3.174162e-05, 9…
$ stop_iter  <int> 7, 17, 11, 8, 8, 13, 5, 20, 9…
$ num_leaves <int> 81, 39, 34, 74, 22, 79, 46, 8…

7 Fit Models & Tune Hyperparameters

Construct our workflow

Code
# Bundle the model spec and the preprocessing recipe into one workflow
# (the order of add_model()/add_recipe() does not matter).
bt_bonsai_wf <-
  workflow() |>
  add_model(bt_bonsai_spec) |>
  add_recipe(penguin_rec)

Specify the grid control parameters

Code
# Keep out-of-sample predictions and the workflow so we can
# extract and finalize the best model later.
cntl <- control_grid(save_pred = TRUE, save_workflow = TRUE)

7.1 Fit Models & Tune Hyperparameters

Code
# Time the full grid search: 50 candidates x 10 folds.
# (Original used `=` for top-level assignment; `<-` is idiomatic R.)
start_time <- Sys.time()
bt_tune_grid <-
  bt_bonsai_wf |>
  tune_grid(
    resamples = penguin_folds,
    grid      = grid_tune,
    control   = cntl
  )
end_time <- Sys.time()
time <- end_time - start_time
time
Time difference of 15.73666 secs

7.2 Tuning Results

Code
# compare metric values across the tuned parameter values
autoplot(bt_tune_grid)

8 Racing with {finetune}

Code
library(finetune)

# Racing controls: keep predictions and the workflow,
# mirroring the control_grid() settings used earlier.
race_cntl <- control_race(save_pred = TRUE, save_workflow = TRUE)

9 Racing with {finetune}

Code
library(finetune)

# Same control settings as the previous section (kept so this
# slide's code chunk is self-contained).
race_cntl <- control_race(save_pred = TRUE, save_workflow = TRUE)

# ANOVA racing: candidates that are clearly losing are dropped
# before all resamples are evaluated.
bt_tune_race <- tune_race_anova(
  bt_bonsai_wf,
  resamples = penguin_folds,
  grid      = grid_tune,
  control   = race_cntl
)

10 Racing Results

Code
autoplot(bt_tune_race)

11 Racing Results

Code
plot_race(bt_tune_race)

12 Faster tune with more cores

Code
# A larger candidate set: 250 latin-hypercube points instead of 50.
big_grid <- grid_latin_hypercube(
  extract_parameter_set_dials(bt_bonsai_spec),
  size = 250
)
Code
# tune in parallel

# count physical (non-hyper-threaded) cores on this machine
parallel::detectCores(logical = FALSE)
[1] 8
Code
library(doMC)

# Register 6 of the 8 physical cores as the parallel backend.
# NOTE(review): doMC is fork-based and not available on Windows —
# confirm the target platform.
registerDoMC(cores = 6)

# Time the race over the 250-point grid using the parallel backend.
# (Original used `=` for top-level assignment; `<-` is idiomatic R.)
start_time <- Sys.time()
bt_tune_fast <-
  bt_bonsai_wf |>
  tune_race_anova(
    resamples = penguin_folds,
    grid      = big_grid,
    control   = race_cntl
  )
end_time <- Sys.time()
time <- end_time - start_time
time
Time difference of 12.03298 secs
Code
# metric profile of the parallel racing results
autoplot(bt_tune_fast)

Code
# visualise how candidates progressed through the racing stages
plot_race(bt_tune_fast)

Code
# Pick the hyperparameter combination with the best ROC AUC.
bt_best_id <- select_best(bt_tune_fast, metric = "roc_auc")

13

Code
# Best hyperparameters by ROC AUC (recomputed so this chunk stands alone).
bt_best_id <- select_best(bt_tune_fast, metric = "roc_auc")

# Plug the winning parameters into the saved workflow, refit on the
# full training set, and evaluate once on the held-out test set.
best_bt_race <- bt_tune_fast |>
  extract_workflow() |>
  finalize_workflow(bt_best_id) |>
  last_fit(penguin_split)
Code
# Test-set metrics (accuracy and ROC AUC) for the finalized model.
collect_metrics(best_bt_race)
# A tibble: 2 × 4
  .metric  .estimator .estimate .config             
  <chr>    <chr>          <dbl> <chr>               
1 accuracy binary         0.893 Preprocessor1_Model1
2 roc_auc  binary         0.973 Preprocessor1_Model1
Code
# ROC curve of the test-set predictions from the finalized model,
# using the predicted probability of the "female" class.
best_bt_race |>
  collect_predictions() |>
  roc_curve(truth = sex, .pred_female) |>
  autoplot()

14 References:

https://github.com/JamesHWade/r-mlops/blob/main/hyperparameter-tuning.qmd

https://www.youtube.com/watch?v=IzjmuGJgwKQ