Hyperparameter Tuning in R

Prediction for a binary classification model (penguin sex)

R
model
Author

Tony Duan

Published

March 10, 2024

1 Load packages

Load Packages & Set Options
library(tidyverse)      # data wrangling and plotting
library(tidymodels)     # modelling framework (recipes, tune, workflows)
library(palmerpenguins) # penguin dataset
library(gt)             # better tables
library(bonsai)         # tree-based models
library(conflicted)     # function conflicts
tidymodels_prefer()     # handle conflicts
conflict_prefer("penguins", "palmerpenguins") # use palmerpenguins::penguins

2 data clean and split

Code
# Explore the data: flipper length vs bill length, coloured by sex,
# point size mapped to body mass, one panel per species.
penguins |>
  filter(!is.na(sex)) |>
  ggplot(aes(flipper_length_mm, bill_length_mm,
             color = sex, size = body_mass_g)) +
  geom_point(alpha = 0.5) +
  facet_wrap(vars(species))

Prepare & Split Data
# Keep only penguins with a recorded sex; year and island are not
# used as predictors, so drop them up front.
penguins_df <- penguins |>
  filter(!is.na(sex)) |>
  select(-c(year, island))

# Fix the RNG seed so the split and the folds are reproducible.
set.seed(1234)

# Train/test split (initial_split's default proportion), stratified on
# the outcome so both sets keep the same sex balance.
penguin_split <- initial_split(penguins_df, strata = sex)
penguin_train <- training(penguin_split)
penguin_test <- testing(penguin_split)

# 10-fold cross-validation on the training data, also stratified on sex.
penguin_folds <- vfold_cv(penguin_train, v = 10, strata = sex)

3 Create Recipe

Code
# Recipe: predict sex from every remaining column of the training data.
# step_dummy(all_nominal_predictors()) generalizes the original
# step_dummy(species): after dropping island, species is the only
# nominal predictor, so the result is identical here — but the recipe
# keeps working if more factor columns are added to the data later.
penguin_rec <-
  recipe(sex ~ ., data = penguin_train) |>
  step_dummy(all_nominal_predictors())

4 Specify Model

Code
# Make sure the lightgbm engine package is available before specifying it.
rlang::check_installed("lightgbm")

# Boosted-tree classifier: 100 trees, with the learning rate, early
# stopping, and the engine-specific leaf count left open for tuning.
bt_bonsai_spec <-
  boost_tree(trees      = 100,
             learn_rate = tune(),
             stop_iter  = tune()) |>
  set_mode("classification") |>
  set_engine("lightgbm", num_leaves = tune())

5 Inspect Tuning Parameters

Code
# List the three parameters flagged with tune() in the spec.
extract_parameter_set_dials(bt_bonsai_spec)
Collection of 3 parameters for tuning

 identifier       type    object
 learn_rate learn_rate nparam[+]
  stop_iter  stop_iter nparam[+]
 num_leaves num_leaves nparam[+]
Code
# default dials range for the learning rate (log-10 scale: [-10, -1])
learn_rate()
Learning Rate (quantitative)
Transformer: log-10 [1e-100, Inf]
Range (transformed scale): [-10, -1]
Code
# default dials range for early-stopping iterations ([3, 20])
stop_iter()
# Iterations Before Stopping (quantitative)
Range: [3, 20]
Code
# default dials range for the number of leaves ([5, 100])
num_leaves()
Number of Leaves (quantitative)
Range: [5, 100]

6 Build Grid for Tuning

Code
# Sample 50 candidate hyperparameter combinations with a
# space-filling latin hypercube design.
grid_tune <- grid_latin_hypercube(
  extract_parameter_set_dials(bt_bonsai_spec),
  size = 50
)
Code
# peek at the sampled candidate values
grid_tune |> glimpse(width = 50)
Rows: 50
Columns: 3
$ learn_rate <dbl> 4.499336e-02, 3.174162e-05, 9…
$ stop_iter  <int> 7, 17, 11, 8, 8, 13, 5, 20, 9…
$ num_leaves <int> 81, 39, 34, 74, 22, 79, 46, 8…

7 Fit Models & Tune Hyperparameters

Construct our workflow

Code
# Bundle the model spec and the preprocessing recipe into one workflow
# (the order of add_model()/add_recipe() does not matter).
bt_bonsai_wf <-
  workflow() |>
  add_model(bt_bonsai_spec) |>
  add_recipe(penguin_rec)

Specify the grid control parameters

Code
# Keep out-of-sample predictions and the workflow so we can
# extract and finalize the best model later.
cntl <- control_grid(save_pred = TRUE, save_workflow = TRUE)

7.1 Fit Models & Tune Hyperparameters

Code
# Time the full grid search: 50 candidates x 10 folds.
# (Original used `=` for top-level assignment; `<-` is idiomatic R.)
start_time <- Sys.time()
bt_tune_grid <-
  bt_bonsai_wf |>
  tune_grid(
    resamples = penguin_folds,
    grid      = grid_tune,
    control   = cntl
  )
end_time <- Sys.time()
time <- end_time - start_time
time
Time difference of 15.73666 secs

7.2 Tuning Results

Code
# compare metric values across the tuned parameter values
autoplot(bt_tune_grid)

8 Racing with {finetune}

Code
library(finetune)

# Racing controls: keep predictions and the workflow,
# mirroring the control_grid() settings used earlier.
race_cntl <- control_race(save_pred = TRUE, save_workflow = TRUE)

9 Racing with {finetune}

Code
library(finetune)

# Same control settings as the previous section (kept so this
# slide's code chunk is self-contained).
race_cntl <- control_race(save_pred = TRUE, save_workflow = TRUE)

# ANOVA racing: candidates that are clearly losing are dropped
# before all resamples are evaluated.
bt_tune_race <- tune_race_anova(
  bt_bonsai_wf,
  resamples = penguin_folds,
  grid      = grid_tune,
  control   = race_cntl
)

10 Racing Results

Code
autoplot(bt_tune_race)

11 Racing Results

Code
plot_race(bt_tune_race)

12 Faster tune with more cores

Code
# A larger candidate set: 250 latin-hypercube points instead of 50.
big_grid <- grid_latin_hypercube(
  extract_parameter_set_dials(bt_bonsai_spec),
  size = 250
)
Code
# tune in parallel

# count physical (non-hyper-threaded) cores on this machine
parallel::detectCores(logical = FALSE)
[1] 8
Code
library(doMC)

# Register 6 of the 8 physical cores as the parallel backend.
# NOTE(review): doMC is fork-based and not available on Windows —
# confirm the target platform.
registerDoMC(cores = 6)

# Time the race over the 250-point grid using the parallel backend.
# (Original used `=` for top-level assignment; `<-` is idiomatic R.)
start_time <- Sys.time()
bt_tune_fast <-
  bt_bonsai_wf |>
  tune_race_anova(
    resamples = penguin_folds,
    grid      = big_grid,
    control   = race_cntl
  )
end_time <- Sys.time()
time <- end_time - start_time
time
Time difference of 12.03298 secs
Code
# metric profile of the parallel racing results
autoplot(bt_tune_fast)

Code
# visualise how candidates progressed through the racing stages
plot_race(bt_tune_fast)

Code
# Pick the hyperparameter combination with the best ROC AUC.
bt_best_id <- select_best(bt_tune_fast, metric = "roc_auc")

13

Code
# Best hyperparameters by ROC AUC (recomputed so this chunk stands alone).
bt_best_id <- select_best(bt_tune_fast, metric = "roc_auc")

# Plug the winning parameters into the saved workflow, refit on the
# full training set, and evaluate once on the held-out test set.
best_bt_race <- bt_tune_fast |>
  extract_workflow() |>
  finalize_workflow(bt_best_id) |>
  last_fit(penguin_split)
Code
# Test-set metrics (accuracy and ROC AUC) for the finalized model.
collect_metrics(best_bt_race)
# A tibble: 2 × 4
  .metric  .estimator .estimate .config             
  <chr>    <chr>          <dbl> <chr>               
1 accuracy binary         0.893 Preprocessor1_Model1
2 roc_auc  binary         0.973 Preprocessor1_Model1
Code
# ROC curve of the test-set predictions from the finalized model,
# using the predicted probability of the "female" class.
best_bt_race |>
  collect_predictions() |>
  roc_curve(truth = sex, .pred_female) |>
  autoplot()

14 References:

https://github.com/JamesHWade/r-mlops/blob/main/hyperparameter-tuning.qmd

https://www.youtube.com/watch?v=IzjmuGJgwKQ