Prediction for a binary classification model

R
model
Author

Tony Duan

Published

March 9, 2024

1 load package

Load Packages & Set Options
# Load packages and set session-wide options for the analysis.
library(tidyverse)      # data wrangling & ggplot2
library(tidymodels)     # modeling framework
library(palmerpenguins) # penguin dataset
library(gt)             # better tables
library(bonsai)         # tree-based models
library(conflicted)     # function conflicts
library(vetiver)        # model versioning / deployment
library(Microsoft365R)  # OneDrive access
library(pins)           # pin (publish) objects to boards
tidymodels_prefer()     # handle conflicts
conflict_prefer("penguins", "palmerpenguins") # use the palmerpenguins dataset on name clash
options(tidymodels.dark = TRUE) # dark mode
theme_set(theme_bw()) # set default ggplot2 theme

2 Plotting

Code
# Scatter plot of flipper length vs bill length, colored by sex and sized
# by body mass, with one panel per species; unknown-sex rows are dropped.
penguins |>
  filter(!is.na(sex)) |>
  ggplot(
    aes(
      x = flipper_length_mm,
      y = bill_length_mm,
      color = sex,
      size = body_mass_g
    )
  ) +
  geom_point(alpha = 0.5) +
  facet_wrap(~species)

3 data split

Prepare & Split Data
# Prepare the modeling data:
#  - drop rows with a missing target (sex)
#  - exclude year and island, which are not used as predictors
all_penguins_df <-
  penguins %>%
  drop_na(sex) %>%
  select(-year, -island)

# Hold out 10 rows for a final prediction demo after deployment.
# NOTE(review): head() takes the FIRST 10 rows (all Adelie here); a random
# sample would give a more representative holdout — confirm intent.
hold_penguins_df <- head(all_penguins_df, 10)

# Everything except the held-out rows goes into train/test/CV.
penguins_df <- tail(all_penguins_df, -10)
  
# Set the seed for reproducibility of the split and the CV folds
set.seed(1234)

# Split the data into train and test sets, stratified by the target (sex)
# so both sets keep roughly the same class balance
penguin_split <- initial_split(penguins_df, strata = sex)
penguin_train <- training(penguin_split)
penguin_test  <- testing(penguin_split)

# Create 10 folds for cross-validation, again stratified by sex
penguin_folds <- vfold_cv(penguin_train, v = 10, strata = sex)
Code
dim(penguin_train)
[1] 242   6
Code
dim(penguin_test)
[1] 81  6

4 Create Recipe

Predict sex, a two-category target.

Code
# Preprocessing recipe for the classification task.
# BUG FIX: the rendered Quarto annotation digits ("1 ", "2 ", ...) were left
# embedded in the code, making it invalid R; they are removed and turned
# into comments here.
penguin_rec <-
  # Define the recipe on the training data: sex is the target,
  # all other columns are predictors
  recipe(sex ~ ., data = penguin_train) %>%
  # Yeo-Johnson transform numeric predictors to reduce skewness
  step_YeoJohnson(all_numeric_predictors()) %>%
  # Dummy-encode the nominal predictor `species`
  step_dummy(species) %>%
  # Center and scale all numeric predictors (including the new dummies)
  step_normalize(all_numeric_predictors())
1
Define the recipe on the training data with sex as the target and all other vars as predictors
2
Apply Yeo-Johnson transformation to all numeric predictors for skewness
3
Create dummy variables for nominal variable species
4
Normalize all numeric predictors

5 Specify Model

5.1 Logistic Regression

Code
# Plain (unpenalized) logistic regression via stats::glm().
# BUG FIX: the "glm" engine has no penalty argument — the original
# `penalty = 1` was unused by this engine; it is removed here.
# Use set_engine("glmnet") if a penalized fit is actually wanted.
glm_spec <-
  logistic_reg() %>%
  set_engine("glm")

5.2 Random Forest

Code
# Random forest classifier (ranger engine); min_n is left to be tuned.
tree_spec <-
  rand_forest(min_n = tune()) |>
  set_mode("classification") |>
  set_engine("ranger")

5.3 Neural Network with {torch}

Code
# Multilayer perceptron fit with torch via the brulee engine;
# all four hyperparameters are marked for tuning.
mlp_brulee_spec <-
  mlp(hidden_units = tune(),
      epochs       = tune(),
      penalty      = tune(),
      learn_rate   = tune()) |>
  set_engine("brulee") |>
  set_mode("classification")

6 Define Tuning Controls

Use Bayesian optimization for hyperparameter tuning.

Code
# Controls for the Bayesian hyperparameter search:
#  - stop after 5 iterations without improvement
#  - cap the search time (control_bayes time_limit is in minutes)
#  - keep out-of-sample predictions for later inspection
bayes_control = control_bayes(no_improve = 5L,
                               time_limit = 5,# 5 mins (original comment said 10; value is 5)
                               save_pred  = TRUE,
                               verbose    = TRUE)

7 Fit Models & Tune Hyperparameters

Use {workflowsets} to fit all three model types with hyperparameter optimization for random forest and neural net models.

Code
# Pair the single recipe with all three model specs, then tune each
# workflow with Bayesian optimization over the CV folds.
# NOTE(review): this object shadows workflowsets::workflow_set(); the name
# is kept because later code references it, but renaming it (e.g. to
# `penguin_workflows`) would be safer.
workflow_set <-
  workflow_set(
    preproc = list(penguin_rec),
    models  = list(glm   = glm_spec,
                   tree  = tree_spec,
                   torch = mlp_brulee_spec)
  ) %>%
  workflow_map("tune_bayes",
               iter      = 5L, # 5 Bayesian search iterations per model
               resamples = penguin_folds,
               control   = bayes_control
  )

8 Compare Model Results

Tabular view

Code
# Table of the best configuration of each model, ranked by CV roc_auc
workflow_set |>
  rank_results(rank_metric = "roc_auc",
               select_best = TRUE) |>
  gt()
wflow_id .config .metric mean std_err n preprocessor model rank
recipe_torch Preprocessor1_Model2 accuracy 0.9173333 0.029470324 5 recipe mlp 1
recipe_torch Preprocessor1_Model2 roc_auc 0.9869124 0.008411554 5 recipe mlp 1
recipe_tree Iter1 accuracy 0.9046667 0.012621753 10 recipe rand_forest 2
recipe_tree Iter1 roc_auc 0.9712073 0.007699003 10 recipe rand_forest 2
recipe_glm Preprocessor1_Model1 accuracy 0.8966667 0.015513833 10 recipe logistic_reg 3
recipe_glm Preprocessor1_Model1 roc_auc 0.9634081 0.013098020 10 recipe logistic_reg 3
Code
# Visual comparison of the tuned workflows
autoplot(workflow_set)

best model

Code
# NOTE(review): the rank_results table above ranks recipe_torch first on
# roc_auc, yet recipe_glm is selected here — presumably for simplicity /
# interpretability of the deployed model. Confirm this is intentional.
best_model_id <- "recipe_glm"

9 Finalize Fit

Code
# Pull the best hyperparameter configuration (by accuracy) for the
# chosen workflow out of the tuning results
best_fit <-
  workflow_set %>%
  extract_workflow_set_result(best_model_id) %>%
  select_best(metric = "accuracy")

# Plug those hyperparameters back into the chosen workflow
final_workflow <-
  workflow_set %>%
  extract_workflow(best_model_id) %>%
  finalize_workflow(best_fit)

# Fit on the full training set and evaluate once on the held-back test
# set (last_fit() uses the original train/test split, not "all data")
final_fit <-
  final_workflow %>%
  last_fit(penguin_split)

9.1 Final Fit Metrics

Code
# Test-set performance of the finalized model, as a formatted table
final_fit |>
  collect_metrics() |>
  gt()
.metric .estimator .estimate .config
accuracy binary 0.9012346 Preprocessor1_Model1
roc_auc binary 0.9762195 Preprocessor1_Model1

9.2 Final Fit Metrics

Code
# ROC curve for the test-set predictions.
# NOTE(review): roc_curve(sex, .pred_female) treats "female" as the event
# class — assumes it is the first factor level of sex; confirm.
final_fit  %>% 
  collect_predictions() %>% 
  roc_curve(sex, .pred_female) %>% 
  autoplot()

10 Create Vetiver Model & API

Create a vetiver model from the final fit and upload it to Microsoft OneDrive.

Code
# Pull the fitted workflow out of the last_fit result and wrap it as a
# versioned vetiver model ready for pinning/deployment.
final_fit_to_deploy <- extract_workflow(final_fit)

v <- vetiver_model(
  final_fit_to_deploy,
  model_name = "penguins_model",
  versioned = TRUE
)
Code
# Sign in to personal OneDrive (may launch an interactive login flow)
od <- Microsoft365R::get_personal_onedrive()

# od$list_items()
# Use the OneDrive "Documents" folder as a {pins} board
board365 <- board_ms365(od, "Documents")

# Upload (pin) the versioned vetiver model to the board
board365 %>% vetiver_pin_write(v)

11 Download the Model from Microsoft OneDrive

Code
# Read the pinned model back from OneDrive (round-trip check);
# this deliberately overwrites the local final_fit_to_deploy
final_fit_to_deploy <- vetiver_pin_read(board365, "penguins_model")

12 Prediction

Code
# Inspect the 10 held-out rows never seen by the modeling pipeline
hold_penguins_df |> glimpse()
Rows: 10
Columns: 6
$ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
$ bill_length_mm    <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6…
$ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 17.6, 21.2…
$ flipper_length_mm <int> 181, 186, 195, 193, 190, 181, 195, 182, 191, 198
$ body_mass_g       <int> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 3800…
$ sex               <fct> male, female, female, female, male, female, male, fe…
Code
# Predict sex for the holdout rows; drop the true label before predicting
new_prediction <-
  final_fit_to_deploy %>%
  predict(hold_penguins_df %>% select(-sex))
Code
new_prediction |> glimpse()
Rows: 10
Columns: 1
$ .pred_class <fct> male, female, female, female, male, female, male, female, …
Code
# Attach the predictions to the holdout rows for side-by-side comparison
all_penguin_test <- cbind(hold_penguins_df, new_prediction)
Code
all_penguin_test |> glimpse()
Rows: 10
Columns: 7
$ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
$ bill_length_mm    <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6…
$ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 17.6, 21.2…
$ flipper_length_mm <int> 181, 186, 195, 193, 190, 181, 195, 182, 191, 198
$ body_mass_g       <int> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 3800…
$ sex               <fct> male, female, female, female, male, female, male, fe…
$ .pred_class       <fct> male, female, female, female, male, female, male, fe…
Code
# Confusion counts: actual sex vs predicted class.
# BUG FIX: the original grouped by the STRING ".pred_class" (a constant),
# so the prediction column was never used (visible in the output below);
# group by the bare column name instead.
all_penguin_test %>% group_by(sex, .pred_class) %>% count()
# A tibble: 2 × 3
# Groups:   sex, ".pred_class" [2]
  sex    `".pred_class"`     n
  <fct>  <chr>           <int>
1 female .pred_class         5
2 male   .pred_class         5

13 reference:

https://github.com/JamesHWade/r-mlops/blob/main/the-whole-game.qmd

https://www.youtube.com/watch?v=J32pRt1nuoY