R Package: [tidymodels] penalized logistic regression model

A collection of packages for modeling
Author

Tony Duan

Published

July 12, 2023

The tidymodels framework is a collection of packages for modeling and machine learning using tidyverse principles.

Code
library(tidymodels)  
library(tidyverse)  
# Helper packages
library(readr)       # for importing data
library(vip)         # for variable importance plots

1 input data

Code
hotels <- 
  read_csv("https://tidymodels.org/start/case-study/hotels.csv") %>%
  mutate(across(where(is.character), as.factor))
Code
dim(hotels)
[1] 50000    23
Code
glimpse(hotels)
Rows: 50,000
Columns: 23
$ hotel                          <fct> City_Hotel, City_Hotel, Resort_Hotel, R…
$ lead_time                      <dbl> 217, 2, 95, 143, 136, 67, 47, 56, 80, 6…
$ stays_in_weekend_nights        <dbl> 1, 0, 2, 2, 1, 2, 0, 0, 0, 2, 1, 0, 1, …
$ stays_in_week_nights           <dbl> 3, 1, 5, 6, 4, 2, 2, 3, 4, 2, 2, 1, 2, …
$ adults                         <dbl> 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2, …
$ children                       <fct> none, none, none, none, none, none, chi…
$ meal                           <fct> BB, BB, BB, HB, HB, SC, BB, BB, BB, BB,…
$ country                        <fct> DEU, PRT, GBR, ROU, PRT, GBR, ESP, ESP,…
$ market_segment                 <fct> Offline_TA/TO, Direct, Online_TA, Onlin…
$ distribution_channel           <fct> TA/TO, Direct, TA/TO, TA/TO, Direct, TA…
$ is_repeated_guest              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type             <fct> A, D, A, A, F, A, C, B, D, A, A, D, A, …
$ assigned_room_type             <fct> A, K, A, A, F, A, C, A, D, A, D, D, A, …
$ booking_changes                <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ deposit_type                   <fct> No_Deposit, No_Deposit, No_Deposit, No_…
$ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ customer_type                  <fct> Transient-Party, Transient, Transient, …
$ average_daily_rate             <dbl> 80.75, 170.00, 8.00, 81.00, 157.60, 49.…
$ required_car_parking_spaces    <fct> none, none, none, none, none, none, non…
$ total_of_special_requests      <dbl> 1, 3, 2, 1, 4, 1, 1, 1, 1, 1, 0, 1, 0, …
$ arrival_date                   <date> 2016-09-01, 2017-08-25, 2016-11-19, 20…

target: has children / no children

Code
hotels %>% 
  count(children) %>% 
  mutate(prop = n/sum(n))
# A tibble: 2 × 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  4038 0.0808
2 none     45962 0.919 

2 data splitting & resampling

Reserve 25% of the stays for the test set.

Code
set.seed(123)
splits      <- initial_split(hotels, strata = children)

hotel_other <- training(splits)
hotel_test  <- testing(splits)
Code
# training set proportions by children
hotel_other %>% 
  count(children) %>% 
  mutate(prop = n/sum(n))
# A tibble: 2 × 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  3027 0.0807
2 none     34473 0.919 
Code
# test set proportions by children
hotel_test  %>% 
  count(children) %>% 
  mutate(prop = n/sum(n))
# A tibble: 2 × 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  1011 0.0809
2 none     11489 0.919 
Code
set.seed(234)
val_set <- validation_split(hotel_other, 
                            strata = children, 
                            prop = 0.80)
val_set
# Validation Set Split (0.8/0.2)  using stratification 
# A tibble: 1 × 2
  splits               id        
  <list>               <chr>     
1 <split [30000/7500]> validation

3 recipe

Code
holidays <- c("AllSouls", "AshWednesday", "ChristmasEve", "Easter", 
              "ChristmasDay", "GoodFriday", "NewYearsDay", "PalmSunday")

lr_recipe <- 
  recipe(children ~ ., data = hotel_other) %>% 
  step_date(arrival_date) %>% 
  step_holiday(arrival_date, holidays = holidays) %>% 
  step_rm(arrival_date) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_zv(all_predictors()) %>% 
  step_normalize(all_predictors())

4 model

penalized logistic regression

Code
lr_mod <- 
  logistic_reg(penalty = tune(), mixture = 1) %>% 
  set_engine("glmnet")

5 workflow

Code
lr_workflow <- 
  workflow() %>% 
  add_model(lr_mod) %>% 
  add_recipe(lr_recipe)

6 tuning

Code
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))

lr_reg_grid %>% top_n(-5) # lowest penalty values
# A tibble: 5 × 1
   penalty
     <dbl>
1 0.0001  
2 0.000127
3 0.000161
4 0.000204
5 0.000259
Code
lr_reg_grid %>% top_n(5)  # highest penalty values
# A tibble: 5 × 1
  penalty
    <dbl>
1  0.0386
2  0.0489
3  0.0621
4  0.0788
5  0.1   

7 Train and tune the model

Code
print('start run')
[1] "start run"
Code
a=Sys.time()
print(a)
[1] "2023-07-14 02:31:47 CST"
Code
lr_res <- 
  lr_workflow %>% 
  tune_grid(val_set,
            grid = lr_reg_grid,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(roc_auc))

print('finish run')
[1] "finish run"
Code
b=Sys.time()
print(b)
[1] "2023-07-14 02:31:51 CST"
Code
print('total time')
[1] "total time"
Code
print(b-a)
Time difference of 4.422971 secs

8 result

Code
lr_plot <- 
  lr_res %>% 
  collect_metrics() %>% 
  ggplot(aes(x = penalty, y = mean)) + 
  geom_point() + 
  geom_line() + 
  ylab("Area under the ROC Curve") +
  scale_x_log10(labels = scales::label_number())

lr_plot 

Code
top_models <-
  lr_res %>% 
  show_best("roc_auc", n = 15) %>% 
  arrange(penalty) 
top_models
# A tibble: 15 × 7
    penalty .metric .estimator  mean     n std_err .config              
      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1 0.000127 roc_auc binary     0.872     1      NA Preprocessor1_Model02
 2 0.000161 roc_auc binary     0.872     1      NA Preprocessor1_Model03
 3 0.000204 roc_auc binary     0.873     1      NA Preprocessor1_Model04
 4 0.000259 roc_auc binary     0.873     1      NA Preprocessor1_Model05
 5 0.000329 roc_auc binary     0.874     1      NA Preprocessor1_Model06
 6 0.000418 roc_auc binary     0.874     1      NA Preprocessor1_Model07
 7 0.000530 roc_auc binary     0.875     1      NA Preprocessor1_Model08
 8 0.000672 roc_auc binary     0.875     1      NA Preprocessor1_Model09
 9 0.000853 roc_auc binary     0.876     1      NA Preprocessor1_Model10
10 0.00108  roc_auc binary     0.876     1      NA Preprocessor1_Model11
11 0.00137  roc_auc binary     0.876     1      NA Preprocessor1_Model12
12 0.00174  roc_auc binary     0.876     1      NA Preprocessor1_Model13
13 0.00221  roc_auc binary     0.876     1      NA Preprocessor1_Model14
14 0.00281  roc_auc binary     0.875     1      NA Preprocessor1_Model15
15 0.00356  roc_auc binary     0.873     1      NA Preprocessor1_Model16
Code
lr_best <- 
  lr_res %>% 
  collect_metrics() %>% 
  arrange(penalty) %>% 
  slice(12)
lr_best
# A tibble: 1 × 7
  penalty .metric .estimator  mean     n std_err .config              
    <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1 0.00137 roc_auc binary     0.876     1      NA Preprocessor1_Model12
Code
lr_auc <- 
  lr_res %>% 
  collect_predictions(parameters = lr_best) %>% 
  roc_curve(children, .pred_children) %>% 
  mutate(model = "Logistic Regression")

autoplot(lr_auc)

9 output model

10 Reference

https://www.tidymodels.org/start/case-study/