Code
library(tidymodels)
library(tidyverse)
# Helper packages
library(readr) # for importing data
library(vip) # for variable importance plots
collection of package for modeling
Tony Duan
July 12, 2023
The tidymodels framework is a collection of packages for modeling and machine learning using tidyverse principles.
Rows: 50,000
Columns: 23
$ hotel <fct> City_Hotel, City_Hotel, Resort_Hotel, R…
$ lead_time <dbl> 217, 2, 95, 143, 136, 67, 47, 56, 80, 6…
$ stays_in_weekend_nights <dbl> 1, 0, 2, 2, 1, 2, 0, 0, 0, 2, 1, 0, 1, …
$ stays_in_week_nights <dbl> 3, 1, 5, 6, 4, 2, 2, 3, 4, 2, 2, 1, 2, …
$ adults <dbl> 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2, …
$ children <fct> none, none, none, none, none, none, chi…
$ meal <fct> BB, BB, BB, HB, HB, SC, BB, BB, BB, BB,…
$ country <fct> DEU, PRT, GBR, ROU, PRT, GBR, ESP, ESP,…
$ market_segment <fct> Offline_TA/TO, Direct, Online_TA, Onlin…
$ distribution_channel <fct> TA/TO, Direct, TA/TO, TA/TO, Direct, TA…
$ is_repeated_guest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type <fct> A, D, A, A, F, A, C, B, D, A, A, D, A, …
$ assigned_room_type <fct> A, K, A, A, F, A, C, A, D, A, D, D, A, …
$ booking_changes <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ deposit_type <fct> No_Deposit, No_Deposit, No_Deposit, No_…
$ days_in_waiting_list <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ customer_type <fct> Transient-Party, Transient, Transient, …
$ average_daily_rate <dbl> 80.75, 170.00, 8.00, 81.00, 157.60, 49.…
$ required_car_parking_spaces <fct> none, none, none, none, none, none, non…
$ total_of_special_requests <dbl> 1, 3, 2, 1, 4, 1, 1, 1, 1, 1, 0, 1, 0, …
$ arrival_date <date> 2016-09-01, 2017-08-25, 2016-11-19, 20…
target: have chilren/no chilren
reserve 25% of the stays to the test set
# A tibble: 2 × 3
children n prop
<fct> <int> <dbl>
1 children 3027 0.0807
2 none 34473 0.919
# A tibble: 2 × 3
children n prop
<fct> <int> <dbl>
1 children 1011 0.0809
2 none 11489 0.919
holidays <- c("AllSouls", "AshWednesday", "ChristmasEve", "Easter",
"ChristmasDay", "GoodFriday", "NewYearsDay", "PalmSunday")
lr_recipe <-
recipe(children ~ ., data = hotel_other) %>%
step_date(arrival_date) %>%
step_holiday(arrival_date, holidays = holidays) %>%
step_rm(arrival_date) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
penalized logistic regression
# A tibble: 5 × 1
penalty
<dbl>
1 0.0001
2 0.000127
3 0.000161
4 0.000204
5 0.000259
# A tibble: 5 × 1
penalty
<dbl>
1 0.0386
2 0.0489
3 0.0621
4 0.0788
5 0.1
[1] "start run"
[1] "2023-07-14 02:31:47 CST"
[1] "finish run"
[1] "2023-07-14 02:31:51 CST"
[1] "total time"
Time difference of 4.422971 secs
# A tibble: 15 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.000127 roc_auc binary 0.872 1 NA Preprocessor1_Model02
2 0.000161 roc_auc binary 0.872 1 NA Preprocessor1_Model03
3 0.000204 roc_auc binary 0.873 1 NA Preprocessor1_Model04
4 0.000259 roc_auc binary 0.873 1 NA Preprocessor1_Model05
5 0.000329 roc_auc binary 0.874 1 NA Preprocessor1_Model06
6 0.000418 roc_auc binary 0.874 1 NA Preprocessor1_Model07
7 0.000530 roc_auc binary 0.875 1 NA Preprocessor1_Model08
8 0.000672 roc_auc binary 0.875 1 NA Preprocessor1_Model09
9 0.000853 roc_auc binary 0.876 1 NA Preprocessor1_Model10
10 0.00108 roc_auc binary 0.876 1 NA Preprocessor1_Model11
11 0.00137 roc_auc binary 0.876 1 NA Preprocessor1_Model12
12 0.00174 roc_auc binary 0.876 1 NA Preprocessor1_Model13
13 0.00221 roc_auc binary 0.876 1 NA Preprocessor1_Model14
14 0.00281 roc_auc binary 0.875 1 NA Preprocessor1_Model15
15 0.00356 roc_auc binary 0.873 1 NA Preprocessor1_Model16
# A tibble: 1 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.00137 roc_auc binary 0.876 1 NA Preprocessor1_Model12
https://www.tidymodels.org/start/case-study/
---
title: "R Package: [tideymodel] penalized logistic regression model"
subtitle: "collection of package for modeling"
author: "Tony Duan"
date: "2023-07-12"
categories: [packages]
execute:
warning: false
error: false
format:
html:
code-fold: show
code-tools: true
number-sections: true
code-block-bg: true
code-block-border-left: "#31BAE9"
---
The tidymodels framework is a collection of packages for modeling and machine learning using tidyverse principles.
{width="416"}
```{r}
library(tidymodels)
library(tidyverse)
# Helper packages
library(readr) # for importing data
library(vip) # for variable importance plots
```
# input data
```{r}
hotels <-
read_csv("https://tidymodels.org/start/case-study/hotels.csv") %>%
mutate(across(where(is.character), as.factor))
```
```{r}
dim(hotels)
```
```{r}
glimpse(hotels)
```
target: have chilren/no chilren
```{r}
hotels %>%
count(children) %>%
mutate(prop = n/sum(n))
```
# DATA SPLITTING & RESAMPLING
reserve 25% of the stays to the test set
```{r}
set.seed(123)
splits <- initial_split(hotels, strata = children)
hotel_other <- training(splits)
hotel_test <- testing(splits)
```
```{r}
# training set proportions by children
hotel_other %>%
count(children) %>%
mutate(prop = n/sum(n))
```
```{r}
# test set proportions by children
hotel_test %>%
count(children) %>%
mutate(prop = n/sum(n))
```
```{r}
set.seed(234)
val_set <- validation_split(hotel_other,
strata = children,
prop = 0.80)
val_set
```
# recipe
```{r}
holidays <- c("AllSouls", "AshWednesday", "ChristmasEve", "Easter",
"ChristmasDay", "GoodFriday", "NewYearsDay", "PalmSunday")
lr_recipe <-
recipe(children ~ ., data = hotel_other) %>%
step_date(arrival_date) %>%
step_holiday(arrival_date, holidays = holidays) %>%
step_rm(arrival_date) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
```
# model
penalized logistic regression
```{r}
lr_mod <-
logistic_reg(penalty = tune(), mixture = 1) %>%
set_engine("glmnet")
```
# workflow
```{r}
lr_workflow <-
workflow() %>%
add_model(lr_mod) %>%
add_recipe(lr_recipe)
```
# tuning
```{r}
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
lr_reg_grid %>% top_n(-5) # lowest penalty values
lr_reg_grid %>% top_n(5) # highest penalty values
```
# Train and tune the model
```{r}
print('start run')
a=Sys.time()
print(a)
lr_res <-
lr_workflow %>%
tune_grid(val_set,
grid = lr_reg_grid,
control = control_grid(save_pred = TRUE),
metrics = metric_set(roc_auc))
print('finish run')
b=Sys.time()
print(b)
print('total time')
print(b-a)
```
# result
```{r}
lr_plot <-
lr_res %>%
collect_metrics() %>%
ggplot(aes(x = penalty, y = mean)) +
geom_point() +
geom_line() +
ylab("Area under the ROC Curve") +
scale_x_log10(labels = scales::label_number())
lr_plot
```
```{r}
top_models <-
lr_res %>%
show_best("roc_auc", n = 15) %>%
arrange(penalty)
top_models
```
```{r}
lr_best <-
lr_res %>%
collect_metrics() %>%
arrange(penalty) %>%
slice(12)
lr_best
```
```{r}
lr_auc <-
lr_res %>%
collect_predictions(parameters = lr_best) %>%
roc_curve(children, .pred_children) %>%
mutate(model = "Logistic Regression")
autoplot(lr_auc)
```
# output model
# Reference
https://www.tidymodels.org/start/case-study/