---
title: "tidymodels 2 with bootstrap resamples and workflow"
author: "Tony Duan"
date: "2023-10-12"
categories: [R]
execute:
  warning: false
  error: false
format:
  html:
    toc: true
    toc-location: left
    code-fold: show
    code-tools: true
    number-sections: true
    code-block-bg: true
    code-block-border-left: "#31BAE9"
---
# package
```{r}
library(tidyverse)
library(ggplot2)
library(tidymodels)
```
# data
Data from https://www.kaggle.com/c/titanic/data
```{r}
# candidate predictor columns (only used by the commented-out select() calls below)
pred <- c("Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked", "title")
train_df_raw <- read_csv('data/train.csv')
test_df_raw <- read_csv('data/test.csv')
glimpse(train_df_raw)
```
## train data
```{r}
train_df <- train_df_raw %>%
  mutate(
    Survived = as.factor(Survived),
    # pull the honorific (Mr, Mrs, Miss, ...) out of the Name column
    title = str_trim(str_replace(str_extract(Name, ", [A-Z]+[A-Za-z.]*[:space:]+"), ",", ""))
  )
# train_df <- train_df %>% select(c(all_of(pred), "Survived"))
dim(train_df)
```
```{r}
train_df %>% count(Survived)
```
```{r}
# baseline survival rate: 342 of the 891 passengers survived
342 / (549 + 342)
```
## test data
```{r}
test_df <- test_df_raw %>%
  mutate(
    # same title extraction as for the training data
    title = str_trim(str_replace(str_extract(Name, ", [A-Z]+[A-Za-z.]*[:space:]+"), ",", ""))
  )
# test_df <- test_df %>% select(all_of(pred))
dim(test_df)
```
## bootstrap (re)samples for model selection
The last primary technique in rsample for creating resamples from the training data is bootstrap resampling. A "bootstrap sample" is a sample of your data set, the same size as your data set, taken with replacement, so a single observation may be drawn multiple times. The assessment set is then made up of all the observations that were never selected for the analysis set. Bootstrap resampling generally produces pessimistic estimates of model accuracy.

Bootstrap resamples are created with rsample's bootstraps() function. You cannot control the proportion of data in each set: the analysis set is always the same size as the training data, and the assessment set is simply whatever was left out, roughly a third of the rows on average.
```{r}
set.seed(2022)
titanic_folds <- bootstraps(data = train_df, times = 10)
titanic_folds
```
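To see this concretely, we can pull apart the first resample with rsample's analysis() and assessment() accessors (a quick sanity check; nothing is fitted here):

```{r}
# the analysis set is a with-replacement sample the same size as train_df;
# the assessment set holds the rows that were never drawn (out-of-bag)
first_split <- titanic_folds$splits[[1]]
dim(analysis(first_split))
dim(assessment(first_split))
```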
## other sampling method: V-fold cross-validation
```{r}
vfold_cv(data = train_df, v = 5)
```
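Since only about 38% of passengers survived, a stratified variant is often worth using so the class balance stays similar across folds; a sketch using the strata argument:

```{r}
# stratify the folds on the outcome to preserve the survivor share in each fold
vfold_cv(data = train_df, v = 5, strata = Survived)
```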
## other sampling method: Monte Carlo cross-validation
```{r}
mc_cv(data = train_df, prop = 0.8, times = 2)
```
## other sampling method: validation set
```{r}
three_way_split <-
  rsample::initial_validation_split(data = train_df, prop = c(0.6, 0.3))
three_way_split
```
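The three partitions come back out through rsample's accessor functions:

```{r}
# pull the three partitions out of the split object
dim(training(three_way_split))    # ~60% of rows
dim(validation(three_way_split))  # ~30%
dim(testing(three_way_split))     # the remainder, ~10%
```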
# model
## recipe
```{r}
# declare recipe
titanic_recipe <-
  recipe(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
         data = train_df) %>%                            # keep variables we want
  step_impute_median(Age, Fare) %>%                      # impute numeric NAs with the median
  step_impute_mode(Embarked) %>%                         # impute the most common port
  step_mutate_at(Pclass, Sex, Embarked, fn = factor) %>% # make these factors
  step_mutate(Travelers = SibSp + Parch + 1) %>%         # new variable: party size
  step_rm(SibSp, Parch) %>%                              # remove the raw components
  step_dummy(all_nominal_predictors()) %>%               # create indicator variables
  step_normalize(all_numeric_predictors())               # normalize numeric predictors
summary(titanic_recipe)
```
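A recipe is only a plan until it is estimated. As a quick check of the feature engineering, prep() estimates the steps on the training data and bake() returns the processed table (a sketch, just for inspection):

```{r}
# estimate the recipe and look at the columns the models will actually see
titanic_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
```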
## model specification
```{r}
# logistic regression
titanic_glm_spec <-
  logistic_reg() %>%            # model type
  set_engine('glm') %>%         # package to use
  set_mode('classification')    # choose one of two: classification vs regression

# random forest
titanic_rf_spec <-
  rand_forest(trees = 200) %>%  # algorithm-specific argument: 200 trees
  set_engine('ranger') %>%
  set_mode('classification')

# svm
titanic_svm_spec <-
  svm_rbf() %>%                 # RBF: radial basis function kernel
  set_engine('kernlab') %>%
  set_mode('classification')
```
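Each spec stays engine-agnostic until parsnip translates it. translate() shows the underlying call that will be generated, for example for the random forest:

```{r}
# show the ranger call parsnip will build from this spec
titanic_rf_spec %>% translate()
```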
## workflow
```{r}
# resample fitting is an embarrassingly parallel problem,
# so register a parallel backend once for all three workflows
doParallel::registerDoParallel()

# logistic regression
titanic_glm_wf <-
  workflow() %>%
  add_recipe(titanic_recipe) %>%
  add_model(titanic_glm_spec)

# random forest
titanic_rf_wf <-
  workflow() %>%
  add_recipe(titanic_recipe) %>%
  add_model(titanic_rf_spec)

# svm
titanic_svm_wf <-
  workflow() %>%
  add_recipe(titanic_recipe) %>%
  add_model(titanic_svm_spec)
```
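As an aside, the workflowsets package (attached by library(tidymodels)) can pair the one recipe with all three specs in a single object; a sketch of the equivalent, more compact pattern:

```{r}
#| eval: false
# sketch: one workflow set instead of three hand-built workflows
titanic_wf_set <-
  workflow_set(
    preproc = list(rec = titanic_recipe),
    models  = list(glm = titanic_glm_spec,
                   rf  = titanic_rf_spec,
                   svm = titanic_svm_spec)
  ) %>%
  workflow_map("fit_resamples", resamples = titanic_folds)
```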
## training
```{r}
glm_model_fit <- titanic_glm_wf %>% fit_resamples(titanic_folds)
rf_model_fit  <- titanic_rf_wf  %>% fit_resamples(titanic_folds)
svm_model_fit <- titanic_svm_wf %>% fit_resamples(titanic_folds)
```
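fit_resamples() reports accuracy and ROC AUC here by default; other yardstick metrics can be requested through the metrics argument, for example (a sketch, not run):

```{r}
#| eval: false
# sketch: add F1 alongside the default classification metrics
rf_model_fit_more <-
  titanic_rf_wf %>%
  fit_resamples(titanic_folds,
                metrics = metric_set(accuracy, roc_auc, f_meas))
```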
# result
```{r}
collect_metrics(glm_model_fit)
```
```{r}
collect_metrics(rf_model_fit)
```
```{r}
collect_metrics(svm_model_fit)
```
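Stacking the three tables makes the comparison easier to read:

```{r}
# combine the resampled metrics from all three models into one table
bind_rows(
  collect_metrics(glm_model_fit) %>% mutate(model = "logistic"),
  collect_metrics(rf_model_fit)  %>% mutate(model = "random forest"),
  collect_metrics(svm_model_fit) %>% mutate(model = "svm")
) %>%
  select(model, .metric, mean, std_err) %>%
  arrange(.metric, desc(mean))
```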
## last fit
Random forest comes out ahead, with roughly 82% accuracy and an ROC AUC of about 0.86 across the bootstrap resamples. We use it for a final fit on the whole training data.
```{r}
# random forest workflow (identical to titanic_rf_wf above)
titanic_rf_last_wf <-
  workflow() %>%
  add_recipe(titanic_recipe) %>%
  add_model(titanic_rf_spec)
```
```{r}
# last fit: train the chosen workflow on the full training data
final_fit <-
  fit(object = titanic_rf_last_wf,
      data = train_df)
```
```{r}
# result: inspect the trained (estimated) recipe
final_fit %>%
  extract_recipe(estimated = TRUE)
```
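The fitted model can be pulled out the same way; the underlying ranger object reports its own out-of-bag estimates:

```{r}
# inspect the underlying ranger fit (trees, OOB prediction error)
final_fit %>%
  extract_fit_parsnip()
```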
# predictions
```{r}
test_pred <-
  final_fit %>%
  predict(test_df)
```
```{r}
final_result <- test_pred %>%
  bind_cols(test_df) %>%
  select(PassengerId, .pred_class) %>%
  rename(Survived = .pred_class)
```
```{r}
head(final_result)
```
```{r}
final_result %>% count(Survived)
```
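For a Kaggle submission, this two-column table only needs to be written out as CSV (the path below is a placeholder):

```{r}
#| eval: false
# write the submission file; file name and path are arbitrary
write_csv(final_result, 'data/submission.csv')
```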
# Reference
- https://www.kaggle.com/c/titanic/data
- https://rpubs.com/tsadigov/titanic_tidymodels