Code
library(tidyverse)
library(tidymodels)
library(discrim)
Tony Duan
September 1, 2023

Rows: 4,601
Columns: 7
$ crl.tot <dbl> 278, 1028, 2259, 191, 191, 54, 112, 49, 1257, 749, 21, 184, 26…
$ dollar <dbl> 0.000, 0.180, 0.184, 0.000, 0.000, 0.000, 0.054, 0.000, 0.203,…
$ bang <dbl> 0.778, 0.372, 0.276, 0.137, 0.135, 0.000, 0.164, 0.000, 0.181,…
$ money <dbl> 0.00, 0.43, 0.06, 0.00, 0.00, 0.00, 0.00, 0.00, 0.15, 0.00, 0.…
$ n000 <dbl> 0.00, 0.43, 1.16, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.19, 0.…
$ make <dbl> 0.00, 0.21, 0.06, 0.00, 0.00, 0.00, 0.00, 0.00, 0.15, 0.06, 0.…
$ yesno <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y…


spam |>
pivot_longer(dollar:make) |>
mutate(
yesno = if_else(yesno == "n", "Not spam", "Spam"),
value = if_else(value > 0, "Greater than zero", "Zero")
) |>
ggplot(aes(value, fill = yesno)) +
geom_bar(alpha = 0.8) +
facet_wrap(vars(name)) +
theme(legend.position="bottom") +
labs(fill = NULL, x = NULL)
split data into spam_train and spam_test
testing data
# A tibble: 3 × 3
yesno count percentage
<chr> <int> <dbl>
1 n 697 0.61
2 y 454 0.39
3 Total 1151 1
training data:
# A tibble: 3 × 3
yesno count percentage
<chr> <int> <dbl>
1 n 2091 0.61
2 y 1359 0.39
3 Total 3450 1
Rows: 1,151
Columns: 7
$ crl.tot <dbl> 54, 49, 21, 25, 2259, 59, 264, 186, 898, 750, 66, 96, 121, 17,…
$ dollar <dbl> 0.000, 0.000, 0.000, 0.000, 0.046, 0.000, 0.000, 0.000, 0.230,…
$ bang <dbl> 0.000, 0.000, 0.462, 0.000, 0.250, 0.886, 0.000, 0.963, 0.278,…
$ money <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.26, 0.00, 0.…
$ n000 <dbl> 0.00, 0.00, 0.00, 0.00, 0.05, 0.00, 0.00, 0.00, 0.19, 0.19, 0.…
$ make <dbl> 0.00, 0.00, 0.00, 0.00, 0.05, 1.17, 0.00, 0.00, 0.46, 0.06, 0.…
$ yesno <fct> y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,…
10 fold cross-validation resamples
# 10-fold cross-validation using stratification
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [3104/346]> Fold01
2 <split [3105/345]> Fold02
3 <split [3105/345]> Fold03
4 <split [3105/345]> Fold04
5 <split [3105/345]> Fold05
6 <split [3105/345]> Fold06
7 <split [3105/345]> Fold07
8 <split [3105/345]> Fold08
9 <split [3105/345]> Fold09
10 <split [3106/344]> Fold10
#naive_Bayes model
nb_spec = naive_Bayes()
nb_spec_tune <- naive_Bayes(smoothness = tune())
#mars model
mars_spec <- mars() %>%
set_mode("classification")
mars_spec_tune <- mars(num_terms = tune()) %>%
set_mode("classification")
#random_forest model
rf_spec <- rand_forest(trees = 1e3) %>%
set_mode("classification")
rf_spec_tune <- rand_forest(trees = 1e3, mtry = tune(), min_n = tune()) |>
set_mode("classification")# A workflow set/tibble: 6 × 4
wflow_id info option result
<chr> <list> <list> <list>
1 formula_nb <tibble [1 × 4]> <opts[0]> <list [0]>
2 formula_mars <tibble [1 × 4]> <opts[0]> <list [0]>
3 formula_rf <tibble [1 × 4]> <opts[0]> <list [0]>
4 formula_nb_tune <tibble [1 × 4]> <opts[0]> <list [0]>
5 formula_mars_tune <tibble [1 × 4]> <opts[0]> <list [0]>
6 formula_rf_tune <tibble [1 × 4]> <opts[0]> <list [0]>
total correctness: Accuracy = (TN + TP)/(TN + TP + FN + FP) = (Number of correct assessments)/(Number of all assessments)
true positive: Sensitivity = TP/(TP + FN) = (Number of true positive assessments)/(Number of all positive assessments)
true negative: Specificity = TN/(TN + FP) = (Number of true negative assessments)/(Number of all negative assessments)
formula_rf is best on accuracy and formula_rf_tune is best on specificity
# A tibble: 81 × 9
wflow_id .config .metric mean std_err n preprocessor model rank
<chr> <chr> <chr> <dbl> <dbl> <int> <chr> <chr> <int>
1 formula_rf Preproc… accura… 0.881 0.00650 10 formula rand… 1
2 formula_rf Preproc… sensit… 0.940 0.00605 10 formula rand… 1
3 formula_rf Preproc… specif… 0.792 0.0110 10 formula rand… 1
4 formula_rf_tune Preproc… accura… 0.880 0.00650 10 formula rand… 2
5 formula_rf_tune Preproc… sensit… 0.934 0.00520 10 formula rand… 2
6 formula_rf_tune Preproc… specif… 0.798 0.0107 10 formula rand… 2
7 formula_rf_tune Preproc… accura… 0.880 0.00605 10 formula rand… 3
8 formula_rf_tune Preproc… sensit… 0.935 0.00523 10 formula rand… 3
9 formula_rf_tune Preproc… specif… 0.795 0.0106 10 formula rand… 3
10 formula_rf_tune Preproc… accura… 0.880 0.00634 10 formula rand… 4
# ℹ 71 more rows
# Resampling results
# Manual resampling
# A tibble: 1 × 6
splits id .metrics .notes .predictions .workflow
<list> <chr> <list> <list> <list> <list>
1 <split [3450/1151]> train/test split <tibble> <tibble> <tibble> <workflow>
confusion matrix on test data:
Truth
Prediction n y
n 664 106
y 33 348
[1] 0.8792354
[1] 0.9526542
[1] 0.7665198
bang is most important variable
model name "spam-email-rf"
https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-15/readme.md
https://juliasilge.com/blog/spam-email/
---
title: "Evaluate multiple modeling"
author: "Tony Duan"
date: "2023-09-01"
categories: [analysis]
execute:
warning: false
error: false
format:
html:
toc: true
code-fold: show
code-tools: true
number-sections: true
code-block-bg: true
code-block-border-left: "#31BAE9"
---
{width="600"}
# library
```{r}
library(tidyverse)
library(tidymodels)
library(discrim)
```
# data input
```{r}
spam <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-15/spam.csv')
```
```{r}
glimpse(spam)
```
{width="563"}
# stat
```{r}
spam %>% group_by(yesno) %>% summarise(count=n()) %>% mutate(percentage=round(count/sum(count),2))
```
# plotting
```{r}
# Density of the total capital-run length by class; log10 x axis because
# crl.tot is heavily right-skewed
spam |>
  ggplot(aes(x = crl.tot, fill = yesno, color = yesno)) +
  geom_density(linewidth = 1.2, alpha = 0.2) +
  scale_x_log10() +
  labs(fill = "Spam?", color = "Spam?")
```
```{r}
# For each feature, how often it is zero vs. non-zero, split by spam status
spam |>
  pivot_longer(dollar:make) |>
  mutate(
    yesno = if_else(yesno == "n", "Not spam", "Spam"),
    value = if_else(value > 0, "Greater than zero", "Zero")
  ) |>
  ggplot(aes(x = value, fill = yesno)) +
  geom_bar(alpha = 0.8) +
  facet_wrap(vars(name)) +
  theme(legend.position = "bottom") +
  labs(fill = NULL, x = NULL)
```
# model data
split data into spam_train and spam_test
```{r}
# Reproducible train/test split, stratified on the outcome so both sets
# keep the same spam/not-spam proportions. Uses `<-` consistently for
# assignment (the original mixed `=` and `<-`).
set.seed(123)
spam_split <- spam |>
  # the outcome must be a factor for classification models downstream
  mutate(yesno = as.factor(yesno)) |>
  initial_split(strata = yesno)
spam_train <- training(spam_split)
spam_test <- testing(spam_split)
```
testing data
```{r}
# Class counts and shares in the test set, with a "Total" row appended.
# The bind_rows(summarise(., across(...))) trick sums every numeric column
# for the extra row and labels it "Total"; binding the character "Total"
# onto the factor `yesno` coerces that column to character in the result.
spam_test %>% group_by(yesno) %>% summarise(count=n()) %>% mutate(percentage=round(count/sum(count),2))%>%
bind_rows(summarise(., across(where(is.numeric), sum)
,across(where(is.character), ~'Total')
,across(where(is.factor), ~'Total')
)
)
```
training data:
```{r}
# Same summary for the training set: class counts, shares, and a "Total"
# row produced by summing the numeric columns and relabelling the factor
# column to the character "Total" (which coerces `yesno` to character).
spam_train %>% group_by(yesno) %>% summarise(count=n()) %>% mutate(percentage=round(count/sum(count),2))%>%
bind_rows(summarise(., across(where(is.numeric), sum)
,across(where(is.character), ~'Total')
,across(where(is.factor), ~'Total')
)
)
```
```{r}
glimpse(spam_test)
```
10 fold cross-validation resamples
```{r}
# Ten cross-validation folds of the training data, stratified on the
# outcome so each fold preserves the class balance
set.seed(234)
spam_folds <- spam_train |> vfold_cv(strata = yesno)
spam_folds
```
# model
## model
```{r}
# Model specifications: a plain and a tunable variant of each of three
# model families. Uses `<-` and the native pipe consistently (the original
# mixed `=`/`<-` and `%>%`/`|>`).

# Naive Bayes (classification is the only mode, so none is set)
nb_spec <- naive_Bayes()
nb_spec_tune <- naive_Bayes(smoothness = tune())

# MARS (multivariate adaptive regression splines)
mars_spec <- mars() |>
  set_mode("classification")
mars_spec_tune <- mars(num_terms = tune()) |>
  set_mode("classification")

# Random forest with 1000 trees; the tunable variant tunes mtry and min_n
rf_spec <- rand_forest(trees = 1e3) |>
  set_mode("classification")
rf_spec_tune <- rand_forest(trees = 1e3, mtry = tune(), min_n = tune()) |>
  set_mode("classification")
```
## workflow with 6 models
```{r}
# Bundle all six model specifications with one shared formula preprocessor
# into a workflow set, so they can be tuned/fit with a single call later
spam_models <- workflow_set(
  preproc = list(formula = yesno ~ .),
  models = list(
    nb        = nb_spec,
    mars      = mars_spec,
    rf        = rf_spec,
    nb_tune   = nb_spec_tune,
    mars_tune = mars_spec_tune,
    rf_tune   = rf_spec_tune
  )
)
spam_models
```
# training
## training naive Bayes with train data
```{r}
# Fit the plain naive Bayes model on the training data via a workflow.
# Uses `<-` consistently for assignment (the original used `=` with
# inconsistent spacing).
spam_rec <- recipe(yesno ~ ., data = spam_train)
spam_wflow <- workflow() |>
  add_model(nb_spec) |>
  add_recipe(spam_rec)
spam_nb_fit <- spam_wflow |> fit(data = spam_train)
```
```{r}
flights_test_pred <- augment(spam_nb_fit, spam_test)
```
```{r}
# ROC curve for the naive Bayes model, scored on P(class = "n")
flights_test_pred |>
  roc_curve(truth = yesno, .pred_n) |>
  autoplot()
```
```{r}
# Confusion matrix for naive Bayes on the test set
flights_test_pred %>%conf_mat(yesno, .pred_class)
# Accuracy computed by hand from confusion-matrix counts.
# NOTE(review): these hard-coded numbers come from a previous run and will
# go stale if the data or seed changes -- confirm they match the matrix
# printed just above.
(679+208)/(679+208+18+246)
```
## training with resamples
total correctness: Accuracy = (TN + TP)/(TN + TP + FN + FP) = (Number of correct assessments)/(Number of all assessments)
true positive: Sensitivity = TP/(TP + FN) = (Number of true positive assessments)/(Number of all positive assessments)
true negative: Specificity = TN/(TN + FP) = (Number of true negative assessments)/(Number of all negative assessments)
```{r}
# Tune or fit every workflow in the set across the ten CV folds (in
# parallel), collecting accuracy, sensitivity and specificity per candidate
set.seed(123)
doParallel::registerDoParallel()
spam_res <- spam_models |>
  workflow_map(
    "tune_grid",
    resamples = spam_folds,
    metrics = metric_set(accuracy, sensitivity, specificity)
  )
```
# result plot
```{r}
autoplot(spam_res)
```
formula_rf is best on accuracy and formula_rf_tune is best on specificity
```{r}
rank_results(spam_res, rank_metric = "accuracy")
```
# Train with random forest and evaluate final model
```{r}
# Final random-forest workflow. The ranger engine records impurity-based
# variable importance so it can be plotted after fitting.
spam_wf <- workflow(
  yesno ~ .,
  rf_spec |> set_engine("ranger", importance = "impurity")
)
# Fit once on the full training set and evaluate on the held-out test set
spam_fit <- last_fit(spam_wf, spam_split)
spam_fit
```
random forest confusion matrix on test data:
```{r}
# Confusion matrix of the final model's test-set predictions
cmat <- spam_fit |>
  collect_predictions() |>
  conf_mat(yesno, .pred_class)
cmat
```
```{r}
autoplot(cmat, type="heatmap")
```
```{r}
# Test-set metrics derived from the confusion matrix object itself, rather
# than hard-coded counts (which silently go stale whenever the data, the
# seed, or the model changes). In `cmat$table` rows are predictions and
# columns are truth; class "n" (not spam) is treated as the positive class,
# matching the original hand calculations.
tab <- cmat$table

# total correctness:
# Accuracy = (TN + TP)/(TN + TP + FN + FP) = correct / all assessments
sum(diag(tab)) / sum(tab)

# true positive rate:
# Sensitivity = TP/(TP + FN) = correctly predicted "n" / all true "n"
tab["n", "n"] / sum(tab[, "n"])

# true negative rate:
# Specificity = TN/(TN + FP) = correctly predicted "y" / all true "y"
tab["y", "y"] / sum(tab[, "y"])
```
```{r}
# ROC curve for the final random-forest model on the test set
spam_fit |>
  collect_predictions() |>
  roc_curve(yesno, .pred_n) |>
  autoplot()
```
bang is most important variable
```{r}
# Variable-importance plot from the fitted ranger engine
# (available because the engine was set with importance = "impurity")
library(vip)
spam_fit |>
  extract_workflow() |>
  extract_fit_parsnip() |>
  vip()
```
# Create a deployable model object
model name "spam-email-rf"
```{r}
# Wrap the fitted workflow as a versionable, deployable vetiver model
library(vetiver)
v <- spam_fit |>
  extract_workflow() |>
  vetiver_model("spam-email-rf")
v
```
# deploy model
```{r}
# Serve the vetiver model as a local REST API with plumber. Left commented
# out so rendering this document does not start a blocking web server.
#library(plumber)
#pr() %>%
# vetiver_api(v) %>%
# pr_run()
```
# Reference
https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-15/readme.md
https://juliasilge.com/blog/spam-email/