Code
library(tidyverse)
library(tidymodels)
library(discrim)
Tony Duan
September 1, 2023

Rows: 4,601
Columns: 7
$ crl.tot <dbl> 278, 1028, 2259, 191, 191, 54, 112, 49, 1257, 749, 21, 184, 26…
$ dollar <dbl> 0.000, 0.180, 0.184, 0.000, 0.000, 0.000, 0.054, 0.000, 0.203,…
$ bang <dbl> 0.778, 0.372, 0.276, 0.137, 0.135, 0.000, 0.164, 0.000, 0.181,…
$ money <dbl> 0.00, 0.43, 0.06, 0.00, 0.00, 0.00, 0.00, 0.00, 0.15, 0.00, 0.…
$ n000 <dbl> 0.00, 0.43, 1.16, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.19, 0.…
$ make <dbl> 0.00, 0.21, 0.06, 0.00, 0.00, 0.00, 0.00, 0.00, 0.15, 0.06, 0.…
$ yesno <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y…


spam |>
pivot_longer(dollar:make) |>
mutate(
yesno = if_else(yesno == "n", "Not spam", "Spam"),
value = if_else(value > 0, "Greater than zero", "Zero")
) |>
ggplot(aes(value, fill = yesno)) +
geom_bar(alpha = 0.8) +
facet_wrap(vars(name)) +
theme(legend.position="bottom") +
labs(fill = NULL, x = NULL)
split data into spam_train and spam_test
testing data
# A tibble: 3 × 3
yesno count percentage
<chr> <int> <dbl>
1 n 697 0.61
2 y 454 0.39
3 Total 1151 1
training data:
# A tibble: 3 × 3
yesno count percentage
<chr> <int> <dbl>
1 n 2091 0.61
2 y 1359 0.39
3 Total 3450 1
Rows: 1,151
Columns: 7
$ crl.tot <dbl> 54, 49, 21, 25, 2259, 59, 264, 186, 898, 750, 66, 96, 121, 17,…
$ dollar <dbl> 0.000, 0.000, 0.000, 0.000, 0.046, 0.000, 0.000, 0.000, 0.230,…
$ bang <dbl> 0.000, 0.000, 0.462, 0.000, 0.250, 0.886, 0.000, 0.963, 0.278,…
$ money <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.26, 0.00, 0.…
$ n000 <dbl> 0.00, 0.00, 0.00, 0.00, 0.05, 0.00, 0.00, 0.00, 0.19, 0.19, 0.…
$ make <dbl> 0.00, 0.00, 0.00, 0.00, 0.05, 1.17, 0.00, 0.00, 0.46, 0.06, 0.…
$ yesno <fct> y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,…
10 fold cross-validation resamples
# 10-fold cross-validation using stratification
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [3104/346]> Fold01
2 <split [3105/345]> Fold02
3 <split [3105/345]> Fold03
4 <split [3105/345]> Fold04
5 <split [3105/345]> Fold05
6 <split [3105/345]> Fold06
7 <split [3105/345]> Fold07
8 <split [3105/345]> Fold08
9 <split [3105/345]> Fold09
10 <split [3106/344]> Fold10
#naive_Bayes model
nb_spec = naive_Bayes()
nb_spec_tune <- naive_Bayes(smoothness = tune())
#mars model
mars_spec <- mars() %>%
set_mode("classification")
mars_spec_tune <- mars(num_terms = tune()) %>%
set_mode("classification")
#random_forest model
rf_spec <- rand_forest(trees = 1e3) %>%
set_mode("classification")
rf_spec_tune <- rand_forest(trees = 1e3, mtry = tune(), min_n = tune()) |>
set_mode("classification")# A workflow set/tibble: 6 × 4
wflow_id info option result
<chr> <list> <list> <list>
1 formula_nb <tibble [1 × 4]> <opts[0]> <list [0]>
2 formula_mars <tibble [1 × 4]> <opts[0]> <list [0]>
3 formula_rf <tibble [1 × 4]> <opts[0]> <list [0]>
4 formula_nb_tune <tibble [1 × 4]> <opts[0]> <list [0]>
5 formula_mars_tune <tibble [1 × 4]> <opts[0]> <list [0]>
6 formula_rf_tune <tibble [1 × 4]> <opts[0]> <list [0]>
total correctness: Accuracy = (TN + TP)/(TN + TP + FN + FP) = (Number of correct assessments)/(Number of all assessments)
true positive: Sensitivity = TP/(TP + FN) = (Number of true positive assessments)/(Number of all positive assessments)
true negative: Specificity = TN/(TN + FP) = (Number of true negative assessments)/(Number of all negative assessments)
formula_rf is best on accuracy and formula_rf_tune is best on specificity
# A tibble: 81 × 9
wflow_id .config .metric mean std_err n preprocessor model rank
<chr> <chr> <chr> <dbl> <dbl> <int> <chr> <chr> <int>
1 formula_rf Preproc… accura… 0.881 0.00650 10 formula rand… 1
2 formula_rf Preproc… sensit… 0.940 0.00605 10 formula rand… 1
3 formula_rf Preproc… specif… 0.792 0.0110 10 formula rand… 1
4 formula_rf_tune Preproc… accura… 0.880 0.00650 10 formula rand… 2
5 formula_rf_tune Preproc… sensit… 0.934 0.00520 10 formula rand… 2
6 formula_rf_tune Preproc… specif… 0.798 0.0107 10 formula rand… 2
7 formula_rf_tune Preproc… accura… 0.880 0.00605 10 formula rand… 3
8 formula_rf_tune Preproc… sensit… 0.935 0.00523 10 formula rand… 3
9 formula_rf_tune Preproc… specif… 0.795 0.0106 10 formula rand… 3
10 formula_rf_tune Preproc… accura… 0.880 0.00634 10 formula rand… 4
# ℹ 71 more rows
# Resampling results
# Manual resampling
# A tibble: 1 × 6
splits id .metrics .notes .predictions .workflow
<list> <chr> <list> <list> <list> <list>
1 <split [3450/1151]> train/test split <tibble> <tibble> <tibble> <workflow>
confusion matrix on test data:
Truth
Prediction n y
n 664 106
y 33 348
[1] 0.8792354
[1] 0.9526542
[1] 0.7665198
bang is most important variable
model name "spam-email-rf"
https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-15/readme.md
https://juliasilge.com/blog/spam-email/
---
title: "Evaluate multiple modeling"
author: "Tony Duan"
date: "2023-09-01"
categories: [analysis]
execute:
warning: false
error: false
format:
html:
toc: true
code-fold: show
code-tools: true
number-sections: true
code-block-bg: true
code-block-border-left: "#31BAE9"
---
{width="600"}
# library
```{r}
library(tidyverse)
library(tidymodels)
library(discrim)
```
# data input
```{r}
spam <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-15/spam.csv')
```
```{r}
glimpse(spam)
```
{width="563"}
# stat
```{r}
spam %>% group_by(yesno) %>% summarise(count=n()) %>% mutate(percentage=round(count/sum(count),2))
```
# plotting
```{r}
# Density of the total capital-run length by class; log10 x axis because
# crl.tot is heavily right-skewed
spam |>
  ggplot(aes(x = crl.tot, fill = yesno, color = yesno)) +
  geom_density(linewidth = 1.2, alpha = 0.2) +
  scale_x_log10() +
  labs(fill = "Spam?", color = "Spam?")
```
```{r}
# For each feature, how often it is zero vs. non-zero, split by spam status
spam |>
  pivot_longer(dollar:make) |>
  mutate(
    yesno = if_else(yesno == "n", "Not spam", "Spam"),
    value = if_else(value > 0, "Greater than zero", "Zero")
  ) |>
  ggplot(aes(x = value, fill = yesno)) +
  geom_bar(alpha = 0.8) +
  facet_wrap(vars(name)) +
  theme(legend.position = "bottom") +
  labs(fill = NULL, x = NULL)
```
# model data
split data into spam_train and spam_test
```{r}
# Reproducible train/test split, stratified on the outcome so both sets
# keep the same spam/not-spam proportions. Uses `<-` consistently for
# assignment (the original mixed `=` and `<-`).
set.seed(123)
spam_split <- spam |>
  # the outcome must be a factor for classification models downstream
  mutate(yesno = as.factor(yesno)) |>
  initial_split(strata = yesno)
spam_train <- training(spam_split)
spam_test <- testing(spam_split)
```
testing data
```{r}
# Class counts and shares in the test set, with a "Total" row appended.
# The bind_rows(summarise(., across(...))) trick sums every numeric column
# for the extra row and labels it "Total"; binding the character "Total"
# onto the factor `yesno` coerces that column to character in the result.
spam_test %>% group_by(yesno) %>% summarise(count=n()) %>% mutate(percentage=round(count/sum(count),2))%>%
bind_rows(summarise(., across(where(is.numeric), sum)
,across(where(is.character), ~'Total')
,across(where(is.factor), ~'Total')
)
)
```
training data:
```{r}
# Same summary for the training set: class counts, shares, and a "Total"
# row produced by summing the numeric columns and relabelling the factor
# column to the character "Total" (which coerces `yesno` to character).
spam_train %>% group_by(yesno) %>% summarise(count=n()) %>% mutate(percentage=round(count/sum(count),2))%>%
bind_rows(summarise(., across(where(is.numeric), sum)
,across(where(is.character), ~'Total')
,across(where(is.factor), ~'Total')
)
)
```
```{r}
glimpse(spam_test)
```
10 fold cross-validation resamples
```{r}
# Ten cross-validation folds of the training data, stratified on the
# outcome so each fold preserves the class balance
set.seed(234)
spam_folds <- spam_train |> vfold_cv(strata = yesno)
spam_folds
```
# model
## model
```{r}
# Model specifications: a plain and a tunable variant of each of three
# model families. Uses `<-` and the native pipe consistently (the original
# mixed `=`/`<-` and `%>%`/`|>`).

# Naive Bayes (classification is the only mode, so none is set)
nb_spec <- naive_Bayes()
nb_spec_tune <- naive_Bayes(smoothness = tune())

# MARS (multivariate adaptive regression splines)
mars_spec <- mars() |>
  set_mode("classification")
mars_spec_tune <- mars(num_terms = tune()) |>
  set_mode("classification")

# Random forest with 1000 trees; the tunable variant tunes mtry and min_n
rf_spec <- rand_forest(trees = 1e3) |>
  set_mode("classification")
rf_spec_tune <- rand_forest(trees = 1e3, mtry = tune(), min_n = tune()) |>
  set_mode("classification")
```
## workflow with 6 models
```{r}
# Bundle all six model specifications with one shared formula preprocessor
# into a workflow set, so they can be tuned/fit with a single call later
spam_models <- workflow_set(
  preproc = list(formula = yesno ~ .),
  models = list(
    nb        = nb_spec,
    mars      = mars_spec,
    rf        = rf_spec,
    nb_tune   = nb_spec_tune,
    mars_tune = mars_spec_tune,
    rf_tune   = rf_spec_tune
  )
)
spam_models
```
# training
## training naive Bayes with train data
```{r}
# Fit the plain naive Bayes model on the training data via a workflow.
# Uses `<-` consistently for assignment (the original used `=` with
# inconsistent spacing).
spam_rec <- recipe(yesno ~ ., data = spam_train)
spam_wflow <- workflow() |>
  add_model(nb_spec) |>
  add_recipe(spam_rec)
spam_nb_fit <- spam_wflow |> fit(data = spam_train)
```
```{r}
flights_test_pred <- augment(spam_nb_fit, spam_test)
```
```{r}
# ROC curve for the naive Bayes model, scored on P(class = "n")
flights_test_pred |>
  roc_curve(truth = yesno, .pred_n) |>
  autoplot()
```
```{r}
# Confusion matrix for naive Bayes on the test set
flights_test_pred %>%conf_mat(yesno, .pred_class)
# Accuracy computed by hand from confusion-matrix counts.
# NOTE(review): these hard-coded numbers come from a previous run and will
# go stale if the data or seed changes -- confirm they match the matrix
# printed just above.
(679+208)/(679+208+18+246)
```
## training with resamples
total correctness: Accuracy = (TN + TP)/(TN + TP + FN + FP) = (Number of correct assessments)/(Number of all assessments)
true positive: Sensitivity = TP/(TP + FN) = (Number of true positive assessments)/(Number of all positive assessments)
true negative: Specificity = TN/(TN + FP) = (Number of true negative assessments)/(Number of all negative assessments)
```{r}
# Tune or fit every workflow in the set across the ten CV folds (in
# parallel), collecting accuracy, sensitivity and specificity per candidate
set.seed(123)
doParallel::registerDoParallel()
spam_res <- spam_models |>
  workflow_map(
    "tune_grid",
    resamples = spam_folds,
    metrics = metric_set(accuracy, sensitivity, specificity)
  )
```
# result plot
```{r}
autoplot(spam_res)
```
formula_rf is best on accuracy and formula_rf_tune is best on specificity
```{r}
rank_results(spam_res, rank_metric = "accuracy")
```
# Train with random forest and evaluate final model
```{r}
# Final random-forest workflow. The ranger engine records impurity-based
# variable importance so it can be plotted after fitting.
spam_wf <- workflow(
  yesno ~ .,
  rf_spec |> set_engine("ranger", importance = "impurity")
)
# Fit once on the full training set and evaluate on the held-out test set
spam_fit <- last_fit(spam_wf, spam_split)
spam_fit
```
random forest confusion matrix on test data:
```{r}
# Confusion matrix of the final model's test-set predictions
cmat <- spam_fit |>
  collect_predictions() |>
  conf_mat(yesno, .pred_class)
cmat
```
```{r}
autoplot(cmat, type="heatmap")
```
```{r}
# Test-set metrics derived from the confusion matrix object itself, rather
# than hard-coded counts (which silently go stale whenever the data, the
# seed, or the model changes). In `cmat$table` rows are predictions and
# columns are truth; class "n" (not spam) is treated as the positive class,
# matching the original hand calculations.
tab <- cmat$table

# total correctness:
# Accuracy = (TN + TP)/(TN + TP + FN + FP) = correct / all assessments
sum(diag(tab)) / sum(tab)

# true positive rate:
# Sensitivity = TP/(TP + FN) = correctly predicted "n" / all true "n"
tab["n", "n"] / sum(tab[, "n"])

# true negative rate:
# Specificity = TN/(TN + FP) = correctly predicted "y" / all true "y"
tab["y", "y"] / sum(tab[, "y"])
```
```{r}
# ROC curve for the final random-forest model on the test set
spam_fit |>
  collect_predictions() |>
  roc_curve(yesno, .pred_n) |>
  autoplot()
```
bang is most important variable
```{r}
# Variable-importance plot from the fitted ranger engine
# (available because the engine was set with importance = "impurity")
library(vip)
spam_fit |>
  extract_workflow() |>
  extract_fit_parsnip() |>
  vip()
```
# Create a deployable model object
model name "spam-email-rf"
```{r}
# Wrap the fitted workflow as a versionable, deployable vetiver model
library(vetiver)
v <- spam_fit |>
  extract_workflow() |>
  vetiver_model("spam-email-rf")
v
```
# deploy model
```{r}
# Serve the vetiver model as a local REST API with plumber. Left commented
# out so rendering this document does not start a blocking web server.
#library(plumber)
#pr() %>%
# vetiver_api(v) %>%
# pr_run()
```
# Reference
https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-15/readme.md
https://juliasilge.com/blog/spam-email/