[tidytuesday] Chocolate Ratings

data
Author

Tony Duan

Published

January 18, 2022

The data this week comes from Flavors of Cacao by way of Georgios and Kelsey.

Some analysis from 2017 on this data at

Kaggle

Code
library(tidyverse)
library(plotly)
library(lubridate)

1. download data

Code
library(tidytuesdayR)
tuesdata <- tidytuesdayR::tt_load('2022-01-18')

    Downloading file 1 of 1: `chocolate.csv`

2. read data

Code
chocolate <- tuesdata$chocolate

#chocolate <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv')

3. data Dictionary

2530 obs and 10 var

Code
library(tidyverse)
glimpse(chocolate)
Rows: 2,530
Columns: 10
$ ref                              <dbl> 2454, 2458, 2454, 2542, 2546, 2546, 2…
$ company_manufacturer             <chr> "5150", "5150", "5150", "5150", "5150…
$ company_location                 <chr> "U.S.A.", "U.S.A.", "U.S.A.", "U.S.A.…
$ review_date                      <dbl> 2019, 2019, 2019, 2021, 2021, 2021, 2…
$ country_of_bean_origin           <chr> "Tanzania", "Dominican Republic", "Ma…
$ specific_bean_origin_or_bar_name <chr> "Kokoa Kamili, batch 1", "Zorzal, bat…
$ cocoa_percent                    <chr> "76%", "76%", "76%", "68%", "72%", "8…
$ ingredients                      <chr> "3- B,S,C", "3- B,S,C", "3- B,S,C", "…
$ most_memorable_characteristics   <chr> "rich cocoa, fatty, bready", "cocoa, …
$ rating                           <dbl> 3.25, 3.50, 3.75, 3.00, 3.00, 3.25, 3…

4. data cleanning

remove all ‘.’ in company_location with str_replace_all function.replace . need to use “[.]” insteal of “.”

Code
chocolate=chocolate%>% mutate(company_location=str_replace_all(company_location,"[.]",""))

check na,ingredients have 87 na

Code
chocolate %>%
  select(everything()) %>%  # replace to your needs
  summarise_all(list(~sum(is.na(.))))
# A tibble: 1 × 10
    ref company…¹ compa…² revie…³ count…⁴ speci…⁵ cocoa…⁶ ingre…⁷ most_…⁸ rating
  <int>     <int>   <int>   <int>   <int>   <int>   <int>   <int>   <int>  <int>
1     0         0       0       0       0       0       0      87       0      0
# … with abbreviated variable names ¹​company_manufacturer, ²​company_location,
#   ³​review_date, ⁴​country_of_bean_origin, ⁵​specific_bean_origin_or_bar_name,
#   ⁶​cocoa_percent, ⁷​ingredients, ⁸​most_memorable_characteristics

replace all na to unknown

Code
chocolate=chocolate%>% mutate(ingredients=replace_na(ingredients,"unknown"))

5.Visualization

Code
chocolate001=chocolate %>% group_by(review_date) %>% summarise(num=n())

# theme_bw
plot001=ggplot(data=chocolate001, aes(x=review_date, y=num)) +
  geom_line()+
  geom_point()+
  scale_x_continuous(breaks = scales::pretty_breaks(n = 20)) + 
  labs(
    title = "chocolate review number by year",
    subtitle = "from 2006 to 2021",
    caption = "data source: The Manhattan Chocolate Society",
  )+
  ylab("number of chocolate review")+
  xlab("Year")+
  theme_bw()

plot001

Code
chocolate001=chocolate %>% mutate(review_date = ymd(paste(review_date, 1, 1, sep="-")))


ggplot(chocolate001, aes(x = rating, fill = as.factor(review_date))) + 
  geom_density(alpha = .5) + 
  theme_minimal() +
  facet_wrap(~ as.factor(year(review_date))) + 
   xlim(0, 5)+
  guides(fill = FALSE) + labs(x = 'Rating', y = 'Density')

6. model

withn 4 clusters

Code
library(tidymodels)
library(tidyclust)
set.seed(1234)
Code
chocolate_new=chocolate %>% select(-specific_bean_origin_or_bar_name)
Code
kmeans_spec <- k_means(num_clusters = 4) %>%
  set_engine("ClusterR")
kmeans_spec
K Means Cluster Specification (partition)

Main Arguments:
  num_clusters = 4

Computational engine: ClusterR 
Code
rec_spec <- recipe(~ ., data = chocolate_new) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>% 
  step_pca(all_numeric_predictors(), threshold = 0.8)
Code
kmeans_wf <- workflow(rec_spec, kmeans_spec)
kmeans_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: k_means()

── Preprocessor ────────────────────────────────────────────────────────────────
4 Recipe Steps

• step_dummy()
• step_zv()
• step_normalize()
• step_pca()

── Model ───────────────────────────────────────────────────────────────────────
K Means Cluster Specification (partition)

Main Arguments:
  num_clusters = 4

Computational engine: ClusterR 
Code
kmeans_fit <- fit(kmeans_wf, data = chocolate_new)
kmeans_fit
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: k_means()

── Preprocessor ────────────────────────────────────────────────────────────────
4 Recipe Steps

• step_dummy()
• step_zv()
• step_normalize()
• step_pca()

── Model ───────────────────────────────────────────────────────────────────────
KMeans Cluster
 Call: ClusterR::KMeans_rcpp(data = data, clusters = clusters) 
 Data cols: 1843 
 Centroids: 4 
 BSS/SS: 0.003092468 
 SS: 6599959 = 6579549 (WSS) + 20410.16 (BSS)
Code
extract_cluster_assignment(kmeans_fit)
# A tibble: 2,530 × 1
   .cluster 
   <fct>    
 1 Cluster_1
 2 Cluster_1
 3 Cluster_1
 4 Cluster_1
 5 Cluster_1
 6 Cluster_1
 7 Cluster_1
 8 Cluster_1
 9 Cluster_1
10 Cluster_1
# … with 2,520 more rows
Code
extract_centroids(kmeans_fit)
# A tibble: 4 × 1,844
  .cluster PC0001 PC0002  PC0003  PC0004  PC0005  PC0006  PC0007  PC0008  PC0009
  <fct>     <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 Cluster…  0.728 -2.43  -0.836  -0.188   0.276  -1.11    0.739  -1.56   -0.0429
2 Cluster…  0.843 -2.96   0.157   0.127   0.132   0.127  -0.399   0.189   0.486 
3 Cluster…  1.49  -4.98  -0.835  -0.775   0.0760 -1.78    3.00   -0.994   0.415 
4 Cluster… -0.208  0.730 -0.0283 -0.0256 -0.0330 -0.0157  0.0772 -0.0305 -0.115 
# … with 1,834 more variables: PC0010 <dbl>, PC0011 <dbl>, PC0012 <dbl>,
#   PC0013 <dbl>, PC0014 <dbl>, PC0015 <dbl>, PC0016 <dbl>, PC0017 <dbl>,
#   PC0018 <dbl>, PC0019 <dbl>, PC0020 <dbl>, PC0021 <dbl>, PC0022 <dbl>,
#   PC0023 <dbl>, PC0024 <dbl>, PC0025 <dbl>, PC0026 <dbl>, PC0027 <dbl>,
#   PC0028 <dbl>, PC0029 <dbl>, PC0030 <dbl>, PC0031 <dbl>, PC0032 <dbl>,
#   PC0033 <dbl>, PC0034 <dbl>, PC0035 <dbl>, PC0036 <dbl>, PC0037 <dbl>,
#   PC0038 <dbl>, PC0039 <dbl>, PC0040 <dbl>, PC0041 <dbl>, PC0042 <dbl>, …
Code
predict(kmeans_fit, new_data = slice_sample(chocolate_new, n = 10))
# A tibble: 10 × 1
   .pred_cluster
   <fct>        
 1 Cluster_1    
 2 Cluster_1    
 3 Cluster_1    
 4 Cluster_1    
 5 Cluster_1    
 6 Cluster_1    
 7 Cluster_1    
 8 Cluster_2    
 9 Cluster_2    
10 Cluster_1    
Code
predict_group=predict(kmeans_fit, new_data = chocolate_new)
Code
chocolate_all=chocolate%>% mutate(predict=predict_group$'.pred_cluster')
Code
chocolate_all %>% group_by(predict) %>% summarise(num=n(),rate=mean(rating))
# A tibble: 4 × 3
  predict     num  rate
  <fct>     <int> <dbl>
1 Cluster_1  2032  3.21
2 Cluster_2   477  3.14
3 Cluster_3    13  3.31
4 Cluster_4     8  2.88
Code
ggplot(chocolate_all, aes(x = rating, fill = as.factor(predict))) + 
  geom_density(alpha = .5) + 
  theme_minimal() +
  facet_wrap(~ as.factor(predict)) + 
   xlim(0, 5)+
  guides(fill = FALSE) + labs(x = 'Rating', y = 'Density')

Reference

https://github.com/rfordatascience/tidytuesday/blob/master/data/2022/2022-01-18/readme.md