tidystep - [tidytuesday] Chocolate Ratings

The data this week comes from Flavors of Cacao by way of Georgios and Kelsey.

Some analysis from 2017 on this data at

Code

library(tidyverse)
library(plotly)
library(lubridate)

1. download data

Code

library(tidytuesdayR)
tuesdata <- tidytuesdayR::tt_load('2022-01-18')


    Downloading file 1 of 1: `chocolate.csv`

2. read data

Code

chocolate <- tuesdata$chocolate

#chocolate <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv')

3. data Dictionary

2530 obs and 10 var

Code

library(tidyverse)
glimpse(chocolate)

Rows: 2,530
Columns: 10
$ ref                              <dbl> 2454, 2458, 2454, 2542, 2546, 2546, 2…
$ company_manufacturer             <chr> "5150", "5150", "5150", "5150", "5150…
$ company_location                 <chr> "U.S.A.", "U.S.A.", "U.S.A.", "U.S.A.…
$ review_date                      <dbl> 2019, 2019, 2019, 2021, 2021, 2021, 2…
$ country_of_bean_origin           <chr> "Tanzania", "Dominican Republic", "Ma…
$ specific_bean_origin_or_bar_name <chr> "Kokoa Kamili, batch 1", "Zorzal, bat…
$ cocoa_percent                    <chr> "76%", "76%", "76%", "68%", "72%", "8…
$ ingredients                      <chr> "3- B,S,C", "3- B,S,C", "3- B,S,C", "…
$ most_memorable_characteristics   <chr> "rich cocoa, fatty, bready", "cocoa, …
$ rating                           <dbl> 3.25, 3.50, 3.75, 3.00, 3.00, 3.25, 3…

4. data cleanning

remove all ‘.’ in company_location with str_replace_all function.replace . need to use “[.]” insteal of “.”

Code

chocolate=chocolate%>% mutate(company_location=str_replace_all(company_location,"[.]",""))

check na,ingredients have 87 na

Code

chocolate %>%
  select(everything()) %>%  # replace to your needs
  summarise_all(list(~sum(is.na(.))))

# A tibble: 1 × 10
    ref company…¹ compa…² revie…³ count…⁴ speci…⁵ cocoa…⁶ ingre…⁷ most_…⁸ rating
  <int>     <int>   <int>   <int>   <int>   <int>   <int>   <int>   <int>  <int>
1     0         0       0       0       0       0       0      87       0      0
# … with abbreviated variable names ¹company_manufacturer, ²company_location,
#   ³review_date, ⁴country_of_bean_origin, ⁵specific_bean_origin_or_bar_name,
#   ⁶cocoa_percent, ⁷ingredients, ⁸most_memorable_characteristics

replace all na to unknown

Code

chocolate=chocolate%>% mutate(ingredients=replace_na(ingredients,"unknown"))

5.Visualization

Code

chocolate001=chocolate %>% group_by(review_date) %>% summarise(num=n())

# theme_bw
plot001=ggplot(data=chocolate001, aes(x=review_date, y=num)) +
  geom_line()+
  geom_point()+
  scale_x_continuous(breaks = scales::pretty_breaks(n = 20)) + 
  labs(
    title = "chocolate review number by year",
    subtitle = "from 2006 to 2021",
    caption = "data source: The Manhattan Chocolate Society",
  )+
  ylab("number of chocolate review")+
  xlab("Year")+
  theme_bw()

plot001

Code

chocolate001=chocolate %>% mutate(review_date = ymd(paste(review_date, 1, 1, sep="-")))


ggplot(chocolate001, aes(x = rating, fill = as.factor(review_date))) + 
  geom_density(alpha = .5) + 
  theme_minimal() +
  facet_wrap(~ as.factor(year(review_date))) + 
   xlim(0, 5)+
  guides(fill = FALSE) + labs(x = 'Rating', y = 'Density')

6. model

withn 4 clusters

Code

library(tidymodels)
library(tidyclust)
set.seed(1234)

Code

chocolate_new=chocolate %>% select(-specific_bean_origin_or_bar_name)

Code

kmeans_spec <- k_means(num_clusters = 4) %>%
  set_engine("ClusterR")
kmeans_spec

K Means Cluster Specification (partition)

Main Arguments:
  num_clusters = 4

Computational engine: ClusterR

Code

rec_spec <- recipe(~ ., data = chocolate_new) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>% 
  step_pca(all_numeric_predictors(), threshold = 0.8)

Code

kmeans_wf <- workflow(rec_spec, kmeans_spec)
kmeans_wf

══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: k_means()

── Preprocessor ────────────────────────────────────────────────────────────────
4 Recipe Steps

• step_dummy()
• step_zv()
• step_normalize()
• step_pca()

── Model ───────────────────────────────────────────────────────────────────────
K Means Cluster Specification (partition)

Main Arguments:
  num_clusters = 4

Computational engine: ClusterR

Code

kmeans_fit <- fit(kmeans_wf, data = chocolate_new)
kmeans_fit

══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: k_means()

── Preprocessor ────────────────────────────────────────────────────────────────
4 Recipe Steps

• step_dummy()
• step_zv()
• step_normalize()
• step_pca()

── Model ───────────────────────────────────────────────────────────────────────
KMeans Cluster
 Call: ClusterR::KMeans_rcpp(data = data, clusters = clusters) 
 Data cols: 1843 
 Centroids: 4 
 BSS/SS: 0.003092468 
 SS: 6599959 = 6579549 (WSS) + 20410.16 (BSS)

Code

extract_cluster_assignment(kmeans_fit)

# A tibble: 2,530 × 1
   .cluster 
   <fct>    
 1 Cluster_1
 2 Cluster_1
 3 Cluster_1
 4 Cluster_1
 5 Cluster_1
 6 Cluster_1
 7 Cluster_1
 8 Cluster_1
 9 Cluster_1
10 Cluster_1
# … with 2,520 more rows

Code

extract_centroids(kmeans_fit)

# A tibble: 4 × 1,844
  .cluster PC0001 PC0002  PC0003  PC0004  PC0005  PC0006  PC0007  PC0008  PC0009
  <fct>     <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 Cluster…  0.728 -2.43  -0.836  -0.188   0.276  -1.11    0.739  -1.56   -0.0429
2 Cluster…  0.843 -2.96   0.157   0.127   0.132   0.127  -0.399   0.189   0.486 
3 Cluster…  1.49  -4.98  -0.835  -0.775   0.0760 -1.78    3.00   -0.994   0.415 
4 Cluster… -0.208  0.730 -0.0283 -0.0256 -0.0330 -0.0157  0.0772 -0.0305 -0.115 
# … with 1,834 more variables: PC0010 <dbl>, PC0011 <dbl>, PC0012 <dbl>,
#   PC0013 <dbl>, PC0014 <dbl>, PC0015 <dbl>, PC0016 <dbl>, PC0017 <dbl>,
#   PC0018 <dbl>, PC0019 <dbl>, PC0020 <dbl>, PC0021 <dbl>, PC0022 <dbl>,
#   PC0023 <dbl>, PC0024 <dbl>, PC0025 <dbl>, PC0026 <dbl>, PC0027 <dbl>,
#   PC0028 <dbl>, PC0029 <dbl>, PC0030 <dbl>, PC0031 <dbl>, PC0032 <dbl>,
#   PC0033 <dbl>, PC0034 <dbl>, PC0035 <dbl>, PC0036 <dbl>, PC0037 <dbl>,
#   PC0038 <dbl>, PC0039 <dbl>, PC0040 <dbl>, PC0041 <dbl>, PC0042 <dbl>, …

Code

predict(kmeans_fit, new_data = slice_sample(chocolate_new, n = 10))

# A tibble: 10 × 1
   .pred_cluster
   <fct>        
 1 Cluster_1    
 2 Cluster_1    
 3 Cluster_1    
 4 Cluster_1    
 5 Cluster_1    
 6 Cluster_1    
 7 Cluster_1    
 8 Cluster_2    
 9 Cluster_2    
10 Cluster_1

Code

predict_group=predict(kmeans_fit, new_data = chocolate_new)

Code

chocolate_all=chocolate%>% mutate(predict=predict_group$'.pred_cluster')

Code

chocolate_all %>% group_by(predict) %>% summarise(num=n(),rate=mean(rating))

# A tibble: 4 × 3
  predict     num  rate
  <fct>     <int> <dbl>
1 Cluster_1  2032  3.21
2 Cluster_2   477  3.14
3 Cluster_3    13  3.31
4 Cluster_4     8  2.88

Code

ggplot(chocolate_all, aes(x = rating, fill = as.factor(predict))) + 
  geom_density(alpha = .5) + 
  theme_minimal() +
  facet_wrap(~ as.factor(predict)) + 
   xlim(0, 5)+
  guides(fill = FALSE) + labs(x = 'Rating', y = 'Density')

Reference

https://github.com/rfordatascience/tidytuesday/blob/master/data/2022/2022-01-18/readme.md