Tidy Tuesday: 2024-11-19

Bob's Burgers

By Steve Ewing in Tidy Tuesday

November 18, 2024

First copy the cool graphic that Steven Ponce made.

## 1. LOAD PACKAGES & SETUP ----
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  tidyverse,         # Easily Install and Load the 'Tidyverse'
  ggtext,            # Improved Text Rendering Support for 'ggplot2'
  showtext,          # Using Fonts More Easily in R Graphs
  janitor,           # Simple Tools for Examining and Cleaning Dirty Data
  skimr,             # Compact and Flexible Summaries of Data
  scales,            # Scale Functions for Visualization
  glue,              # Interpreted String Literals
  bobsburgersR,      # Bob's Burgers Datasets for Data Visualization
  tidytext,          # Text Mining using 'dplyr', 'ggplot2', and Other Tidy Tools 
  textdata,          # Download and Load Various Text Datasets 
  patchwork          # The Composer of Plots
)    

### |- figure size ----
camcorder::gg_record(
  dir    = here::here("temp_plots"),
  device = "png",
  width  =  9,
  height =  10,
  units  = "in",
  dpi    = 320
)

## Warning in camcorder::gg_record(dir = here::here("temp_plots"), device = "png",
## : Writing to a folder that already exists. gg_playback may use more files than
## intended!

### |- resolution ----
showtext_opts(dpi = 320, regular.wt = 300, bold.wt = 800)

bobsburgersR::transcript_data

## # A tibble: 181,031 × 6
##    season episode title        line raw_text                            dialogue
##     <dbl>   <dbl> <chr>       <dbl> <chr>                               <chr>   
##  1      1       1 Human Flesh     1 <NA>                                <NA>    
##  2      1       1 Human Flesh     2 <NA>                                <NA>    
##  3      1       1 Human Flesh     3 <NA>                                <NA>    
##  4      1       1 Human Flesh     4 Listen, pep talk.                   Listen,…
##  5      1       1 Human Flesh     5 Big day today.                      Big day…
##  6      1       1 Human Flesh     6 It's our grand re-re-re-opening.    It's ou…
##  7      1       1 Human Flesh     7 It's labor day weekend, And it loo… It's la…
##  8      1       1 Human Flesh     8 So we have to-- Big day for anothe… So we h…
##  9      1       1 Human Flesh     9 Go ahead, sorry.                    Go ahea…
## 10      1       1 Human Flesh    10 Go ahead, do your pep.              Go ahea…
## # ℹ 181,021 more rows

glimpse(transcript_data)

## Rows: 181,031
## Columns: 6
## $ season   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ episode  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ title    <chr> "Human Flesh", "Human Flesh", "Human Flesh", "Human Flesh", "…
## $ line     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ raw_text <chr> NA, NA, NA, "Listen, pep talk.", "Big day today.", "It's our …
## $ dialogue <chr> NA, NA, NA, "Listen, pep talk.", "Big day today.", "It's our …

skim(transcript_data)

Table: Table 1: Data summary


Name	transcript_data
Number of rows	181031
Number of columns	6
_______________________
Column type frequency:
character	3
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
title	0	1.00	5	78	272
raw_text	33159	0.82	1	1233	127978
dialogue	33331	0.82	1	1157	127532

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
season	1	9.31	4.00	1	6	10	13	14	▂▃▂▃▇
episode	1	10.54	6.14	1	5	10	16	23	▇▆▇▅▅
line	1	483.43	459.72	1	167	333	557	2004	▇▃▁▁▁

# Calculate metrics 
episode_metrics <- transcript_data |>
  filter(!is.na(dialogue)) |>
  group_by(season, episode) |>
  summarise(
    # Basic dialogue metrics
    dialogue_density = n() / max(line),
    avg_length       = mean(str_length(dialogue)),
    
    # Sentiment analysis - AFINN Sentiment Lexicon
    sentiment_variance = dialogue |>
      tibble(text = _) |>
      unnest_tokens(word, text) |>
      inner_join(get_sentiments("afinn"), by = "word") |>
      pull(value) |>
      var(na.rm = TRUE),
    
    # Word and punctuation metrics  
    unique_words      = dialogue |>
      str_split("\\s+") |>
      unlist() |>
      n_distinct(),
    
    question_ratio    = mean(str_detect(dialogue, "\\?")),
    exclamation_ratio = mean(str_detect(dialogue, "!")),
    
    .groups = "drop"
  ) |>
  # Scale all metrics
  mutate(across(dialogue_density:exclamation_ratio, scale))


# Prepare data for visualization 
episode_metrics_long <- episode_metrics |>
  pivot_longer(
    cols = c(dialogue_density:exclamation_ratio),
    names_to = "metric",
    values_to = "value"
  ) |>
  mutate(
    angle = (as.numeric(factor(metric)) - 1) * 2 * pi / 6,
    hjust = ifelse(angle < pi, 1, 0),
    metric = case_when(
      metric == "dialogue_density" ~ "Dialogue\nDensity",
      metric == "avg_length" ~ "Average\nLength",
      metric == "sentiment_variance" ~ "Sentiment\nVariance",
      metric == "unique_words" ~ "Unique\nWords",
      metric == "question_ratio" ~ "Question\nRatio",
      metric == "exclamation_ratio" ~ "Exclamation\nRatio"
    )
  )

### |- plot aesthetics ----
bkg_col      <- "#f5f5f2"  
title_col    <- "gray20"           
subtitle_col <- "gray20"     
caption_col  <- "gray30"   
text_col     <- "gray20"   

### |-  titles and caption ----
# icons
tt <- str_glue("Source: {{bobsburgersR}}")
li <- str_glue("<span style='font-family:sans'>&#xf08c;</span>")
gh <- str_glue("<span style='font-family:sans'>&#xf09b;</span>")
bs <- str_glue("<span style='font-family:sans'>&#xe671; </span>")

# text
light_purple <- str_glue("<span style='color:#A374C2'>**Light Purple**</span>")
dark_purple  <- str_glue("<span style='color:#8856a7'>**Dark Purple**</span>")

title_text    <- str_glue("Bob's Burgers Episode Fingerprints by Season")
subtitle_text <- str_glue("Analyzing dialogue patterns across seasons<br><br>
                          **Note:** Metrics are standardized (**z-scores**). { light_purple } polygons represent individual episodes.<br>
                          { dark_purple } line shows season average.")
caption_text  <- str_glue("{li} stevenponce &bull; {gh} poncest &bull; #rstats #ggplot2 &bull; {tt}")

### |-  fonts ----
# font_add("fa6-brands", here::here("fonts/6.4.2/Font Awesome 6 Brands-Regular-400.otf"))
font_add_google("Oswald", regular.wt = 400, family = "title")
font_add_google("Source Sans Pro", family = "subtitle")
font_add_google("Source Sans Pro", family = "text")  
font_add_google("Roboto Mono", family = "numbers")  
font_add_google("Noto Sans", regular.wt = 400, family = "caption")
showtext_auto(enable = TRUE)


### |-  plot theme ----
theme_set(theme_minimal(base_size = 14, base_family = "text"))                

theme_update(
  plot.title.position   = "plot",
  plot.caption.position = "plot",
  legend.position       = "plot",
  plot.background       = element_rect(fill = bkg_col, color = bkg_col),
  panel.background      = element_rect(fill = bkg_col, color = bkg_col),
  plot.margin           = margin(t = 10, r = 10, b = 10, l = 10),
  axis.title.x          = element_text(margin = margin(10, 0, 0, 0), size = rel(1.1), 
                                       color = text_col, family = "text", face = "bold", hjust = 0.5),
  axis.title.y          = element_text(margin = margin(0, 10, 0, 0), size = rel(1.1), 
                                       color = text_col, family = "text", face = "bold", hjust = 0.5),
  axis.text             = element_text(size = rel(0.5), color = text_col, family = "text"),
  strip.text            = element_text(size = rel(1), face = "bold", margin = margin(b = 10), family = "text"),
  panel.grid.major      = element_line(color = "gray90", linewidth = 0.2),
  panel.spacing.x       = unit(3, "lines"),  
  panel.spacing.y       = unit(1, "lines"),  
  aspect.ratio          = 1  
)

### |- main plot ---- 
main_plot <- episode_metrics_long |>   
  ggplot(aes(x = metric, y = value)) +

  # Geoms
  # Add grid lines
  geom_hline(yintercept = seq(-2, 7, by = 1), color = "gray90", linewidth = 0.2) +
  
  # Add episode polygons
  geom_polygon(aes(group = interaction(season, episode)),
               fill = "#8856a7",
               alpha = 0.2) +
  
  # Add season average line
  stat_summary(aes(group = season),
               fun = 'mean', 
               geom = "path",
               color = "#8856a7",
               size = 0.8,
               alpha = 0.9,
               na.rm = TRUE) +
  
  # Scales
  scale_x_discrete(expand = expansion(add = 1.2)) +  
  scale_y_continuous(
    expand = expansion(add = c(0.5, 1)),
    limits = c(-2, 7)                 
  ) +
  coord_polar(clip = 'off') +

  # Labs
  labs(
    x = NULL,
    y = NULL,
  ) +
  
  # Facet 
  facet_wrap(~season, nrow = 4, 
             labeller = labeller(season = function(x) paste("Season", x))
  ) +
  
  # Theme
  theme(
    axis.text.y = element_blank(),
    axis.title = element_blank(),
    plot.margin = margin(0, 0, 0, 0)
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

### |- key pattern plot ----
key_patterns_plot <- ggplot() +
  annotate(
    "richtext",
    x = 0,
    y = 0,
    label = glue::glue(
      "<span style='font-family:title;font-size:12pt;color:{title_col}'>**Key Patterns:**</span><br>
      <span style='font-family:subtitle;font-size:9pt;color:{text_col}'>
      • Early seasons (1-3): more experimental patterns<br>
      • Middle seasons (4-8): consistent style<br>
      • Later seasons: refined structure<br>
      • Higher variance: character episodes<br>
      • Higher question ratios: mystery plots
      </span>"
    ),
    fill = NA,
    label.color = NA,
    hjust = 0,
    vjust = 1.2,
  ) +
  theme_void() +
  theme(
    plot.margin = margin(5, 10, 5, 10)      
  )

### |- title & subtitle plot ----
title_plot <- ggplot() +
  labs(
    title = title_text,
    subtitle = subtitle_text
  ) +
  theme_void() +
  theme(
    plot.title      = element_text(
      size          = rel(1.8),
      family        = "title",
      face          = "bold",
      color         = title_col,
      lineheight    = 1.1,
      margin        = margin(t = 5, b = 5)
    ),   
    plot.subtitle   = element_markdown(
      size          = rel(1.1),
      family        = "subtitle",
      color         = subtitle_col,
      lineheight    = 1.1,
      margin        = margin(t = 5, b = 15)
    ),
    plot.background = element_rect(fill = bkg_col, color = bkg_col),
    plot.margin     = margin(10, 10, 0, 10)
  )

### |- combined plot ----

# Define layout design with adjusted areas

# area(t, l, b, r)

# where:
# t = top row position
# l = left column position
# b = bottom row position
# r = right column position

design <- c(
    area(1, 1, 1, 6),      # title area
    area(2, 1, 5, 6),      # main plot area
    area(4, 2, 5, 6)       # key patterns area 
)

combined_plot <- title_plot +  main_plot + key_patterns_plot +
  plot_layout(
    design = design,
    heights = c(0.8, 4, 4, 4, 0.2), 
    widths = c(1, 1, 1, 1, 1, 0.2)
  ) +
  plot_annotation(
    caption = caption_text,
    theme = theme(
      plot.background = element_rect(fill = bkg_col, color = bkg_col),
      plot.margin     = margin(10, 10, 10, 10),
      plot.caption    = element_markdown(
        size          = rel(0.65),
        family        = "caption",
        color         = caption_col,
        lineheight    = 1.1,
        hjust         = 0.5,
        margin        = margin(t = 10, b = 5)
      )
    )
  )

combined_plot

So that’s it! We replicated the original plot. Now lets add on the wikipedia data by episode and see if any of the metrics influence the ratings. It isnt rendering in the document when it gets published so here the png.

finished product

imdb_wikipedia_data <- bobsburgersR::imdb_wikipedia_data

imdb_wikipedia_data

## # A tibble: 275 × 11
##    episode_overall imdb_aired_date  year season episode imdb_title        rating
##              <dbl> <date>          <dbl>  <dbl>   <dbl> <chr>              <dbl>
##  1               1 2011-01-08       2011      1       1 Human Flesh          7.7
##  2               2 2011-01-15       2011      1       2 Crawl Space          8.1
##  3               3 2011-01-22       2011      1       3 Sacred Cow           7.5
##  4               4 2011-02-12       2011      1       4 Sexy Dance Fight…    7.4
##  5               5 2011-02-19       2011      1       5 Hamburger Dinner…    7.5
##  6               6 2011-03-05       2011      1       6 Sheesh! Cab, Bob?    8.3
##  7               7 2011-03-12       2011      1       7 Bed & Breakfast      7.5
##  8               8 2011-03-19       2011      1       8 Art Crawl            7.9
##  9               9 2011-03-26       2011      1       9 Spaghetti Wester…    7.6
## 10              10 2011-04-09       2011      1      10 Burger War           7.7
## # ℹ 265 more rows
## # ℹ 4 more variables: synopsis <chr>, wikipedia_directed_by <chr>,
## #   wikipedia_written_by <chr>, wikipedia_viewers <dbl>

glimpse(imdb_wikipedia_data)

## Rows: 275
## Columns: 11
## $ episode_overall       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ imdb_aired_date       <date> 2011-01-08, 2011-01-15, 2011-01-22, 2011-02-12,…
## $ year                  <dbl> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, …
## $ season                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, …
## $ episode               <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 2,…
## $ imdb_title            <chr> "Human Flesh", "Crawl Space", "Sacred Cow", "Sex…
## $ rating                <dbl> 7.7, 8.1, 7.5, 7.4, 7.5, 8.3, 7.5, 7.9, 7.6, 7.7…
## $ synopsis              <chr> "On top of watching his dysfunctional family cru…
## $ wikipedia_directed_by <chr> "Anthony Chun", "Kyounghee Lim", "Jennifer Coyle…
## $ wikipedia_written_by  <chr> "Loren Bouchard & Jim Dauterive", "Loren Bouchar…
## $ wikipedia_viewers     <dbl> 9.38, 5.07, 4.81, 4.19, 4.87, 4.91, 4.10, 4.43, …

skim(imdb_wikipedia_data)

Table: Table 2: Data summary


Name	imdb_wikipedia_data
Number of rows	275
Number of columns	11
_______________________
Column type frequency:
character	4
Date	1
numeric	6
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
imdb_title	0	1.00	5	77	275
synopsis	0	1.00	18	872	275
wikipedia_directed_by	6	0.98	3	33	33
wikipedia_written_by	6	0.98	3	77	31

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
imdb_aired_date	0	1	2011-01-08	2024-09-21	2018-03-17	266

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
episode_overall	0	1.00	138.00	79.53	1.00	69.50	138.0	206.5	275.00	▇▇▇▇▇
year	0	1.00	2017.47	3.72	2011.00	2014.00	2018.0	2021.0	2024.00	▆▇▆▇▆
season	0	1.00	7.84	3.80	1.00	5.00	8.0	11.0	14.00	▆▇▅▇▇
episode	0	1.00	10.73	6.19	1.00	5.00	10.0	16.0	23.00	▇▆▇▅▅
rating	2	0.99	7.70	0.44	6.70	7.40	7.6	8.0	9.60	▂▇▃▁▁
wikipedia_viewers	8	0.97	2.35	1.29	0.59	1.47	2.0	3.1	9.38	▇▃▁▁▁

combinded_data <- episode_metrics |> 
  left_join(imdb_wikipedia_data, by = c("season", "episode")) |> 
  mutate(
    rating = as.numeric(rating),
    wikipedia_viewers = as.numeric(wikipedia_viewers)
  )

predictors <- combinded_data %>%
  select(dialogue_density, avg_length, sentiment_variance, 
         unique_words, question_ratio, exclamation_ratio, rating)

glm_model <- glm(rating ~ ., data = predictors, family = gaussian())

summary(glm_model)

## 
## Call:
## glm(formula = rating ~ ., family = gaussian(), data = predictors)
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         7.694419   0.025434 302.531  < 2e-16 ***
## dialogue_density    0.153914   0.048316   3.186 0.001618 ** 
## avg_length         -0.197340   0.056095  -3.518 0.000512 ***
## sentiment_variance  0.025194   0.026505   0.951 0.342711    
## unique_words        0.002158   0.029711   0.073 0.942156    
## question_ratio      0.043562   0.032210   1.352 0.177386    
## exclamation_ratio   0.138396   0.032051   4.318 2.23e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1752897)
## 
##     Null deviance: 53.244  on 270  degrees of freedom
## Residual deviance: 46.276  on 264  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 306.08
## 
## Number of Fisher Scoring iterations: 2

Looks like people like them short dense and punchy.

predictors2 <- combinded_data %>%
  select(dialogue_density, avg_length, exclamation_ratio, rating)

glm_model2 <- glm(rating ~ ., data = predictors2, family = gaussian())

summary(glm_model2)

## 
## Call:
## glm(formula = rating ~ ., family = gaussian(), data = predictors2)
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        7.69464    0.02541 302.837  < 2e-16 ***
## dialogue_density   0.15753    0.04572   3.446 0.000661 ***
## avg_length        -0.16881    0.04760  -3.546 0.000461 ***
## exclamation_ratio  0.14059    0.02891   4.864 1.97e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1749506)
## 
##     Null deviance: 53.244  on 270  degrees of freedom
## Residual deviance: 46.712  on 267  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 302.61
## 
## Number of Fisher Scoring iterations: 2

Posted on:: November 18, 2024

Length:: 12 minute read, 2360 words

Categories:: Tidy Tuesday

Tags:: R

See Also:: Teun van den Brand's Plots; Another Kyle Walker Map; Tidy Tuesday Customs and Border Protection