Look at the available datasets

library(tidytuesdayR)
# List the TidyTuesday datasets that can be loaded.
# This will open up in the help window.
tidytuesdayR::tt_available()

What was your dataset?

Load your dataset in with the function below. The input is the date the dataset was issued. You should be able to get this from the tt_available() function.

# Download the TidyTuesday release issued on 2020-08-25.
# Incoming data comes in as a list; inspect it with names(datasets).
datasets <- tidytuesdayR::tt_load("2020-08-25")

## --- Compiling #TidyTuesday Information for 2020-08-25 ----

## --- There is 1 file available ---

## --- Starting Download ---

## 
##  Downloading file 1 of 1: `chopped.tsv`

## --- Download complete ---

# Show the names of the individual datasets held in the list
names(datasets)

## [1] "chopped"

# Pull the single "chopped" table out of the list for the rest of the analysis
chopped <- datasets$chopped

Initial EDA

# Plot an at-a-glance overview of the columns of `chopped`
visdat::vis_dat(chopped)

# Per-column summary of `chopped` (missingness, uniqueness, distribution),
# reported separately for character and numeric variables
skimr::skim(chopped)

Data summary
Name	chopped
Number of rows	569
Number of columns	21
_______________________
Column type frequency:
character	17
numeric	4
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
episode_name	0	1.00	6	77	568
episode_notes	113	0.80	12	830	455
air_date	0	1.00	11	18	564
judge1	1	1.00	9	18	37
judge2	1	1.00	9	18	27
judge3	1	1.00	6	19	95
appetizer	1	1.00	18	135	568
entree	1	1.00	34	126	568
dessert	1	1.00	28	117	568
contestant1	1	1.00	6	45	566
contestant1_info	13	0.98	2	114	502
contestant2	1	1.00	7	48	561
contestant2_info	14	0.98	2	119	510
contestant3	1	1.00	6	53	556
contestant3_info	14	0.98	2	120	505
contestant4	1	1.00	8	45	507
contestant4_info	14	0.98	2	140	494

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
season	0	1.00	23.12	12.96	1.0	12.0	23.0	35.0	45.0	▇▇▇▇▇
season_episode	0	1.00	7.02	3.98	1.0	4.0	7.0	10.0	20.0	▇▇▇▂▁
series_episode	0	1.00	284.85	164.16	1.0	143.0	285.0	427.0	567.0	▇▇▇▇▇
episode_rating	105	0.82	8.38	0.48	5.5	8.2	8.5	8.7	9.2	▁▁▂▅▇

What was your question?

Given your initial exploration of the data, what was the question you wanted to answer?

Which Judge is associated with the highest ratings?

Fixing the data

Need to recode some names here. Also collapse judge1, judge2 and judge3 into a single column using pivot_longer().

# Stack judge1/judge2/judge3 into a single `name` column (one row per
# episode-judge pair; the source column is kept in `judge`), then merge
# misspelled or truncated spellings of the same judge into one level.
# fct_recode() takes `new = "old"` pairs and turns `name` into a factor.
# Without the extra recodes, a judge such as Chris Santos is counted and
# plotted as two different people ("Chris Santos" and "Chris Santo").
chopped_collapsed <- chopped %>%
  tidyr::pivot_longer(
    cols = contains("judge"),
    names_to = "judge",
    values_to = "name"
  ) %>%
  mutate(
    name = fct_recode(
      name,
      `Aarón Sanchez` = "Aarón Sánchez",
      `Amanda Freitag` = "Amanda Freita",
      `Amanda Freitag` = "Amanda Frietag",
      `Chris Santos` = "Chris Santo",
      `Geoffrey Zakarian` = "Geoffrey Zacharian",
      `Jody Williams` = "Jody William",
      `Maneet Chauhan` = "Maneet Chauhaun",
      `Missy Robbins` = "Missy Robbin"
    )
  )

Frequency of Judge appearances

Who is the most frequent judge on Chopped?

# Tally appearances per judge, most frequent first, and render as a table
chopped_counts <- count(chopped_collapsed, name, sort = TRUE)

knitr::kable(chopped_counts)

name	n
Alex Guarnaschelli	208
Amanda Freitag	206
Scott Conant	194
Marc Murphy	191
Geoffrey Zakarian	183
Aarón Sanchez	140
Maneet Chauhan	128
Chris Santos	120
Marcus Samuelsson	78
Chris Santo	65
Martha Stewart	16
Angie Mar	7
Elizabeth Karmel	6
Tiffani Faison	6
Alex Stupak	5
Alton Brown	5
Christian Petroni	5
Seamus Mullen	4
Zakary Pelaccio	4
Bobby Flay	3
Christina Tosi	3
Eddie Jackson	3
Giorgio Rapicavoli	3
Jody Williams	3
Jonathon Sawyer	3
Jordan Andino	3
Sue Torres	3
Zac Youn	3
NA	3
Adam Sobel	2
Anne Burrell	2
Dale Tal	2
David Gua	2
Edi Frauneder	2
Geoffrey Zacharian	2
Jeff Mauro	2
John Li	2
Lee Anne Wong	2
Michelle Bernstein	2
Roger Mookin	2
Sam Ka	2
Adam Moskowitz	1
Ali Khan	1
Amanda Frietag	1
Andrew Zimmern	1
Ayesha Nurdjaja	1
Barry Williams	1
Bruno DiFabio	1
Cheryl Barbara	1
Chris Cheun	1
Christopher Knight	1
Claudia Flemin	1
Craig Samuel	1
David Burtka	1
David Loewenber	1
Eduardo Garcia	1
Edward L	1
Einat Admony	1
Elizabeth Heiskell	1
Erik Ramirez	1
Esther Choi	1
Evan Funk	1
Eve Plumb	1
Florian Bellanger	1
George Men	1
Greg Koch	1
Hans Röckenwagner	1
Hooni Kim	1
Hugh Acheson	1
James Briscion	1
James Tahhan	1
Jamie Bissonnett	1
Janine Booth	1
Jason Kieffer	1
Jet Tila	1
Jody William	1
John Suley	1
Johnny Iuzzini	1
Jose Garces	1
Joseph “JJ” Johnson	1
Joseph Brown	1
Josh Capon	1
Kari Underly	1
Katrina Markoff	1
Ken Oringer	1
Kimbal Musk	1
Laura Vital	1
Lauren Gerri	1
Leah Cohen	1
Liz Thorp	1
Maneet Chauhaun	1
Marc Forgion	1
Marco Canora	1
Mark Bittman	1
Maureen McCormick	1
Michael Chernow	1
Michael Psilaki	1
Mike Lookinlan	1
Missy Robbin	1
Missy Robbins	1
Moe Cason	1
Nancy Silverton	1
Natasha Case	1
Nick Anderer	1
Peter Oleyer	1
Ray Garcia	1
Ray Lamp	1
Rocco DiSpirito	1
Ron Ben-Israel	1
Rose McGowan	1
Silvena Row	1
Spike Mendelsohn	1
Stephanie Izar	1
Stuart O’Keeffe	1
Susan Feniger	1
Susan Olsen	1
Sylvia Weinstock	1
Thiago Silva	1
Tino Feliciano	1
Tyler Malek	1
Ulrich Koberstein	1
Valerie Bertinelli	1
Wylie Dufresn	1

Boxplot of Judge versus Ratings

# Judges with at least five appearances; their totals are printed on the plot
chopped_counts_high <- filter(chopped_counts, n > 4)

frequent_judges <- chopped_counts_high[["name"]]

# Box plot of episode ratings per frequent judge, ordered by median rating.
# Episodes without a rating are dropped before the medians are computed.
chopped_collapsed %>%
  filter(name %in% frequent_judges) %>%
  tidyr::drop_na(episode_rating) %>%
  mutate(name = fct_reorder(name, episode_rating, .fun = median)) %>%
  ggplot(aes(x = name, y = episode_rating, fill = name)) +
  geom_boxplot() +
  # Appearance count for each judge, placed at rating = 10 beside the boxes
  geom_text(data = chopped_counts_high, aes(x = name, y = 10, label = n)) +
  coord_flip() +
  ylim(5.5, 10.5) +
  labs(
    title = "Chopped Judges by Median Rating",
    subtitle = "Alton and Tiffani FTW"
  ) +
  theme(legend.position = "none")

Alex’s Progression

Alex Guarnaschelli is the most frequent judge. Here’s her overall progression over episodes.

# Ratings of the episodes judged by Alex Guarnaschelli, in series order,
# coloured by season and overlaid with a smoothed trend line
chopped_collapsed %>%
  filter(name == "Alex Guarnaschelli") %>%
  ggplot(aes(x = series_episode, y = episode_rating, color = season)) +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## Warning: Removed 33 rows containing non-finite values (stat_smooth).

## Warning: Removed 33 rows containing missing values (geom_point).

Judges versus ratings

Doing something similar for those judges who have 16 or more appearances:

# Judges with 16 or more appearances
frequent_judges <- filter(chopped_counts, n >= 16)[["name"]]

# One panel per frequent judge: episode rating against series episode number,
# with a per-judge smoothed trend. Ratings outside [6, 9.5] are removed by
# ylim() along with the missing ratings.
chopped_collapsed %>%
  filter(name %in% frequent_judges) %>%
  ggplot(aes(x = series_episode, y = episode_rating)) +
  geom_point() +
  geom_smooth(aes(color = name)) +
  facet_wrap(~name) +
  ylim(6, 9.5) +
  theme(legend.position = "none")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## Warning: Removed 265 rows containing non-finite values (stat_smooth).

## Warning: Removed 265 rows containing missing values (geom_point).

Chopped Episodes

Ted Laderas

2020-08-25