library(dplyr)
library(forcats)
library(ggplot2)
library(tidytuesdayR)
# List the available TidyTuesday datasets and their release dates.
# This will open up in the help window.
tidytuesdayR::tt_available()
Load your dataset with the function below. The input is the date the dataset was issued; you should be able to get this from the `tt_available()` function.
# Download the TidyTuesday release issued on 2020-08-25.
# Incoming data comes in as a list, with one named element per downloaded file.
datasets <- tidytuesdayR::tt_load("2020-08-25")
## --- Compiling #TidyTuesday Information for 2020-08-25 ----
## --- There is 1 file available ---
## --- Starting Download ---
##
## Downloading file 1 of 1: `chopped.tsv`
## --- Download complete ---
# Show the names of the individual datasets contained in the download
names(datasets)
## [1] "chopped"
# Extract the single `chopped` table, then inspect it: an overview plot of the
# data frame via visdat, followed by per-column summaries via skimr
chopped <- datasets[["chopped"]]
visdat::vis_dat(chopped)
skimr::skim(chopped)
Name | chopped |
Number of rows | 569 |
Number of columns | 21 |
_______________________ | |
Column type frequency: | |
character | 17 |
numeric | 4 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
episode_name | 0 | 1.00 | 6 | 77 | 0 | 568 | 0 |
episode_notes | 113 | 0.80 | 12 | 830 | 0 | 455 | 0 |
air_date | 0 | 1.00 | 11 | 18 | 0 | 564 | 0 |
judge1 | 1 | 1.00 | 9 | 18 | 0 | 37 | 0 |
judge2 | 1 | 1.00 | 9 | 18 | 0 | 27 | 0 |
judge3 | 1 | 1.00 | 6 | 19 | 0 | 95 | 0 |
appetizer | 1 | 1.00 | 18 | 135 | 0 | 568 | 0 |
entree | 1 | 1.00 | 34 | 126 | 0 | 568 | 0 |
dessert | 1 | 1.00 | 28 | 117 | 0 | 568 | 0 |
contestant1 | 1 | 1.00 | 6 | 45 | 0 | 566 | 0 |
contestant1_info | 13 | 0.98 | 2 | 114 | 0 | 502 | 0 |
contestant2 | 1 | 1.00 | 7 | 48 | 0 | 561 | 0 |
contestant2_info | 14 | 0.98 | 2 | 119 | 0 | 510 | 0 |
contestant3 | 1 | 1.00 | 6 | 53 | 0 | 556 | 0 |
contestant3_info | 14 | 0.98 | 2 | 120 | 0 | 505 | 0 |
contestant4 | 1 | 1.00 | 8 | 45 | 0 | 507 | 0 |
contestant4_info | 14 | 0.98 | 2 | 140 | 0 | 494 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
season | 0 | 1.00 | 23.12 | 12.96 | 1.0 | 12.0 | 23.0 | 35.0 | 45.0 | ▇▇▇▇▇ |
season_episode | 0 | 1.00 | 7.02 | 3.98 | 1.0 | 4.0 | 7.0 | 10.0 | 20.0 | ▇▇▇▂▁ |
series_episode | 0 | 1.00 | 284.85 | 164.16 | 1.0 | 143.0 | 285.0 | 427.0 | 567.0 | ▇▇▇▇▇ |
episode_rating | 105 | 0.82 | 8.38 | 0.48 | 5.5 | 8.2 | 8.5 | 8.7 | 9.2 | ▁▁▂▅▇ |
Given your initial exploration of the data, what was the question you wanted to answer?
Which Judge is associated with the highest ratings?
Need to recode some names here. Also collapse `judge1`, `judge2` and `judge3` into a single column using `pivot_longer()`.
# Reshape to one row per episode and judge seat: `judge` holds the original
# column name (judge1/judge2/judge3) and `name` holds the judge's name.
# Then fold misspelled or truncated variants into a single spelling so that each
# judge is counted once. fct_recode() takes `new = "old"` pairs (several old
# values may map to the same new one) and turns `name` into a factor.
chopped_collapsed <- chopped %>%
  tidyr::pivot_longer(
    cols = contains("judge"),
    names_to = "judge",
    values_to = "name"
  ) %>%
  mutate(
    name = fct_recode(
      name,
      `Aarón Sanchez` = "Aarón Sánchez",
      `Amanda Freitag` = "Amanda Freita",
      `Amanda Freitag` = "Amanda Frietag",
      `Chris Santos` = "Chris Santo",
      `Geoffrey Zakarian` = "Geoffrey Zacharian",
      `Jody Williams` = "Jody William",
      `Maneet Chauhan` = "Maneet Chauhaun",
      `Missy Robbins` = "Missy Robbin"
    )
  )
Who is the most frequent judge on Chopped?
# Tally appearances per judge name across all three judge seats, most frequent
# first, and render the tally as a table
chopped_counts <- count(chopped_collapsed, name, sort = TRUE)
knitr::kable(chopped_counts)
name | n |
---|---|
Alex Guarnaschelli | 208 |
Amanda Freitag | 206 |
Scott Conant | 194 |
Marc Murphy | 191 |
Geoffrey Zakarian | 183 |
Aarón Sanchez | 140 |
Maneet Chauhan | 128 |
Chris Santos | 120 |
Marcus Samuelsson | 78 |
Chris Santo | 65 |
Martha Stewart | 16 |
Angie Mar | 7 |
Elizabeth Karmel | 6 |
Tiffani Faison | 6 |
Alex Stupak | 5 |
Alton Brown | 5 |
Christian Petroni | 5 |
Seamus Mullen | 4 |
Zakary Pelaccio | 4 |
Bobby Flay | 3 |
Christina Tosi | 3 |
Eddie Jackson | 3 |
Giorgio Rapicavoli | 3 |
Jody Williams | 3 |
Jonathon Sawyer | 3 |
Jordan Andino | 3 |
Sue Torres | 3 |
Zac Youn | 3 |
NA | 3 |
Adam Sobel | 2 |
Anne Burrell | 2 |
Dale Tal | 2 |
David Gua | 2 |
Edi Frauneder | 2 |
Geoffrey Zacharian | 2 |
Jeff Mauro | 2 |
John Li | 2 |
Lee Anne Wong | 2 |
Michelle Bernstein | 2 |
Roger Mookin | 2 |
Sam Ka | 2 |
Adam Moskowitz | 1 |
Ali Khan | 1 |
Amanda Frietag | 1 |
Andrew Zimmern | 1 |
Ayesha Nurdjaja | 1 |
Barry Williams | 1 |
Bruno DiFabio | 1 |
Cheryl Barbara | 1 |
Chris Cheun | 1 |
Christopher Knight | 1 |
Claudia Flemin | 1 |
Craig Samuel | 1 |
David Burtka | 1 |
David Loewenber | 1 |
Eduardo Garcia | 1 |
Edward L | 1 |
Einat Admony | 1 |
Elizabeth Heiskell | 1 |
Erik Ramirez | 1 |
Esther Choi | 1 |
Evan Funk | 1 |
Eve Plumb | 1 |
Florian Bellanger | 1 |
George Men | 1 |
Greg Koch | 1 |
Hans Röckenwagner | 1 |
Hooni Kim | 1 |
Hugh Acheson | 1 |
James Briscion | 1 |
James Tahhan | 1 |
Jamie Bissonnett | 1 |
Janine Booth | 1 |
Jason Kieffer | 1 |
Jet Tila | 1 |
Jody William | 1 |
John Suley | 1 |
Johnny Iuzzini | 1 |
Jose Garces | 1 |
Joseph “JJ” Johnson | 1 |
Joseph Brown | 1 |
Josh Capon | 1 |
Kari Underly | 1 |
Katrina Markoff | 1 |
Ken Oringer | 1 |
Kimbal Musk | 1 |
Laura Vital | 1 |
Lauren Gerri | 1 |
Leah Cohen | 1 |
Liz Thorp | 1 |
Maneet Chauhaun | 1 |
Marc Forgion | 1 |
Marco Canora | 1 |
Mark Bittman | 1 |
Maureen McCormick | 1 |
Michael Chernow | 1 |
Michael Psilaki | 1 |
Mike Lookinlan | 1 |
Missy Robbin | 1 |
Missy Robbins | 1 |
Moe Cason | 1 |
Nancy Silverton | 1 |
Natasha Case | 1 |
Nick Anderer | 1 |
Peter Oleyer | 1 |
Ray Garcia | 1 |
Ray Lamp | 1 |
Rocco DiSpirito | 1 |
Ron Ben-Israel | 1 |
Rose McGowan | 1 |
Silvena Row | 1 |
Spike Mendelsohn | 1 |
Stephanie Izar | 1 |
Stuart O’Keeffe | 1 |
Susan Feniger | 1 |
Susan Olsen | 1 |
Sylvia Weinstock | 1 |
Thiago Silva | 1 |
Tino Feliciano | 1 |
Tyler Malek | 1 |
Ulrich Koberstein | 1 |
Valerie Bertinelli | 1 |
Wylie Dufresn | 1 |
# Restrict to judges with five or more appearances
chopped_counts_high <- filter(chopped_counts, n >= 5)
frequent_judges <- chopped_counts_high[["name"]]

# Boxplot of episode ratings per frequent judge, ordered by median rating.
# Episodes without a rating are dropped first; the number printed at y = 10 is
# each judge's appearance count taken from chopped_counts_high.
chopped_collapsed %>%
  filter(name %in% frequent_judges) %>%
  tidyr::drop_na(episode_rating) %>%
  mutate(name = fct_reorder(name, episode_rating, .fun = median)) %>%
  ggplot(aes(x = name, y = episode_rating, fill = name)) +
  geom_boxplot() +
  geom_text(data = chopped_counts_high, aes(x = name, y = 10, label = n)) +
  ylim(5.5, 10.5) +
  coord_flip() +
  theme(legend.position = "none") +
  labs(
    title = "Chopped Judges by Median Rating",
    subtitle = "Alton and Tiffani FTW"
  )
Alex Guarnaschelli is the most frequent judge. Here’s her overall progression over episodes.
# Episode rating against series episode number for Alex Guarnaschelli's
# appearances, coloured by season, with a smoothed trend on top
chopped_collapsed %>%
  filter(name == "Alex Guarnaschelli") %>%
  ggplot(aes(x = series_episode, y = episode_rating, colour = season)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 33 rows containing non-finite values (stat_smooth).
## Warning: Removed 33 rows containing missing values (geom_point).
Doing something similar for those judges who have 16 or more appearances:
# Judges with 16 or more appearances
frequent_judges <- pull(filter(chopped_counts, n >= 16), name)

# One panel per frequent judge: rating against series episode number with a
# per-judge smoothed trend. ylim() removes observations outside 6-9.5 before
# smoothing, so those rows (and missing ratings) are reported as removed.
chopped_collapsed %>%
  filter(name %in% frequent_judges) %>%
  ggplot(aes(x = series_episode, y = episode_rating)) +
  geom_point() +
  geom_smooth(aes(colour = name)) +
  facet_wrap(vars(name)) +
  ylim(6, 9.5) +
  theme(legend.position = "none")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 265 rows containing non-finite values (stat_smooth).
## Warning: Removed 265 rows containing missing values (geom_point).