## Data wrangling
library(tidyr)
library(dplyr)
library(readr)
library(lay)
library(janitor)
library(stringr)
## Dealing with country codes
library(countrycode)
## Plotting
library(ggplot2)
library(shadowtext)
library(gt)
library(scales)
Introduction: Is there an easy answer?
These type of record breaking headlines are a mainstay of Olympic coverage, but does the medal tally actually give us a true picture of a nation’s Olympic achievements? Is this data suitable to compare countries over time? Although I don’t have the physical prowess to actually participate in the Olympics, maybe I can still analyse it. Let’s dig into the Olympic data bank and search for answers…
The data
For our source of Olympic knowledge we use the Kaggle dataset “120 years of Olympic history: athletes and results” scraped by rgriffin from http://www.sports-reference.com/. This data provides us with a wealth of knowledge on the athletes participating in the Olympics from Athens 1896 until Rio 2016.
The basics: Clean the data and add some context
Let’s load all the required packages. For data wrangling we have a bunch of packages from the tidyverse
: tidyr
, stringr
, readr
, and dplyr
. We also bring in the packages lay
, and janitor
which help with wrangling. Next we bring in countrycode
, which allows us to more easily switch between IOC country codes and the ISO3c standard (yes they are different!). Finally, for data visualisation, we have ggplot2
, shadowtext
, scales
and gt
(for tables).
That’s quite a few packages and I’ll try to make it clear where each package is needed in the code.
Let’s get started on our Olympic investigation. The first thing to note is that we have data on all athletes, even those that did not win a medal. We also have data on each individual athlete within a team sport (e.g. basketball). This is super interesting data, but if we want to know the best performing countries, we’ll need to adapt this data a bit to show national medal tallies.
## Load data
<- readr::read_csv("./data/athlete_events.csv", show_col_type = FALSE)
athletes
athletes
# A tibble: 271,116 × 15
ID Name Sex Age Height Weight Team NOC Games Year Season City
<dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
1 1 A Dijia… M 24 180 80 China CHN 1992… 1992 Summer Barc…
2 2 A Lamusi M 23 170 60 China CHN 2012… 2012 Summer Lond…
3 3 Gunnar … M 24 NA NA Denm… DEN 1920… 1920 Summer Antw…
4 4 Edgar L… M 34 NA NA Denm… DEN 1900… 1900 Summer Paris
5 5 Christi… F 21 185 82 Neth… NED 1988… 1988 Winter Calg…
6 5 Christi… F 21 185 82 Neth… NED 1988… 1988 Winter Calg…
7 5 Christi… F 25 185 82 Neth… NED 1992… 1992 Winter Albe…
8 5 Christi… F 25 185 82 Neth… NED 1992… 1992 Winter Albe…
9 5 Christi… F 27 185 82 Neth… NED 1994… 1994 Winter Lill…
10 5 Christi… F 27 185 82 Neth… NED 1994… 1994 Winter Lill…
# ℹ 271,106 more rows
# ℹ 3 more variables: Sport <chr>, Event <chr>, Medal <chr>
For simplicity, let’s just focus on summer Olympics and stick with post-War Olympics (1948 onwards). In our first step of data wrangling, we adjust the data so that we count the number of each medal type for each country at each games. At this stage, data are still in long format.
Because most other datasets will use ISO3c codes as country identifiers, I already substitute IOC codes to ISO3c at this early stage.
Show the code
<- athletes |>
athletes_clean ## Focus only on Summer Olympics since London (1948)
filter(Season == "Summer" & Year >= 1948) |>
## For simplicity, let's classify NA as medal "None"
## Keeping None ensures we don't exclude countries that participated in the Olympics
## but didn't win any medals
mutate(Medal = replace_na(Medal, "None"),
## NOTE: There are some country names that have "-1" in the name
## We fix these typos
Team = stringr::str_remove_all(Team, pattern = "-[0-9]*"),
## Some other known typos
Team = case_when(Team == "Nadine" ~ "Netherlands",
== "Don Schufro" ~ "Denmark",
Team == "Rush VII" ~ "Sweden",
Team %in% c("Gem", "Gem IV", "Yeoman") ~ "Bahamas",
Team == "Tango" ~ "Argentina",
Team == "Nirefs" ~ "Greece",
Team %in% c("Pan", "Sirene", "Encore") ~ "Norway",
Team %in% c("Symphony", "Espadarte", "Ma'Lindo") ~ "Portugal",
Team == "June Climene" ~ "Singapore",
Team == "Kurush II" ~ "Cuba",
Team %in% c("Pam", "Bermudes") ~ "Bermuda",
Team == "Hirondelle" ~ "Monaco",
Team TRUE ~ Team)) |>
## We're much more familiar with ISO codes (and need them to join other data below)
## So we'll take this opportunity to switch to ISO3c codes
## Convert IOC country codes to ISO3C
mutate(
## In most cases, we can convert straight from IOC codes to ISO3c
ISO3c_NOC = countrycode::countrycode(NOC, origin = "ioc", destination = "iso3c"),
## In some cases, the IOC codes are not recognised so we try using country names
ISO3c_name = countrycode(Team, origin = "country.name", destination = "iso3c"),
## There are a few cases where the countrycode package struggles and we add codes manually
ISO3c = case_when(
## countrycode treats Soviet Union and Russia as the same
## We want to keep these separate
== "URS" ~ "SUN",
NOC ## Use direct translation from IOC codes where available
!is.na(ISO3c_NOC) ~ ISO3c_NOC,
## Otherwise, use estimates based on the English name (less reliable)
is.na(ISO3c_NOC) & !is.na(ISO3c_name) ~ ISO3c_name,
## In a few specific cases (Cold War era states), we manually specify the country code
grepl(x = Team, pattern = "Czechoslovakia") ~ "CSK",
grepl(x = Team, pattern = "East Germany") ~ "DDR",
## Rhodesia converted to Zimbabwe
== "RHO" ~ "ZWE",
NOC grepl(x = Team, pattern = "Netherlands Antilles") ~ "ANT",
grepl(x = Team, pattern = "Serbia and Montenegro") ~ "SCG",
grepl(x = Team, pattern = "Kosovo") ~ "XKX", ## Based on WorldBank
## If we can't find a ISO3c match we revert back to the IOC codes
## This includes athletes competing as refugees or individuals
TRUE ~ NOC)) |>
## When medals are won by a *team* in an event we only want to count the medal once
group_by(Year, Season, ISO3c, Event, Medal) |>
slice(1) |>
ungroup()
<- athletes_clean |>
medals_long ## Count the number of each medal won by each country at each olympics
group_by(Year, Season, ISO3c, Medal) |>
summarise(n = n(),
country = first(Team),
.groups = "drop")
medals_long
# A tibble: 4,691 × 6
Year Season ISO3c Medal n country
<dbl> <chr> <chr> <chr> <int> <chr>
1 1948 Summer AFG None 2 Afghanistan
2 1948 Summer ARG Bronze 1 Argentina
3 1948 Summer ARG Gold 3 Argentina
4 1948 Summer ARG None 96 Argentina
5 1948 Summer ARG Silver 3 Argentina
6 1948 Summer AUS Bronze 5 Australia
7 1948 Summer AUS Gold 2 Australia
8 1948 Summer AUS None 43 Australia
9 1948 Summer AUS Silver 6 Australia
10 1948 Summer AUT Bronze 4 Austria
# ℹ 4,681 more rows
This is a good start, but we would be better served by data in wide format with one column for each medal type. This is how we’re more accustomed to seeing Olympic data.
Show the code
<- medals_long |>
medals_wide ## Pivot wider to have column for number of medals won for each type
::pivot_wider(names_from = Medal, values_from = n) |>
tidyr## Some countries won't have any data for a medal type (i.e. they won none).
## Replace these NAs with 0
mutate(across(c(None, Gold, Silver, Bronze), \(x) replace_na(x, 0))) |>
## Order columns more logically
select(Year, Season, ISO3c, country, Gold, Silver, Bronze) |>
## Arrange to show best performing country at each games using traditional ranking
arrange(desc(Year), Season, desc(Gold), desc(Silver), desc(Bronze)) |>
## Count the total number of medals
mutate(total_medals = lay(pick(c(Gold, Silver, Bronze)), \(x) sum(x)))
medals_wide
# A tibble: 2,463 × 8
Year Season ISO3c country Gold Silver Bronze total_medals
<dbl> <chr> <chr> <chr> <int> <int> <int> <int>
1 2016 Summer USA United States 46 37 38 121
2 2016 Summer GBR Great Britain 27 23 17 67
3 2016 Summer CHN China 26 18 26 70
4 2016 Summer RUS Russia 19 17 20 56
5 2016 Summer DEU Germany 17 10 15 42
6 2016 Summer JPN Japan 12 8 21 41
7 2016 Summer FRA France 10 18 14 42
8 2016 Summer KOR South Korea 9 3 9 21
9 2016 Summer ITA Italy 8 12 8 28
10 2016 Summer AUS Australia 8 11 10 29
# ℹ 2,453 more rows
With this data we can now look at the best Olympic performances since 1948. The table below is generated by gt
. See my previous blog post for more detail to understand how such tables are created.
Find out the interesting story of the Unified Team at the 1992 Olympics here
Show the code
<- readr::read_csv("data/Country_Flags.csv", show_col_type = FALSE) %>%
flag_db #Convert country names into 3-letter country codes
mutate(Code_raw = countrycode(sourcevar = Country, origin = "country.name", destination = "iso3c", warn = FALSE),
## There are a few cases that need to be specified manually
Code = case_when(Country == "Soviet Union" ~ "SUN",
== "East Germany" ~ "DDR",
Country == "Yugoslavia" ~ "YUG",
Country == "Olympics" ~ "EUN",
Country TRUE ~ Code_raw)) %>%
select(Code, flag_URL = ImageURL)
## Data to convert Years into a convenient games name (e.g. Rio 2016)
<- readr::read_csv("data/games_names.csv", show_col_type = FALSE)
games_names
## Find the top 10 medal tallies of all time
<- medals_wide |>
plot_data arrange(desc(Gold), desc(Silver), desc(Bronze)) |>
slice(1:10) |>
mutate(rank = 1:n()) |>
## Join in info on the name of each game and the flag image URLS
left_join(games_names, by = "Year") |>
left_join(flag_db, by = c("ISO3c" = "Code")) |>
select(rank, Game, flag_URL, country, Gold, Silver, Bronze) |>
mutate(total_medals = lay(pick(c(Gold, Silver, Bronze)), sum))
## Create dynamic palettes to colour Gold, Silver, and Bronze columns
<- col_numeric(c("grey99", "#fcc861"), domain = c(0, max(plot_data$Gold)), alpha = 0.75)
perc_palette_Gold <- col_numeric(c("grey99", "#e5e5e5"), domain = c(0, max(plot_data$Silver)), alpha = 0.75)
perc_palette_Silver <- col_numeric(c("grey99", "#dcb386"), domain = c(0, max(plot_data$Bronze)), alpha = 0.75)
perc_palette_Bronze
## Create the gt table
|>
plot_data gt() |>
cols_label(rank = "",
Game = "Games",
country = "Country",
Gold = "Gold",
Silver = "Silver",
Bronze = "Bronze",
total_medals = "Total") %>%
tab_header(title = md("Olympic medal tally"),
subtitle = "Best ever national performances (1948 - 2016)") %>%
tab_source_note(source_note = "Data: www.sports-reference.com") %>%
tab_style(
locations = cells_column_labels(columns = everything()),
style = list(
cell_borders(sides = "bottom", weight = px(3)),
cell_text(weight = "bold")
)%>%
) tab_style(
locations = cells_title(groups = "title"),
style = list(
cell_text(weight = "bold", size = 24)
)%>%
) data_color(columns = c(Gold),
fn = perc_palette_Gold) %>%
data_color(columns = c(Silver),
fn = perc_palette_Silver) %>%
data_color(columns = c(Bronze),
fn = perc_palette_Bronze) %>%
opt_all_caps() %>%
opt_table_font(
font = list(
google_font("Chivo"),
default_fonts()
)%>%
) cols_width(rank ~ px(40),
c(country) ~ px(150),
c(Gold,
Silver,~ px(100)) %>%
Bronze) tab_options(
column_labels.border.top.width = px(3),
column_labels.border.top.color = "transparent",
table.border.top.color = "transparent",
table.border.bottom.color = "transparent",
data_row.padding = px(3),
source_notes.font.size = 12,
heading.align = "left") |>
::text_transform(
gt#Apply a function to a column
locations = cells_body(c(flag_URL)),
fn = function(x) {
#Return an image of set dimensions
web_image(
url = x,
height = 12
)
}%>%
) #Hide column header flag_URL and reduce width
cols_width(c(flag_URL) ~ px(30)) %>%
cols_label(flag_URL = "")
Olympic medal tally | |||||||
Best ever national performances (1948 - 2016) | |||||||
Games | Country | Gold | Silver | Bronze | Total | ||
---|---|---|---|---|---|---|---|
1 | Los Angeles 1984 | United States | 82 | 61 | 30 | 173 | |
2 | Moscow 1980 | Soviet Union | 80 | 69 | 46 | 195 | |
3 | Seoul 1988 | Soviet Union | 54 | 31 | 46 | 131 | |
4 | Beijing 2008 | China | 51 | 21 | 28 | 100 | |
5 | Munich 1972 | Soviet Union | 50 | 27 | 22 | 99 | |
6 | Montreal 1976 | Soviet Union | 49 | 41 | 35 | 125 | |
7 | Moscow 1980 | East Germany | 47 | 37 | 42 | 126 | |
8 | Rio 2016 | United States | 46 | 37 | 38 | 121 | |
9 | London 2012 | United States | 46 | 28 | 29 | 103 | |
10 | Barcelona 1992 | Unified Team | 45 | 38 | 29 | 112 | |
Data: www.sports-reference.com |
Problem 1: The Problem of (medal) Inflation
At first the topic of Olympic success seems pretty straight forward. More medals equals more success. But if we want to compare medal tallies between years we necessarily expect that the number of possible medals stays the same. Otherwise a country might do ‘better’ just because they participated in a particularly lucrative Olympics. Does this assumption of a fixed medal pool hold up?
Let’s go back to our data and look at the total number of Gold, Silver, and Bronze medals awarded each year.
<- medals_wide |>
number_medals group_by(Year) |>
summarise(across(c(Gold, Silver, Bronze), sum)) |>
## Convert to long format for plotting
::pivot_longer(Gold:Bronze, names_to = "Medal", values_to = "n_medals") tidyr
Show the code
<- number_medals |>
plot_data mutate(Medal = factor(Medal, levels = c("Gold", "Silver", "Bronze")))
<- plot_data |>
text_data group_by(Year) |>
summarise(total_medals = sum(n_medals))
ggplot() +
geom_bar(data = plot_data,
aes(x = as.factor(Year), y = n_medals, fill = Medal),
position = "stack", stat = "identity", colour = "grey20") +
geom_text(data = text_data,
aes(x = as.factor(Year), y = total_medals,
label = Year), angle = 90,
hjust = 1.3, fontface = "bold", colour = "grey10",
size = 4, family = "Chivo") +
::geom_shadowtext(data = plot_data |>
shadowtextslice(1:3) |>
arrange(desc(Medal)) |>
mutate(y = cumsum(n_medals)),
aes(x = as.factor(Year),
y = y, label = Medal,
colour = Medal),
bg.colour="grey10",
size = 4,
angle = 90, hjust = 1, vjust = -1.75,
fontface = "bold", family = "Chivo") +
labs(title = "The Problem of (medal) Inflation",
subtitle = "Number of medals available at the Olympics has more than doubles since 1948") +
scale_fill_manual(values = c("#fcc861", "#e5e5e5", "#dcb386")) +
scale_colour_manual(values = c("#fcc861", "#e5e5e5", "#dcb386")) +
scale_y_continuous(expand = c(0, 0, 0, 0),
name = "Number of medals", limits = c(NA, 1000)) +
scale_x_discrete(expand = c(0.075, 0.075, 0.075, 0.075)) +
theme_classic() +
theme(legend.position = "none",
axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_text(size = 15, colour = "grey10",
face = "bold", family = "Chivo"),
axis.text.y = element_text(size = 10, colour = "grey10",
face = "bold", family = "Chivo"),
plot.margin = margin(t = 10, b = 10, l = 15, r = 15),
plot.title = element_text(size = 20, face = "bold", family = "Chivo"),
plot.subtitle = element_text(size = 15, family = "Chivo"),
plot.background = element_rect(colour = "black"))
There were a total of 973 medals available in Rio 2016 (the most recent Olympics available in the data), which is 132 extra medals compared to Atlanta 1996 (841 medals) and more than double that available at London 1948 (439 medals).
Simply looking at medal counts is therefore misleading. Instead, let’s consider the percentage of medals claimed at a given Olympics.
Show the code
<- medals_long |>
medals_wide_prop ## Join in total number of medals
left_join(number_medals, by = c("Year", "Medal")) |>
## Determine % of each medal type that a country won at those games
## We keep 'None' but don't calculate a proportion
## None is kept to keep countries that didn't manage to win a medal
mutate(perc_medal = case_when(Medal != "None" ~ (n/n_medals)*100,
TRUE ~ NA)) |>
select(Year, Season, ISO3c, country, Medal, perc_medal) |>
## Pivot wider to have column for number of medals won for each type
::pivot_wider(names_from = Medal, values_from = perc_medal, names_prefix = "perc_") |>
tidyr## Some countries won't have any data for a medal type. Replace this with 0
mutate(across(c(perc_None, perc_Bronze, perc_Silver, perc_Gold), \(x) replace_na(x, 0))) |>
## Order columns more logically
select(Year, Season, ISO3c, country, perc_Gold, perc_Silver, perc_Bronze) |>
## Arrange to show best performing country at each games using percentage ranking
arrange(desc(Year), Season, desc(perc_Gold), desc(perc_Silver), desc(perc_Bronze))
## Also join in data on % on total medals won
<- medals_wide |>
total_prop left_join(number_medals |>
group_by(Year) |>
summarise(total = sum(n_medals)), by = "Year") |>
mutate(perc_Total = (total_medals/total)*100) |>
select(Year, ISO3c, perc_Total)
<- medals_wide_prop |>
medals_wide_prop left_join(total_prop, by = c("Year", "ISO3c"))
medals_wide_prop
# A tibble: 2,463 × 8
Year Season ISO3c country perc_Gold perc_Silver perc_Bronze perc_Total
<dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 2016 Summer USA United States 15.0 12.1 10.6 12.4
2 2016 Summer GBR Great Britain 8.79 7.52 4.72 6.89
3 2016 Summer CHN China 8.47 5.88 7.22 7.19
4 2016 Summer RUS Russia 6.19 5.56 5.56 5.76
5 2016 Summer DEU Germany 5.54 3.27 4.17 4.32
6 2016 Summer JPN Japan 3.91 2.61 5.83 4.21
7 2016 Summer FRA France 3.26 5.88 3.89 4.32
8 2016 Summer KOR South Korea 2.93 0.980 2.5 2.16
9 2016 Summer ITA Italy 2.61 3.92 2.22 2.88
10 2016 Summer AUS Australia 2.61 3.59 2.78 2.98
# ℹ 2,453 more rows
We can now adapt our previous table to look at the 10 greatest national performances based on medal percentage.
Show the code
<- readr::read_csv("data/Country_Flags.csv", show_col_type = FALSE) %>%
flag_db #Convert country names into 3-letter country codes
mutate(Code_raw = countrycode(sourcevar = Country, origin = "country.name", destination = "iso3c", warn = FALSE),
## There are a few cases that need to be specified manually
Code = case_when(Country == "Soviet Union" ~ "SUN",
== "East Germany" ~ "DDR",
Country == "Yugoslavia" ~ "YUG",
Country TRUE ~ Code_raw)) %>%
select(Code, flag_URL = ImageURL)
<- readr::read_csv("data/games_names.csv", show_col_type = FALSE)
games_names
<- medals_wide_prop |>
plot_data arrange(desc(perc_Gold), desc(perc_Silver), desc(perc_Bronze)) |>
slice(1:10) |>
mutate(rank = 1:n()) |>
left_join(games_names, by = "Year") |>
left_join(flag_db, by = c("ISO3c" = "Code")) |>
select(rank, Game, flag_URL, country, perc_Gold, perc_Silver, perc_Bronze, perc_Total)
<- col_numeric(c("grey99", "#fcc861"), domain = c(0, max(plot_data$perc_Gold)), alpha = 0.75)
perc_palette_Gold <- col_numeric(c("grey99", "#e5e5e5"), domain = c(0, max(plot_data$perc_Silver)), alpha = 0.75)
perc_palette_Silver <- col_numeric(c("grey99", "#dcb386"), domain = c(0, max(plot_data$perc_Bronze)), alpha = 0.75)
perc_palette_Bronze
|>
plot_data gt() |>
cols_label(rank = "",
Game = "Games",
country = "Country",
perc_Gold = "Gold (%)",
perc_Silver = "Silver (%)",
perc_Bronze = "Bronze (%)",
perc_Total = "Total (%)") %>%
tab_header(title = md("Olympic medal tally (%)"),
subtitle = "Best ever national performances (1948 - 2016)") %>%
tab_source_note(source_note = "Data: www.sports-reference.com") %>%
tab_style(
locations = cells_column_labels(columns = everything()),
style = list(
cell_borders(sides = "bottom", weight = px(3)),
cell_text(weight = "bold")
)%>%
) tab_style(
locations = cells_title(groups = "title"),
style = list(
cell_text(weight = "bold", size = 24)
)%>%
) data_color(columns = c(perc_Gold),
fn = perc_palette_Gold) %>%
data_color(columns = c(perc_Silver),
fn = perc_palette_Silver) %>%
data_color(columns = c(perc_Bronze),
fn = perc_palette_Bronze) %>%
fmt_number(columns = c(perc_Gold, perc_Silver, perc_Bronze, perc_Total),
decimals = 2) %>%
opt_all_caps() %>%
opt_table_font(
font = list(
google_font("Chivo"),
default_fonts()
)%>%
) cols_width(rank ~ px(40),
~ px(150),
Game c(country) ~ px(125),
c(perc_Gold,
perc_Silver,
perc_Bronze,~ px(85)) %>%
perc_Total) tab_options(
column_labels.border.top.width = px(3),
column_labels.border.top.color = "transparent",
table.border.top.color = "transparent",
table.border.bottom.color = "transparent",
data_row.padding = px(3),
source_notes.font.size = 12,
heading.align = "left") |>
::text_transform(
gt#Apply a function to a column
locations = cells_body(c(flag_URL)),
fn = function(x) {
#Return an image of set dimensions
web_image(
url = x,
height = 12
)
}%>%
) #Hide column header flag_URL and reduce width
cols_width(c(flag_URL) ~ px(30)) %>%
cols_label(flag_URL = "")
Olympic medal tally (%) | |||||||
Best ever national performances (1948 - 2016) | |||||||
Games | Country | Gold (%) | Silver (%) | Bronze (%) | Total (%) | ||
---|---|---|---|---|---|---|---|
1 | Moscow 1980 | Soviet Union | 39.22 | 33.82 | 20.63 | 30.90 | |
2 | Los Angeles 1984 | United States | 36.61 | 27.98 | 12.35 | 25.26 | |
3 | Rome 1960 | Soviet Union | 28.29 | 19.46 | 19.38 | 22.34 | |
4 | Helsinki 1952 | United States | 26.85 | 12.58 | 10.83 | 16.63 | |
5 | London 1948 | United States | 26.39 | 18.49 | 12.75 | 19.13 | |
6 | Mexico City 1968 | United States | 25.86 | 16.47 | 18.58 | 20.30 | |
7 | Munich 1972 | Soviet Union | 25.64 | 13.85 | 10.48 | 16.50 | |
8 | Montreal 1976 | Soviet Union | 24.75 | 20.60 | 16.20 | 20.39 | |
9 | Melbourne 1956 | Soviet Union | 24.18 | 18.95 | 19.88 | 20.99 | |
10 | Moscow 1980 | East Germany | 23.04 | 18.14 | 18.83 | 19.97 | |
Data: www.sports-reference.com |
Problem 2: The Problem of Politics (and Pandemics)
We start to see performances from older Olympics (e.g. Melbourne 1956) making the cut once we account for the smaller number of medals available in those earlier years. Unfortunately, looking at the top two national performances we see another problem in our quest to calculate Olympic success. 4 of the top 10 national performances in our table come from Olympics marred by boycotts. The impressive display of the Soviet Union in 1980 and USA in 1984 is a bit less impressive when you remember that their major sporting rivals boycotted these games. It’s hard to compare performances from boycotted games to those with full participation.
Olympics is never devoid of politics, but to make things easier let’s drill down on Olympics from Atlanta 1996 onwards. This avoids major issues of Olympic boycotts and it covers a period where country borders have been a bit more stable. Because our data only goes up to Rio 2016, we also avoid the Covid disrupted event of 2020/2021.
Show the code
<- readr::read_csv("data/Country_Flags.csv", show_col_type = FALSE) %>%
flag_db #Convert country names into 3-letter country codes
mutate(Code_raw = countrycode(sourcevar = Country, origin = "country.name", destination = "iso3c", warn = FALSE),
## There are a few cases that need to be specified manually
Code = case_when(Country == "Soviet Union" ~ "SUN",
== "East Germany" ~ "DDR",
Country == "Yugoslavia" ~ "YUG",
Country TRUE ~ Code_raw)) %>%
select(Code, flag_URL = ImageURL)
<- readr::read_csv("data/games_names.csv", show_col_type = FALSE)
games_names
<- medals_wide_prop |>
plot_data filter(Year >= 1996) |>
arrange(desc(perc_Gold), desc(perc_Silver), desc(perc_Bronze)) |>
slice(1:10) |>
mutate(rank = 1:n()) |>
left_join(games_names, by = "Year") |>
left_join(flag_db, by = c("ISO3c" = "Code")) |>
select(rank, Game, flag_URL, country, perc_Gold, perc_Silver, perc_Bronze, perc_Total)
<- col_numeric(c("grey99", "#fcc861"), domain = c(0, max(plot_data$perc_Gold)), alpha = 0.75)
perc_palette_Gold <- col_numeric(c("grey99", "#e5e5e5"), domain = c(0, max(plot_data$perc_Silver)), alpha = 0.75)
perc_palette_Silver <- col_numeric(c("grey99", "#dcb386"), domain = c(0, max(plot_data$perc_Bronze)), alpha = 0.75)
perc_palette_Bronze
|>
plot_data gt() |>
cols_label(rank = "",
Game = "Games",
country = "Country",
perc_Gold = "Gold (%)",
perc_Silver = "Silver (%)",
perc_Bronze = "Bronze (%)",
perc_Total = "Total (%)") %>%
tab_header(title = md("Olympic medal tally (%)"),
subtitle = "Best national performances (1996 - 2016)") %>%
tab_source_note(source_note = "Data: www.sports-reference.com") %>%
tab_style(
locations = cells_column_labels(columns = everything()),
style = list(
cell_borders(sides = "bottom", weight = px(3)),
cell_text(weight = "bold")
)%>%
) tab_style(
locations = cells_title(groups = "title"),
style = list(
cell_text(weight = "bold", size = 24)
)%>%
) data_color(columns = c(perc_Gold),
fn = perc_palette_Gold) %>%
data_color(columns = c(perc_Silver),
fn = perc_palette_Silver) %>%
data_color(columns = c(perc_Bronze),
fn = perc_palette_Bronze) %>%
fmt_number(columns = c(perc_Gold, perc_Silver, perc_Bronze, perc_Total),
decimals = 2) %>%
opt_all_caps() %>%
opt_table_font(
font = list(
google_font("Chivo"),
default_fonts()
)%>%
) cols_width(rank ~ px(40),
c(country, Game) ~ px(125),
c(perc_Gold,
perc_Silver,
perc_Bronze,~ px(85)) %>%
perc_Total) tab_options(
column_labels.border.top.width = px(3),
column_labels.border.top.color = "transparent",
table.border.top.color = "transparent",
table.border.bottom.color = "transparent",
data_row.padding = px(3),
source_notes.font.size = 12,
heading.align = "left") |>
::text_transform(
gt#Apply a function to a column
locations = cells_body(c(flag_URL)),
fn = function(x) {
#Return an image of set dimensions
web_image(
url = x,
height = 12
)
}%>%
) #Hide column header flag_URL and reduce width
cols_width(c(flag_URL) ~ px(30)) %>%
cols_label(flag_URL = "")
Olympic medal tally (%) | |||||||
Best national performances (1996 - 2016) | |||||||
Games | Country | Gold (%) | Silver (%) | Bronze (%) | Total (%) | ||
---|---|---|---|---|---|---|---|
1 | Beijing 2008 | China | 16.89 | 6.95 | 7.93 | 10.45 | |
2 | Atlanta 1996 | United States | 16.24 | 11.72 | 8.42 | 12.01 | |
3 | London 2012 | United States | 15.23 | 9.21 | 8.15 | 10.71 | |
4 | Rio 2016 | United States | 14.98 | 12.09 | 10.56 | 12.44 | |
5 | London 2012 | China | 12.58 | 8.88 | 6.46 | 9.15 | |
6 | Sydney 2000 | United States | 12.04 | 8.00 | 9.51 | 9.84 | |
7 | Athens 2004 | United States | 11.96 | 13.00 | 7.98 | 10.90 | |
8 | Beijing 2008 | United States | 11.92 | 12.91 | 9.92 | 11.49 | |
9 | Sydney 2000 | Russia | 10.70 | 9.33 | 8.90 | 9.62 | |
10 | Athens 2004 | China | 10.63 | 5.67 | 4.29 | 6.80 | |
Data: www.sports-reference.com |
Problem 3: The Problem of Money
We now see the top national performances from the past 20 years (6 Olympics) with the issue of medal inflation removed and political boycotts avoided. China and the US dominate the standings, with 9 of the top 10 performances on record. But is this really surprising considering these are the world’s two largest economies with countless amounts of money to throw at sport (and the prestige it can bring)? If we really want to measure sporting achievement, maybe we should remove the clear advantage available to wealthy countries. Instead of just showing medal percentage, we can instead show medal percentage per $GDP to account for economic opportunity.
Show the code
## Load GDP data
## From World Bank https://databank.worldbank.org/source/world-development-indicators
<- readr::read_csv("./data/GDP_data.csv", skip = 4, show_col_types = FALSE) |>
GDP ## Pivot longer to allow us to left join
::pivot_longer(cols = `1960`:`2023`, names_to = "year", values_to = "gdp") |>
tidyrmutate(year = as.numeric(year)) |>
::clean_names() |>
janitorselect(country_code:indicator_name, year, gdp) |>
## Remove countries with no GDP data
filter(!is.na(gdp))
<- medals_wide_prop |>
medals_gdp ## Join in GDP data
left_join(GDP, by = c("ISO3c" = "country_code", "Year" = "year")) |>
## Calculate medal % per billion$ GDP
mutate(gdp_bil = gdp/1e9,
perc_Gold_gdp = perc_Gold/gdp_bil,
perc_Silver_gdp = perc_Silver/gdp_bil,
perc_Bronze_gdp = perc_Bronze/gdp_bil) |>
select(Year, Season, ISO3c, country, perc_Gold_gdp:perc_Bronze_gdp, everything())
medals_gdp
# A tibble: 2,463 × 14
Year Season ISO3c country perc_Gold_gdp perc_Silver_gdp perc_Bronze_gdp
<dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 2016 Summer USA United Stat… 0.000797 0.000643 0.000561
2 2016 Summer GBR Great Brita… 0.00327 0.00280 0.00176
3 2016 Summer CHN China 0.000754 0.000524 0.000643
4 2016 Summer RUS Russia 0.00485 0.00435 0.00435
5 2016 Summer DEU Germany 0.00160 0.000942 0.00120
6 2016 Summer JPN Japan 0.000781 0.000522 0.00117
7 2016 Summer FRA France 0.00132 0.00238 0.00157
8 2016 Summer KOR South Korea 0.00195 0.000654 0.00167
9 2016 Summer ITA Italy 0.00139 0.00209 0.00118
10 2016 Summer AUS Australia 0.00216 0.00298 0.00230
# ℹ 2,453 more rows
# ℹ 7 more variables: perc_Gold <dbl>, perc_Silver <dbl>, perc_Bronze <dbl>,
# perc_Total <dbl>, indicator_name <chr>, gdp <dbl>, gdp_bil <dbl>
Show the code
<- readr::read_csv("data/Country_Flags.csv", show_col_type = FALSE) %>%
flag_db #Convert country names into 3-letter country codes
mutate(Code_raw = countrycode(sourcevar = Country, origin = "country.name", destination = "iso3c", warn = FALSE),
## There are a few cases that need to be specified manually
Code = case_when(Country == "Soviet Union" ~ "SUN",
== "East Germany" ~ "DDR",
Country == "Yugoslavia" ~ "YUG",
Country TRUE ~ Code_raw)) %>%
select(Code, flag_URL = ImageURL)
<- readr::read_csv("data/games_names.csv", show_col_type = FALSE)
games_names
<- medals_gdp |>
plot_data left_join(medals_wide |> select(Year, ISO3c, total_medals),
by = c("Year", "ISO3c")) |>
filter(Year >= 1996 & total_medals >= 5) |>
arrange(desc(perc_Gold_gdp), desc(perc_Silver_gdp), desc(perc_Bronze_gdp)) |>
mutate(rank = 1:n()) |>
filter(rank <= 10 | (Year == 2008 & ISO3c == "CHN") | (Year == 1996 & ISO3c == "USA")) |>
left_join(games_names, by = "Year") |>
left_join(flag_db, by = c("ISO3c" = "Code")) |>
select(rank, Game, flag_URL, country, perc_Gold_gdp, perc_Silver_gdp, perc_Bronze_gdp, gdp_bil)
## Create dynamic palettes to colour Gold, Silver, and Bronze columns
<- col_numeric(c("grey99", "#fcc861"), domain = c(0, max(plot_data$perc_Gold_gdp)), alpha = 0.75)
perc_palette_Gold <- col_numeric(c("grey99", "#e5e5e5"), domain = c(0, max(plot_data$perc_Silver_gdp)), alpha = 0.75)
perc_palette_Silver <- col_numeric(c("grey99", "#dcb386"), domain = c(0, max(plot_data$perc_Bronze_gdp)), alpha = 0.75)
perc_palette_Bronze
|>
plot_data gt() |>
cols_label(rank = "",
Game = "Games",
country = "Country",
perc_Gold_gdp = md("Gold<br>(%/$GDP)"),
perc_Silver_gdp = md("Silver<br>(%/$GDP)"),
perc_Bronze_gdp = md("Bronze<br>(%/$GDP)"),
gdp_bil = md("GDP<br>(Billion $USD)")) %>%
tab_header(title = md("Olympic performance (% relative to GDP)"),
subtitle = md("Best national performances (1996 - 2016)<br>Considering countries that won at least 5 medals")) %>%
tab_source_note(source_note = md("Olympic data: www.sports-reference.com<br>GDP data: World Bank Group")) %>%
tab_style(
locations = cells_column_labels(columns = everything()),
style = list(
cell_borders(sides = "bottom", weight = px(3)),
cell_text(weight = "bold")
)%>%
) tab_style(
locations = cells_title(groups = "title"),
style = list(
cell_text(weight = "bold", size = 24)
)%>%
) data_color(columns = c(perc_Gold_gdp),
fn = perc_palette_Gold) %>%
data_color(columns = c(perc_Silver_gdp),
fn = perc_palette_Silver) %>%
data_color(columns = c(perc_Bronze_gdp),
fn = perc_palette_Bronze) %>%
fmt_number(columns = c(perc_Gold_gdp, perc_Silver_gdp, perc_Bronze_gdp, gdp_bil),
decimals = 3) %>%
fmt_number(columns = c(gdp_bil), pattern = "${x}") %>%
opt_all_caps() %>%
opt_table_font(
font = list(
google_font("Chivo"),
default_fonts()
)%>%
) cols_width(rank ~ px(40),
c(country, Game) ~ px(125),
c(perc_Gold_gdp, perc_Silver_gdp, perc_Bronze_gdp) ~ px(80),
~ px(110)) %>%
gdp_bil tab_options(
column_labels.border.top.width = px(3),
column_labels.border.top.color = "transparent",
table.border.top.color = "transparent",
table.border.bottom.color = "transparent",
data_row.padding = px(3),
source_notes.font.size = 12,
heading.align = "left") |>
::text_transform(
gt#Apply a function to a column
locations = cells_body(c(flag_URL)),
fn = function(x) {
#Return an image of set dimensions
web_image(
url = x,
height = 12
)
}%>%
) #Hide column header flag_URL and reduce width
cols_width(c(flag_URL) ~ px(30)) %>%
cols_label(flag_URL = "")
Olympic performance (% relative to GDP) | |||||||
Best national performances (1996 - 2016) Considering countries that won at least 5 medals |
|||||||
Games | Country | Gold (%/$GDP) |
Silver (%/$GDP) |
Bronze (%/$GDP) |
GDP (Billion $USD) |
||
---|---|---|---|---|---|---|---|
1 | Sydney 2000 | Ethiopia | 0.162 | 0.040 | 0.112 | $8.24 | |
2 | Rio 2016 | Jamaica | 0.139 | 0.070 | 0.039 | $14.08 | |
3 | Atlanta 1996 | Cuba | 0.133 | 0.117 | 0.108 | $25.02 | |
4 | Sydney 2000 | Bulgaria | 0.126 | 0.151 | 0.046 | $13.25 | |
5 | Beijing 2008 | Jamaica | 0.121 | 0.048 | 0.041 | $13.71 | |
6 | Sydney 2000 | Cuba | 0.120 | 0.120 | 0.070 | $30.57 | |
7 | Sydney 2000 | Romania | 0.099 | 0.054 | 0.074 | $37.25 | |
8 | Atlanta 1996 | Bulgaria | 0.090 | 0.209 | 0.137 | $12.29 | |
9 | London 2012 | Jamaica | 0.089 | 0.089 | 0.076 | $14.81 | |
10 | Sydney 2000 | Belarus | 0.079 | 0.079 | 0.265 | $12.74 | |
124 | Beijing 2008 | China | 0.004 | 0.002 | 0.002 | $4,594.34 | |
162 | Atlanta 1996 | United States | 0.002 | 0.001 | 0.001 | $8,073.12 | |
Olympic data: www.sports-reference.com GDP data: World Bank Group |
We finally get a picture of the real Olympic winners! Countries that have been able to achieve sporting success despite having orders of magnitude less money to invest into expensive sporting facilities or training regimes.
This table shows the top 10 countries using our new % medal/$GDP metric, but I’ve filtered it down to show only countries that won 5 or more medals. This hopefully focuses on countries with a quality Olympic team rather than just one individual star. You can see the full (unfiltered) table below.
The Conclusion: Always think twice about statistics
Headlines about record breaking medal tallies may seem catchy, but this is often just a consequence of the increasing number of sports and events being added to the Olympics (e.g. skateboarding, climbing). Even when countries do break new medal records (accounting for medal inflation!) a lot of these wins are only possible with hefty government investment which isn’t available in most places. Next time you’re looking at the Olympic medal tally, maybe scroll down a bit to spot those countries that are really punching above their weight.
Next week, we can see what our Olympic assessment method can tell us about the results of Paris 2024.
Appendix: Top ranked countries without filtering
This table shows the top ranking countries based on % medals/$GDP, but including countries with small medals hauls (<5). We see a few new appearances but Jamaica, Ethiopia, Cuba, and Bulgaria still feature in the top 10.
Show the code
<- readr::read_csv("data/Country_Flags.csv", show_col_type = FALSE) %>%
flag_db #Convert country names into 3-letter country codes
mutate(Code_raw = countrycode(sourcevar = Country, origin = "country.name", destination = "iso3c", warn = FALSE),
## There are a few cases that need to be specified manually
Code = case_when(Country == "Soviet Union" ~ "SUN",
== "East Germany" ~ "DDR",
Country == "Yugoslavia" ~ "YUG",
Country TRUE ~ Code_raw)) %>%
select(Code, flag_URL = ImageURL)
<- readr::read_csv("data/games_names.csv", show_col_type = FALSE)
games_names
<- medals_gdp |>
plot_data left_join(medals_wide |> select(Year, ISO3c, total_medals),
by = c("Year", "ISO3c")) |>
filter(Year >= 1996) |>
arrange(desc(perc_Gold_gdp), desc(perc_Silver_gdp), desc(perc_Bronze_gdp)) |>
mutate(rank = 1:n()) |>
filter(rank <= 10 | (Year == 2008 & ISO3c == "CHN") | (Year == 1996 & ISO3c == "USA")) |>
left_join(games_names, by = "Year") |>
left_join(flag_db, by = c("ISO3c" = "Code")) |>
select(rank, Game, flag_URL, country, perc_Gold_gdp, perc_Silver_gdp, perc_Bronze_gdp, gdp_bil)
## Create dynamic palettes to colour Gold, Silver, and Bronze columns
<- col_numeric(c("grey99", "#fcc861"), domain = c(0, max(plot_data$perc_Gold_gdp)), alpha = 0.75)
perc_palette_Gold <- col_numeric(c("grey99", "#e5e5e5"), domain = c(0, max(plot_data$perc_Silver_gdp)), alpha = 0.75)
perc_palette_Silver <- col_numeric(c("grey99", "#dcb386"), domain = c(0, max(plot_data$perc_Bronze_gdp)), alpha = 0.75)
perc_palette_Bronze
|>
plot_data gt() |>
cols_label(rank = "",
Game = "Games",
country = "Country",
perc_Gold_gdp = md("Gold<br>(%/$GDP)"),
perc_Silver_gdp = md("Silver<br>(%/$GDP)"),
perc_Bronze_gdp = md("Bronze<br>(%/$GDP)"),
gdp_bil = md("GDP<br>(Billion $USD)")) %>%
tab_header(title = md("Olympic performance (% relative to GDP)"),
subtitle = md("Best national performances (1996 - 2016)")) %>%
tab_source_note(source_note = md("Olympic data: www.sports-reference.com<br>GDP data: World Bank Group")) %>%
tab_style(
locations = cells_column_labels(columns = everything()),
style = list(
cell_borders(sides = "bottom", weight = px(3)),
cell_text(weight = "bold")
)%>%
) tab_style(
locations = cells_title(groups = "title"),
style = list(
cell_text(weight = "bold", size = 24)
)%>%
) data_color(columns = c(perc_Gold_gdp),
fn = perc_palette_Gold) %>%
data_color(columns = c(perc_Silver_gdp),
fn = perc_palette_Silver) %>%
data_color(columns = c(perc_Bronze_gdp),
fn = perc_palette_Bronze) %>%
fmt_number(columns = c(perc_Gold_gdp, perc_Silver_gdp, perc_Bronze_gdp, gdp_bil),
decimals = 3) %>%
fmt_number(columns = c(gdp_bil), pattern = "${x}") %>%
opt_all_caps() %>%
opt_table_font(
font = list(
google_font("Chivo"),
default_fonts()
)%>%
) cols_width(rank ~ px(40),
c(country, Game) ~ px(125),
c(perc_Gold_gdp, perc_Silver_gdp, perc_Bronze_gdp) ~ px(80),
~ px(110)) %>%
gdp_bil tab_options(
column_labels.border.top.width = px(3),
column_labels.border.top.color = "transparent",
table.border.top.color = "transparent",
table.border.bottom.color = "transparent",
data_row.padding = px(3),
source_notes.font.size = 12,
heading.align = "left") |>
::text_transform(
gt#Apply a function to a column
locations = cells_body(c(flag_URL)),
fn = function(x) {
#Return an image of set dimensions
web_image(
url = x,
height = 12
)
}%>%
) #Hide column header flag_URL and reduce width
cols_width(c(flag_URL) ~ px(30)) %>%
cols_label(flag_URL = "")
Olympic performance (% relative to GDP) | |||||||
Best national performances (1996 - 2016) | |||||||
Games | Country | Gold (%/$GDP) |
Silver (%/$GDP) |
Bronze (%/$GDP) |
GDP (Billion $USD) |
||
---|---|---|---|---|---|---|---|
1 | Atlanta 1996 | Burundi | 0.425 | 0.000 | 0.000 | $0.87 | |
2 | London 2012 | Grenada | 0.414 | 0.000 | 0.000 | $0.80 | |
3 | Atlanta 1996 | Armenia | 0.231 | 0.229 | 0.000 | $1.60 | |
4 | Sydney 2000 | Ethiopia | 0.162 | 0.040 | 0.112 | $8.24 | |
5 | Rio 2016 | Jamaica | 0.139 | 0.070 | 0.039 | $14.08 | |
6 | Atlanta 1996 | Cuba | 0.133 | 0.117 | 0.108 | $25.02 | |
7 | Athens 2004 | Georgia | 0.130 | 0.130 | 0.000 | $5.13 | |
8 | Sydney 2000 | Azerbaijan | 0.127 | 0.000 | 0.058 | $5.27 | |
9 | Sydney 2000 | Bulgaria | 0.126 | 0.151 | 0.046 | $13.25 | |
10 | Beijing 2008 | Jamaica | 0.121 | 0.048 | 0.041 | $13.71 | |
184 | Beijing 2008 | China | 0.004 | 0.002 | 0.002 | $4,594.34 | |
234 | Atlanta 1996 | United States | 0.002 | 0.001 | 0.001 | $8,073.12 | |
Olympic data: www.sports-reference.com GDP data: World Bank Group |