In this document, I parse the email list of Ready for R participants and try to map addresses that have a country-code top-level domain to their countries, in order to produce a choropleth of where Ready for R participants are from.

# Default chunk options: keep warnings and messages out of the rendered output.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

# Attach the packages used below, hiding their start-up messages.
suppressMessages({
  library(readr)
  library(janitor)
  library(dplyr)
  library(tldextract)
  library(ggplot2)
  library(forcats)
  library(urltools)
})


# Read the exported contact list and convert its column names to snake_case
# (the `E-mail` column is referenced as `e_mail` further down).
email_addresses <- clean_names(
  read_csv("tinyletter_contacts_2020_07_4.csv")
)
## Parsed with column specification:
## cols(
##   `E-mail` = col_character(),
##   `Subscribe Date (GMT)` = col_datetime(format = ""),
##   Notes = col_logical()
## )

Extract the very top-level domain from TLDs like co.uk (giving uk) by stripping the leading dot-separated labels with gsub(). Now that I think about it, this is probably introducing lots of .co domains, so I need to fix this.

# Retrieve the list of known TLD names.
# NOTE(review): `tld_names` is not referenced again in the visible code;
# presumably it was meant to be passed as `tldnames =` to tldextract() --
# confirm before removing or wiring it in.
tld_names <- getTLD()

# Collapse a multi-label suffix to its final label so that it can be joined
# against single-label country domains ("co.uk" -> "uk", "k12.ca.us" -> "us").
# The anchored pattern drops everything up to and including the last dot; the
# previous "[a-z]+\\." pattern skipped labels containing digits, hyphens or
# upper-case letters and left results such as "k12.us". Values without a dot
# (and NA) pass through unchanged.
tld_suffix <- suffix_extract(email_addresses$e_mail) %>%
  mutate(tld = sub("^.*\\.", "", suffix))

# Same reduction applied to the `tld` column returned by tldextract(); this is
# the table the rest of the analysis works from.
tlds <- tldextract(email_addresses$e_mail) %>%
  mutate(tld = sub("^.*\\.", "", tld))

As a first pass, I scrape the country-level domains from https://www.countries-ofthe-world.com/TLD-list.html. I tried multiple other CSV files from other sources, but this is the only source I could find with domain names mapped to countries.

library(rvest)
library(stringr)

# Download the TLD reference page.
country_page <- xml2::read_html("https://www.countries-ofthe-world.com/TLD-list.html")

# Stack every HTML table on the page into one data frame, keep only rows whose
# `domain` contains a dot, and add `tld` = `domain` without its leading dot so
# it can be matched against the e-mail TLDs.
countries <- country_page %>%
  html_table() %>%
  bind_rows() %>%
  janitor::clean_names() %>%
  filter(grepl("\\.", domain)) %>%
  mutate(tld = str_remove(domain, "^\\."))

Map the email domains to countries using left_join(). Count by country.

# Count addresses per (country, TLD) for every TLD other than com/edu/org.
# TLDs with no match in `countries` get an NA country and are dropped at the
# end. The factor levels are ordered by descending count and then reversed.
tld_counts <- tlds %>%
  filter(tld != "com", tld != "edu", tld != "org") %>%
  left_join(countries, by = "tld") %>%
  count(country_territory, tld) %>%
  arrange(desc(n)) %>%
  mutate(
    country_territory = fct_rev(fct_reorder(country_territory, desc(n)))
  ) %>%
  tidyr::drop_na(country_territory)

# Horizontal bar chart of address counts per country/territory.
ggplot(tld_counts, aes(x = country_territory, y = n)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(title = "Country Domains Represented in List") +
  coord_flip()

Use choroplethr::country_choropleth() to display country counts. The data is still pretty dirty; this is a first pass.

library(choroplethr)
library(choroplethrMaps)

# country_choropleth() is given a two-column frame: `region` (lower-cased
# country name) and `value` (the count to shade by).
tld_counts %>%
  select(region = country_territory, value = n) %>%
  mutate(region = tolower(region)) %>%
  country_choropleth(
    title = "Ready for R Users by Country Domain",
    num_colors = 4
  )

Look at the top level domains by frequency count. As expected, .com and .edu dominate the list.

# Horizontal bar chart of how often each TLD occurs. The factor levels are
# ordered by descending count and then reversed.
tlds %>%
  count(tld = factor(tld)) %>%
  arrange(desc(n)) %>%
  mutate(tld = fct_rev(fct_reorder(tld, desc(n)))) %>%
  ggplot(aes(x = tld, y = n)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Top Level Domains Sorted by Frequency") +
  coord_flip()

# Horizontal bar chart of the 20 most frequent e-mail hosts among addresses
# whose TLD contains "edu".
tlds %>%
  filter(grepl("edu", tld)) %>%
  # Split `domain` on "@" into `email` and `domain`; rows with no "@" get an
  # NA `domain` and are removed by drop_na() below.
  tidyr::separate(domain, into = c("email", "domain"), sep = "@") %>%
  tidyr::drop_na(domain) %>%
  count(domain) %>%
  arrange(desc(n)) %>%
  # Rows are already sorted by descending count, so the first 20 are the top
  # 20. `n` must be passed by name only: the earlier `slice_head(n, n=20)`
  # leaked a stray positional argument into `...`, which current dplyr rejects.
  slice_head(n = 20) %>%
  mutate(domain = fct_reorder(domain, desc(n))) %>%
  mutate(domain = fct_rev(domain)) %>%
  ggplot() +
  aes(x = domain, y = n) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Top 20 .edu domains") +
  coord_flip()

How Many OHSU folks?

# Number of addresses whose `domain` contains "ohsu". grepl() returns only
# TRUE/FALSE, so summing it gives the number of matching rows.
sum(grepl("ohsu", tlds$domain))
## [1] 33