# 00-init-data.R

library(tidyverse)
library(magrittr)
library(httr)
library(jsonlite)
library(here)
library(countrycode)
library(raster)
library(sf)
library(rnaturalearth)

# Shorthand for here::here(), used below to build data file paths.
h <- here::here

# Download the AMR events table from GitHub, authenticating with the
# GITHUB_USERNAME / GITHUB_PAT environment variables.
url <- "https://raw.githubusercontent.com/ecohealthalliance/amr-db/master/events-db.csv"
events <- GET(url, authenticate(Sys.getenv("GITHUB_USERNAME"), Sys.getenv("GITHUB_PAT")))
# Stop on an HTTP error (e.g. missing or invalid credentials) instead of
# handing the error body to read_csv().
stop_for_status(events)
events <- read_csv(content(events, "text")) %>%
  mutate(start_year = as.integer(substr(start_date, 1, 4))) %>%
  filter(start_year >= 2006) %>%  # removes promed mentions prior to 2006
  rename(iso3c = study_iso3c)

#-----------------Country-wide data-----------------
# Within each bacteria-drug combination, flag the event(s) whose start date
# equals the earliest start date of that combination.
events <- events %>%
  group_by(bacteria, drug) %>%
  mutate(first_emergence = min(start_date) == start_date) %>%
  ungroup()

# Per country: total number of events and number of first-emergence events.
events_by_country <- ungroup(
  summarize(
    group_by(events, iso3c),
    n_amr_events = n(),
    n_amr_first_events = sum(first_emergence)
  )
)
#-----------------World Bank data-----------------
# 2006-2017
# Population and GDP
# SP.POP.TOTL = Population, total
# SM.POP.TOTL = International migrant stock, total (number of people born in a country other than that in which they live. It also includes refugees). United Nations Population Division, Trends in Total Migrant Stock: 2012 Revision.
# NY.GDP.MKTP.CD = GDP (current US$)
# SH.XPD.CHEX.GD.ZS = Current health expenditure (% of GDP) : World Health Organization Global Health Expenditure database

# World Bank indicator codes to download (described in the comments above).
indicator <- c(
  "SP.POP.TOTL",
  "SM.POP.TOTL",
  "NY.GDP.MKTP.CD",
  "SH.XPD.CHEX.GD.ZS"
)

# Query the World Bank API once per indicator and stack the results in long
# format: one row per value / date / country code, tagged with the indicator id.
wbd <- map_df(indicator, function(x) {
  tmp <- fromJSON(paste0("http://api.worldbank.org/v2/country/all/indicator/", x, "?date=2006:2017&per_page=20000&format=json"))
  # The code below needs the second element of the parsed response to be a
  # data frame of observations; fail with a clear message when it is not.
  if (length(tmp) < 2 || !is.data.frame(tmp[[2]])) {
    stop("World Bank API returned no data for indicator ", x, call. = FALSE)
  }
  tmp[[2]] %>%
    dplyr::select(value, date, countryiso3code) %>%
    mutate(param = tmp[[2]][[1]]$id) %>%
    as_tibble() %>%
    filter(countryiso3code != "") # drop rows without an ISO3 code
})

# Reshape to one row per country-date with one column per indicator, convert
# migrant stock to a share of total population, and give columns readable names.
# NOTE(review): later code selects `gdp_dollars:population` by position, so the
# column order produced by spread() here matters.
wbd <- wbd %>%
  filter(countryiso3code != "AFE") %>% # duplicates, and unclear which country this is
  drop_na(value) %>%
  spread(key = param, value = value) %>%
  mutate(SM.POP.TOTL = SM.POP.TOTL / SP.POP.TOTL) %>% # migrant stock divided by total population
  rename(
    iso3c = countryiso3code,
    population = SP.POP.TOTL,
    migrant_pop_per_capita = SM.POP.TOTL,
    gdp_dollars = NY.GDP.MKTP.CD,
    health_expend_perc = SH.XPD.CHEX.GD.ZS
  )

# Average each indicator over the date range, per country. mean() with
# na.rm = TRUE yields NaN when every value is missing; turn those into NA.
wbd <- wbd %>%
  group_by(iso3c) %>%
  summarize(across(gdp_dollars:population, function(v) mean(v, na.rm = TRUE))) %>%
  ungroup() %>%
  mutate(across(gdp_dollars:population, function(v) if_else(is.nan(v), NA_real_, v)))

#-----------------Observatory of Economic Complexity------------
# 2006-2017
# Visualization: https://oec.world/en/profile/hs92/antibiotics?yearSelector1=tradeYear4
# Values in billions


# Build the OEC query URL for one year.
# `role` is "Exporter" or "Importer". The raw query contains spaces
# ("Exporter Country", "Trade Value", ...), so it is percent-encoded before
# being requested.
oec_url <- function(year, role) {
  URLencode(paste0(
    "https://oec.world/olap-proxy/data?cube=trade_i_baci_a_92&drilldowns=Year,",
    role, " Country&measures=Trade Value&parents=true&Year=", year,
    "&HS4=62941&properties=", role, " Country ISO 3"
  ))
}

# One request per year, 2006-2017; rows are stacked across years.
oec_exports <- map_df(2006:2017, function(y) {
  fromJSON(oec_url(y, "Exporter"))$data %>%
    dplyr::select(year = Year, iso3c = `ISO 3`, ab_export_dollars = `Trade Value`)
})

oec_imports <- map_df(2006:2017, function(y) {
  fromJSON(oec_url(y, "Importer"))$data %>%
    dplyr::select(year = Year, iso3c = `ISO 3`, ab_import_dollars = `Trade Value`)
})

# Combine exports and imports (both tables carry exactly `year` and `iso3c` as
# shared columns), then average each trade value over years per country.
oec <- full_join(oec_exports, oec_imports, by = c("year", "iso3c")) %>%
  drop_na(iso3c) %>%
  mutate(iso3c = toupper(iso3c)) %>%
  group_by(iso3c) %>%
  summarize(across(ab_export_dollars:ab_import_dollars, ~ mean(.x, na.rm = TRUE))) %>%
  ungroup() %>%
  mutate(
    # mean() of an all-NA column is NaN; store it as NA instead
    across(ab_export_dollars:ab_import_dollars, ~ if_else(is.nan(.x), NA_real_, .x)),
    # 1 when the country has any export value, 0 otherwise
    ab_export_bin = if_else(is.na(ab_export_dollars), 0, 1)
  )

#-----------------Pubcrawler data-----------------
# Pubcrawler reporting effort index, as used in Hotspots II, aggregated by country.
# https://www.nature.com/articles/s41467-017-00923-8#MOESM1
# Values are unitless
pubcrawl_weights <- read_csv(h("data/pubcrawler_country.csv")) %>% # generated by Toph on 2/1/19
  # lower-case the names, then fix spellings before matching to ISO3 codes
  mutate(country = tolower(country)) %>%
  mutate(
    country = recode(
      country,
      "belice" = "belize",
      "brasil" = "brazil",
      "cen african rep" = "central african republic",
      "czech r" = "czech republic",
      "dominican r" = "dominican republic",
      "england" = "united kingdom",
      "mauretania" = "mauritania",
      "moracco" = "morocco",
      "morocco (includes western sahara)" = "western sahara",
      "philippine" = "philippines",
      "tunesia" = "tunisia"
    )
  ) %>%
  mutate(
    iso3c = countrycode(
      sourcevar = country,
      origin = "country.name",
      destination = "iso3c"
    )
  ) %>%
  drop_na(iso3c) %>% # federal republic of yugoslavia, netherland antilles
  # drop the MRT row that has zero publications
  filter(iso3c != "MRT" | pubs_sum != 0) %>%
  dplyr::select(-country, pubcrawl = pubs_sum)

#-----------------GRITS ProMED-----------------
# ProMED mention counts keyed by ISO2 code; convert the key to ISO3.
promed_weights <- read_csv(h("data/promed_mentions.csv"))
promed_weights <- promed_weights %>%
  mutate(
    iso3c = countrycode(
      sourcevar = country_code,
      origin = "iso2c",
      destination = "iso3c"
    )
  ) %>%
  drop_na(iso3c) %>% # netherland antilles, Serbia and Montenegro, Kosovo, Yugoslavia
  dplyr::select(-country_code, promed_mentions = mentions)

#-----------------Language by country-----------------
# CIA World Factbook https://www.cia.gov/library/publications/the-world-factbook/fields/402.html
# The file is read as a single `info` column. After dropping lines containing
# "note", rows are labelled alternately as country / language
# (assumes the file strictly alternates the two -- TODO confirm).
lang <- read_csv(h("data/cia-world-factbook-language.csv"), col_names = "info") %>%
  filter(!str_detect(info, "note"))
lang <- mutate(lang, desc = rep(c("country", "language"), nrow(lang) / 2))

# Put each country row side by side with its language row, flag whether the
# language text mentions English, and map the country name to ISO3.
lang <- bind_cols(
  lang %>%
    filter(desc == "country") %>%
    dplyr::select(country = info) %>%
    mutate(country = str_replace(country, "Eswatini", "Swaziland")),
  lang %>%
    filter(desc == "language") %>%
    dplyr::select(language = info)
) %>%
  mutate(
    english_spoken = as.numeric(str_detect(language, "English")),
    iso3c = countrycode(
      sourcevar = country,
      origin = "country.name",
      destination = "iso3c"
    )
  ) %>%
  dplyr::select(iso3c, english_spoken) %>%
  drop_na(iso3c) %>% # Akrotiri, Dhekelia, Eswatini, European Union, Kosovo, Saint Martin, Virgin Islands, World
  distinct()
#-----------------Human Consumption data-----------------
# 2014 as stated (https://www.thelancet.com/action/showPdf?pii=S2542-5196%2818%2930186-4)
# From IQVIA MIDAS via ResistanceMap from the Center For Disease Dynamics, Economics & Policy, as provided in Collignon EA 2018 supp data via personal correspondence 
# Values are Total defined daily dose (DDD) per capita
# Defined Daily Dose (DDD): The assumed average maintenance dose per day for a drug used for its main indication in adults.
# there are three fields with human consumption data.  `CCDEP Usage Per Capita (Units of usage are on average 45% of DDD measured by ECDC)` has the most available.

# Keep only the ISO code and the CCDEP per-capita usage column, renamed on
# selection; drop rows with the literal "na" code or a missing value.
human_consumption <- readxl::read_xlsx(h("data/Supplementary Table 1 Spreadsheet Data[2].xlsx")) %>%
  dplyr::select(
    iso3c = `International Organization for Standardization (ISO)*`,
    human_consumption_ddd = `CCDEP Usage Per Capita (Units of usage are on average 45% of DDD measured by ECDC)`
  ) %>%
  filter(iso3c != "na") %>%
  drop_na(iso3c, human_consumption_ddd)

#-----------------Livestock Consumption data-----------------
# 2017
# https://www.dropbox.com/s/6b6426nqiixl7dv/2017_Tonnes_PCU_2.csv?dl=0
# New data from T. Van Boeckel 

# Observed livestock antimicrobial consumption, converted from tonnes to kg.
livestock_consumption <- read_csv(h("data/2017_Tonnes_PCU_2.csv")) %>%
  dplyr::select(iso3c = ISO3, livestock_consumption_kg = Observed_Tonnes) %>%
  drop_na(livestock_consumption_kg) %>%
  mutate(livestock_consumption_kg = livestock_consumption_kg * 1000)

# Positive, non-missing PCU totals per country.
livestock_pcu <- read_csv(h("data/2017_Tonnes_PCU_2.csv")) %>%
  dplyr::select(iso3c = ISO3, livestock_pcu = Tot_PCU_2017) %>%
  filter(!is.na(livestock_pcu), livestock_pcu > 0) %>%
  group_by(iso3c) %>%
  summarize(livestock_pcu = mean(livestock_pcu)) %>% # dealing with a dupe
  ungroup()

#-----------------Tourism data-----------------
# 2006-2017
# World Tourism Organization (2019), Compendium of Tourism Statistics dataset [Electronic], UNWTO, Madrid, data updated on 11/01/2019.
# Outbound tourism downloaded from https://www.e-unwto.org/doi/suppl/10.5555/unwtotfb0000290019952017201901
# Outbound tourism - Departures of visitors (overnight visitors -tourists- and same-day visitors -excursionists-)… Thousands
# Reshaped to one row per country-year; ".." entries are dropped.
tour_outbound <- readxl::read_xlsx(h("data/0000290019952017201901.xlsx"), sheet = 1, skip = 5) %>%
  dplyr::select(COUNTRY, `2006`:`2017`) %>%
  filter(!is.na(COUNTRY)) %>%
  pivot_longer(cols = -COUNTRY, names_to = "year", values_to = "tourism_outbound") %>%
  filter(tourism_outbound != "..")
# Inbound tourism - Arrivals of non-resident visitors (overnight visitors -tourists- and same-day visitors -excur… Thousands
# Same reshaping as the outbound table.
tour_inbound <- readxl::read_xlsx(h("data/0000270019952017201901.xlsx"), sheet = 1, skip = 5) %>%
  dplyr::select(COUNTRY, `2006`:`2017`) %>%
  filter(!is.na(COUNTRY)) %>%
  pivot_longer(cols = -COUNTRY, names_to = "year", values_to = "tourism_inbound") %>%
  filter(tourism_inbound != "..")

# Combine outbound and inbound series (the tables share exactly COUNTRY and
# year), convert values to numeric, map country names to ISO3, and average
# over years per country.
tourism <- full_join(tour_outbound, tour_inbound, by = c("COUNTRY", "year")) %>%
  mutate(
    across(c(tourism_outbound, tourism_inbound), as.numeric),
    COUNTRY = str_replace(COUNTRY, "Eswatini", "Swaziland"),
    iso3c = countrycode(
      sourcevar = COUNTRY,
      origin = "country.name",
      destination = "iso3c"
    )
  ) %>%
  dplyr::select(-COUNTRY) %>%
  drop_na(iso3c) %>%
  group_by(iso3c) %>%
  summarize(across(tourism_outbound:tourism_inbound, ~ mean(.x, na.rm = TRUE))) %>%
  ungroup() %>%
  # mean() of an all-NA column is NaN; store it as NA instead
  mutate(across(tourism_outbound:tourism_inbound, ~ if_else(is.nan(.x), NA_real_, .x)))


#-----------------All countries-----------------
# Master country list; ISO3 codes are derived from the `country` name column.
all_countries <- read_csv(h("data", "country-list.csv"))
all_countries <- mutate(
  all_countries,
  iso3c = countrycode(
    sourcevar = country,
    origin = "country.name",
    destination = "iso3c"
  )
)

#-----------------All data-----------------
# Left-join every country-level table onto the master country list, in order.
# Join keys are left implicit (the columns the tables have in common).
amr <- purrr::reduce(
  list(
    all_countries,
    events_by_country,
    pubcrawl_weights,
    promed_weights,
    wbd,
    oec,
    lang,
    human_consumption,
    livestock_consumption,
    livestock_pcu,
    tourism
  ),
  left_join
)

# make sure no event or consumption data lost:
# each call lists iso3c codes present in a source table but absent from `amr`
setdiff(events[["iso3c"]], amr[["iso3c"]])
setdiff(human_consumption[["iso3c"]], amr[["iso3c"]])
setdiff(livestock_consumption[["iso3c"]], amr[["iso3c"]]) # islands

# check for dupes
janitor::get_dupes(amr, iso3c)

# Post process: derive per-capita (and per-PCU) variables, then drop the raw
# totals they were computed from along with the country name.
# Tourism figures are multiplied by 1000 before dividing by population.
amr <- amr %>%
  mutate(
    ab_export_per_capita = ab_export_dollars / population,
    ab_import_per_capita = ab_import_dollars / population,
    livestock_consumption_kg_per_capita = livestock_consumption_kg / population,
    livestock_consumption_kg_per_pcu = livestock_consumption_kg / livestock_pcu,
    gdp_per_capita = gdp_dollars / population,
    tourism_outbound_per_capita = (tourism_outbound * 1000) / population,
    tourism_inbound_per_capita = (tourism_inbound * 1000) / population,
    pubcrawl_per_capita = pubcrawl / population,
    promed_mentions_per_capita = promed_mentions / population
  ) %>%
  dplyr::select(
    -ab_export_dollars,
    -ab_import_dollars,
    -tourism_outbound,
    -tourism_inbound,
    -livestock_consumption_kg,
    -country,
    -gdp_dollars,
    -pubcrawl,
    -promed_mentions
  )


# Abort before writing if any iso3c value occurs more than once.
# are_equal() on its own only returns TRUE/FALSE, so the check is wrapped in
# assert_that() to actually stop the script.
assertthat::assert_that(
  nrow(janitor::get_dupes(amr, iso3c)) == 0,
  msg = "Duplicate iso3c values found in `amr`; not writing output."
)
write_csv(amr, h("data/country-level-amr.csv"))