Merge pull request #85 from ImperialCollegeLondon/version3

nature submission mirror with refactoring
ImperialCollegeLondon · Apr 24, 2020 · 865806f · 865806f
2 parents 742ddd0 + d41c601
commit 865806f
Show file tree

Hide file tree

Showing 12 changed files with 13,082 additions and 10,155 deletions.
diff --git a/base.r b/base.r
@@ -7,22 +7,8 @@ library(tidyr)
 library(EnvStats)
 library(optparse)
 
-countries <- c(
-  "Denmark",
-  "Italy",
-  "Germany",
-  "Spain",
-  "United_Kingdom",
-  "France",
-  "Norway",
-  "Belgium",
-  "Austria", 
-  "Sweden",
-  "Switzerland",
-  "Greece",
-  "Portugal",
-  "Netherlands"
-)
+source('utils/read-data.r')
+source('utils/process-covariates.r')
 
 # Commandline options and parsing
 parser <- OptionParser()
@@ -54,198 +40,44 @@ if(length(cmdoptions$args) == 0) {
 } else {
   StanModel = cmdoptions$args[1]
 }
- 
+
 print(sprintf("Running %s",StanModel))
 if(DEBUG) {
   print("Running in DEBUG mode")
 } else if (FULL) {
   print("Running in FULL mode")
 }
 
-## Reading all data
-d=readRDS('data/COVID-19-up-to-date.rds')
-
-## Ensure that output directories exist
-dir.create("results/", showWarnings = FALSE, recursive = TRUE)
-dir.create("figures/", showWarnings = FALSE, recursive = TRUE)
-dir.create("web/", showWarnings = FALSE, recursive = TRUE)
-
-## get IFR and population from same file
-ifr.by.country = read.csv("data/popt_ifr.csv")
-ifr.by.country$country = as.character(ifr.by.country[,2])
-ifr.by.country$country[ifr.by.country$country == "United Kingdom"] = "United_Kingdom"
-
-serial.interval = read.csv("data/serial_interval.csv")
-covariates = read.csv('data/interventions.csv', stringsAsFactors = FALSE)
-names_covariates = c('Schools + Universities','Self-isolating if ill', 'Public events', 'Lockdown', 'Social distancing encouraged')
-covariates <- covariates %>%
-  filter((Type %in% names_covariates))
-covariates <- covariates[,c(1,2,4)]
-covariates <- spread(covariates, Type, Date.effective)
-names(covariates) <- c('Country','lockdown', 'public_events', 'schools_universities','self_isolating_if_ill', 'social_distancing_encouraged')
-covariates <- covariates[c('Country','schools_universities', 'self_isolating_if_ill', 'public_events', 'lockdown', 'social_distancing_encouraged')]
-covariates$schools_universities <- as.Date(covariates$schools_universities, format = "%d.%m.%Y")
-covariates$lockdown <- as.Date(covariates$lockdown, format = "%d.%m.%Y")
-covariates$public_events <- as.Date(covariates$public_events, format = "%d.%m.%Y")
-covariates$self_isolating_if_ill <- as.Date(covariates$self_isolating_if_ill, format = "%d.%m.%Y")
-covariates$social_distancing_encouraged <- as.Date(covariates$social_distancing_encouraged, format = "%d.%m.%Y")
-## using covariates as dates we want
-covariates$schools_universities[covariates$schools_universities > covariates$lockdown] <- covariates$lockdown[covariates$schools_universities > covariates$lockdown]
-covariates$public_events[covariates$public_events > covariates$lockdown] <- covariates$lockdown[covariates$public_events > covariates$lockdown]
-covariates$social_distancing_encouraged[covariates$social_distancing_encouraged > covariates$lockdown] <- covariates$lockdown[covariates$social_distancing_encouraged > covariates$lockdown]
-covariates$self_isolating_if_ill[covariates$self_isolating_if_ill > covariates$lockdown] <- covariates$lockdown[covariates$self_isolating_if_ill > covariates$lockdown]
-
-forecast = 0
-
-N2 = 120 # increase if you need more forecast
-
-dates = list()
-reported_cases = list()
-# Pads serial interval with 0 if N2 is greater than the length of the serial
-# interval array
-if (N2 > length(serial.interval$fit)) {
-  pad_serial.interval <- data.frame(
-    "X"=(length(serial.interval$fit)+1):N2,
-    "fit"=rep(0.0, max(N2-length(serial.interval$fit), 0 ))
-  )
-  serial.interval = rbind(serial.interval, pad_serial.interval)
-}
-stan_data = list(M=length(countries),N=NULL,covariate1=NULL,covariate2=NULL,covariate3=NULL,covariate4=NULL,covariate5=NULL,covariate6=NULL,deaths=NULL,f=NULL,
-                 N0=6,cases=NULL,SI=serial.interval$fit[1:N2],
-                 EpidemicStart = NULL, pop = NULL) # N0 = 6 to make it consistent with Rayleigh
-deaths_by_country = list()
-
-# various distributions required for modeling
-mean1 = 5.1; cv1 = 0.86; # infection to onset
-mean2 = 18.8; cv2 = 0.45 # onset to death
-x1 = rgammaAlt(1e7,mean1,cv1) # infection-to-onset distribution
-x2 = rgammaAlt(1e7,mean2,cv2) # onset-to-death distribution
-
-ecdf.saved = ecdf(x1+x2)
-
-for(Country in countries) {
-  IFR=ifr.by.country$ifr[ifr.by.country$country == Country]
-
-  covariates1 <- covariates[covariates$Country == Country, c(2,3,4,5,6)]
-
-  d1_pop = ifr.by.country[ifr.by.country$country==Country,]
-  d1=d[d$Countries.and.territories==Country,c(1,5,6,7)]
-  d1$date = as.Date(d1$DateRep,format='%d/%m/%Y')
-  d1$t = decimal_date(d1$date) 
-  d1=d1[order(d1$t),]
-
-  date_min <- dmy('31/12/2019') 
-  if (as.Date(d1$DateRep[1], format='%d/%m/%Y') > as.Date(date_min, format='%d/%m/%Y')){
-    print(paste(Country,'In padding'))
-    pad_days <- as.Date(d1$DateRep[1], format='%d/%m/%Y') - date_min
-    pad_dates <- date_min + days(1:pad_days[[1]]-1)
-    padded_data <- data.frame("Countries.and.territories" = rep(Country, pad_days),
-                              "DateRep" = format(pad_dates, '%d/%m/%Y'),
-                              "t" = decimal_date(as.Date(pad_dates,format='%d/%m/%Y')),
-                              "date" = as.Date(pad_dates,format='%d/%m/%Y'),
-                              "Cases" = as.integer(rep(0, pad_days)),
-                              "Deaths" = as.integer(rep(0, pad_days)),
-                              stringsAsFactors=F)
-
-    d1 <- bind_rows(padded_data, d1)
-  }
-  index = which(d1$Cases>0)[1]
-  index1 = which(cumsum(d1$Deaths)>=10)[1] # also 5
-  index2 = index1-30
-
-  print(sprintf("First non-zero cases is on day %d, and 30 days before 10 deaths is day %d",index,index2))
-  d1=d1[index2:nrow(d1),]
-  stan_data$EpidemicStart = c(stan_data$EpidemicStart,index1+1-index2)
-  stan_data$pop = c(stan_data$pop, d1_pop$popt)
-
-
-  for (ii in 1:ncol(covariates1)) {
-    covariate = names(covariates1)[ii]
-    d1[covariate] <- (as.Date(d1$DateRep, format='%d/%m/%Y') >= as.Date(covariates1[1,covariate]))*1  # should this be > or >=?
-  }
-
-  dates[[Country]] = d1$date
-  # hazard estimation
-  N = length(d1$Cases)
-  print(sprintf("%s has %d days of data",Country,N))
-  forecast = N2 - N
-  if(forecast < 0) {
-    print(sprintf("%s: %d", Country, N))
-    print("ERROR!!!! increasing N2")
-    N2 = N
-    forecast = N2 - N
-  }
-
-  # IFR is the overall probability of dying given infection
-  convolution = function(u) (IFR * ecdf.saved(u))
+# Read which countires to use
+countries <- read.csv('data/regions.csv', stringsAsFactors = FALSE)
+# Read deaths data for regions
+d <- read_obs_data(countries)
+# Read ifr 
+ifr.by.country <- read_ifr_data()
 
-  f = rep(0,N2) # f is the probability of dying on day i given infection
-  f[1] = (convolution(1.5) - convolution(0))
-  for(i in 2:N2) {
-    f[i] = (convolution(i+.5) - convolution(i-.5)) 
-  }
-
-  reported_cases[[Country]] = as.vector(as.numeric(d1$Cases))
-  deaths=c(as.vector(as.numeric(d1$Deaths)),rep(-1,forecast))
-  cases=c(as.vector(as.numeric(d1$Cases)),rep(-1,forecast))
-  deaths_by_country[[Country]] = as.vector(as.numeric(d1$Deaths))
-  covariates2 <- as.data.frame(d1[, colnames(covariates1)])
-  # x=1:(N+forecast)
-  covariates2[N:(N+forecast),] <- covariates2[N,]
-
-  ## append data
-  stan_data$N = c(stan_data$N,N)
-  # stan_data$x = cbind(stan_data$x,x)
-  stan_data$covariate1 = cbind(stan_data$covariate1,covariates2[,1])
-  stan_data$covariate2 = cbind(stan_data$covariate2,covariates2[,2])
-  stan_data$covariate3 = cbind(stan_data$covariate3,covariates2[,3])
-  stan_data$covariate4 = cbind(stan_data$covariate4,covariates2[,4])
-  stan_data$covariate5 = cbind(stan_data$covariate5,covariates2[,4])
-  stan_data$covariate6 = cbind(stan_data$covariate6,covariates2[,5])
-  stan_data$f = cbind(stan_data$f,f)
-  stan_data$deaths = cbind(stan_data$deaths,deaths)
-  stan_data$cases = cbind(stan_data$cases,cases)
-
-  stan_data$N2=N2
-  stan_data$x=1:N2
-  if(length(stan_data$N) == 1) {
-    stan_data$N = as.array(stan_data$N)
-  }
-}
+# Read interventions
+interventions <- read_interventions(countries)
 
-# create the `any intervention` covariate
-stan_data$covariate4 = 1*as.data.frame((stan_data$covariate1+
-                                          stan_data$covariate2+
-                                          stan_data$covariate3+
-                                          stan_data$covariate5+
-                                          stan_data$covariate6) >= 1)
+N2 <- 100 # increase if you need more forecast
 
-if(DEBUG) {
-  for(i in 1:length(countries)) {
-    write.csv(
-      data.frame(date=dates[[i]],
-                 `school closure`=stan_data$covariate1[1:stan_data$N[i],i],
-                 `self isolating if ill`=stan_data$covariate2[1:stan_data$N[i],i],
-                 `public events`=stan_data$covariate3[1:stan_data$N[i],i],
-                 `government makes any intervention`=stan_data$covariate4[1:stan_data$N[i],i],
-                 `lockdown`=stan_data$covariate5[1:stan_data$N[i],i],
-                 `social distancing encouraged`=stan_data$covariate6[1:stan_data$N[i],i]),
-      file=sprintf("results/%s-check-dates.csv",countries[i]),row.names=F)
-  }
-}
+processed_data <- process_covariates(countries = countries, interventions = interventions, 
+                                     d = d , ifr.by.country = ifr.by.country, N2 = N2)
+stan_data = processed_data$stan_data
+dates = processed_data$dates
+deaths_by_country = processed_data$deaths_by_country
+reported_cases = processed_data$reported_cases
 
 options(mc.cores = parallel::detectCores())
 rstan_options(auto_write = TRUE)
 m = stan_model(paste0('stan-models/',StanModel,'.stan'))
 
-
 if(DEBUG) {
   fit = sampling(m,data=stan_data,iter=40,warmup=20,chains=2)
 } else if (FULL) {
-  fit = sampling(m,data=stan_data,iter=4000,warmup=2000,chains=4,thin=4,control = list(adapt_delta = 0.95, max_treedepth = 10))
+  fit = sampling(m,data=stan_data,iter=1800,warmup=1000,chains=5,thin=1,control = list(adapt_delta = 0.95, max_treedepth = 15))
 } else { 
-  fit = sampling(m,data=stan_data,iter=200,warmup=100,chains=4,thin=4,control = list(adapt_delta = 0.95, max_treedepth = 10))
-}  
+  fit = sampling(m,data=stan_data,iter=1000,warmup=500,chains=4,thin=1,control = list(adapt_delta = 0.95, max_treedepth = 10))
+}   
 
 out = rstan::extract(fit)
 prediction = out$prediction
@@ -257,9 +89,15 @@ if(JOBID == "")
   JOBID = as.character(abs(round(rnorm(1) * 1000000)))
 print(sprintf("Jobid = %s",JOBID))
 
+countries <- countries$Regions
 save.image(paste0('results/',StanModel,'-',JOBID,'.Rdata'))
+save(fit,prediction,dates,reported_cases,deaths_by_country,countries,estimated.deaths,estimated.deaths.cf,out,file=paste0('results/',StanModel,'-',JOBID,'-stanfit.Rdata'))
 
-save(fit,prediction,dates,reported_cases,deaths_by_country,countries,estimated.deaths,estimated.deaths.cf,out,covariates,file=paste0('results/',StanModel,'-',JOBID,'-stanfit.Rdata'))
+## Ensure that output directories exist
+dir.create("results/", showWarnings = FALSE, recursive = TRUE)
+dir.create("figures/", showWarnings = FALSE, recursive = TRUE)
+dir.create("web/", showWarnings = FALSE, recursive = TRUE)
+dir.create("web/data", showWarnings = FALSE, recursive = TRUE)
 
 library(bayesplot)
 filename <- paste0(StanModel,'-',JOBID)
@@ -298,6 +136,7 @@ if(make_table_error != 0){
   stop(sprintf("Generation of alpha covar table failed! Code: %d", make_table_error))
 }
 
+
 verify_result_error <- system(paste0("Rscript web-verify-output.r ", filename,'.Rdata'),intern=FALSE)
 if(verify_result_error != 0){
   stop(sprintf("Verification of web output failed! Code: %d", verify_result_error))

diff --git a/covariate-size-effects.r b/covariate-size-effects.r
@@ -15,7 +15,7 @@ plot_labels <- c("School Closure",
                  "Self Isolation",
                  "Public Events",
                  "First Intervention",
-                 "Lockdown", 'Social distancing')
+                 "Lockdown", 'Social distancing \n encouraged')
 colnames(alpha) = plot_labels
 first.intervention = alpha[,c(1,2,3,5,6)] + alpha[,4]
 data1 = mcmc_intervals_data(first.intervention,prob=.95,transformation=function(x) 1-exp(-x),point_est="mean")