seroanalytics · hillalex · Oct 15, 2024 · Oct 7, 2024 · Oct 7, 2024 · Oct 7, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -25,6 +25,7 @@ Additional_repositories:
     https://mc-stan.org/r-packages/
 SystemRequirements: CmdStan (https://mc-stan.org/users/interfaces/cmdstan)
 Suggests:
+    dplyr,
     ggplot2,
     knitr,
     lubridate,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export(add_exposure_data)
 export(biokinetics)
 export(biokinetics_priors)
 export(convert_log_scale_inverse)

diff --git a/R/biokinetics.R b/R/biokinetics.R
@@ -12,13 +12,13 @@ biokinetics <- R6::R6Class(
     preds_sd = NULL,
     data = NULL,
     covariate_formula = NULL,
-    time_type = NULL,
     fitted = NULL,
     stan_input_data = NULL,
     model = NULL,
     all_formula_vars = NULL,
     design_matrix = NULL,
     covariate_lookup_table = NULL,
+    pid_lookup = NULL,
     check_fitted = function() {
       if (is.null(private$fitted)) {
         stop("Model has not been fitted yet. Call 'fit' before calling this function.")
@@ -71,19 +71,26 @@ biokinetics <- R6::R6Class(
                                                                      private$design_matrix,
                                                                      private$all_formula_vars)
     },
+    build_pid_lookup = function() {
+      private$pid_lookup <- build_pid_lookup(private$data)
+    },
     recover_covariate_names = function(dt) {
       # Declare variables to suppress notes when compiling package
       # https://github.com/Rdatatable/data.table/issues/850#issuecomment-259466153
       titre_type <- NULL
 
+      titre_types <- as.factor(unique(private$data$titre_type))
+
       dt_titre_lookup <- data.table(
-        k = 1:private$data[, length(unique(titre_type))],
-        titre_type = private$data[, unique(titre_type)])
+        k = as.numeric(titre_types),
+        titre_type = levels(titre_types)
+      )
 
       dt_out <- dt[dt_titre_lookup, on = "k"][, `:=`(k = NULL)]
       if ("p" %in% colnames(dt)) {
         dt_out <- dt_out[private$covariate_lookup_table, on = "p", nomatch = NULL][, `:=`(p = NULL)]
       }
+      data.table::setnames(dt_out, "t", "time_since_last_exp", skip_absent=TRUE)
       dt_out
     },
     summarise_pop_fit = function(time_range,
@@ -155,11 +162,11 @@ biokinetics <- R6::R6Class(
       dt_out
     },
     prepare_stan_data = function() {
-      pid <- value <- censored <- titre_type_num <- titre_type <- obs_id <- t_since_last_exp <- t_since_min_date <- NULL
+      pid <- value <- censored <- titre_type_num <- titre_type <- obs_id <- t_since_last_exp <- NULL
       stan_data <- list(
         N = private$data[, .N],
         N_events = private$data[, data.table::uniqueN(pid)],
-        id = private$data[, pid],
+        id = private$data[, private$pid_lookup[pid]],
         value = private$data[, value],
         censored = private$data[, censored],
         titre_type = private$data[, titre_type_num],
@@ -172,12 +179,7 @@ biokinetics <- R6::R6Class(
         cens_lo_idx = private$data[censored == -2, obs_id],
         cens_hi_idx = private$data[censored == 1, obs_id])
 
-      if (private$time_type == "relative") {
-        stan_data$t <- private$data[, t_since_last_exp]
-      } else {
-        stan_data$t <- private$data[, t_since_min_date]
-      }
-
+      stan_data$t <- private$data[, t_since_last_exp]
       stan_data$X <- private$design_matrix
       stan_data$P <- ncol(private$design_matrix)
 
@@ -217,19 +219,15 @@ biokinetics <- R6::R6Class(
     #' @param covariate_formula Formula specifying linear regression model. Note all variables in the formula
     #' will be treated as categorical variables. Default ~0.
     #' @param preds_sd Standard deviation of predictor coefficients. Default 0.25.
-    #' @param time_type One of 'relative' or 'absolute'. Default 'relative'.
     initialize = function(priors = biokinetics_priors(),
                           data = NULL,
                           file_path = NULL,
                           covariate_formula = ~0,
-                          preds_sd = 0.25,
-                          time_type = "relative") {
+                          preds_sd = 0.25) {
       validate_priors(priors)
       private$priors <- priors
       validate_numeric(preds_sd)
       private$preds_sd <- preds_sd
-      validate_time_type(time_type)
-      private$time_type <- time_type
       validate_formula(covariate_formula)
       private$covariate_formula <- covariate_formula
       private$all_formula_vars <- all.vars(covariate_formula)
@@ -252,14 +250,14 @@ biokinetics <- R6::R6Class(
       logger::log_info("Preparing data for stan")
       private$data <- convert_log_scale(private$data, "value")
       private$data[, `:=`(titre_type_num = as.numeric(as.factor(titre_type)),
-                          obs_id = seq_len(.N))]
-      if (time_type == "relative") {
-        private$data[, t_since_last_exp := as.integer(date - last_exp_date, units = "days")]
-      } else {
-        private$data[, t_since_min_date := as.integer(date - min(date), units = "days")]
+                          obs_id = seq_len(.N),
+                          t_since_last_exp = as.integer(day - last_exp_day, units = "days"))]
+      if (!("censored" %in% colnames(private$data))) {
+        private$data$censored <- 0
       }
       private$construct_design_matrix()
       private$build_covariate_lookup_table()
+      private$build_pid_lookup()
       private$prepare_stan_data()
       logger::log_info("Retrieving compiled model")
       private$model <- instantiate::stan_package_model(
@@ -338,8 +336,15 @@ biokinetics <- R6::R6Class(
       logger::log_info("Extracting parameters")
       dt_out <- private$extract_parameters(params, n_draws)
 
-      data.table::setcolorder(dt_out, c("n", "k", ".draw"))
-      data.table::setnames(dt_out, c("n", ".draw"), c("pid", "draw"))
+      data.table::setnames(dt_out, ".draw", "draw")
+
+      dt_out[, pid := names(private$pid_lookup)[n]]
+      if (is.numeric(private$data$pid)) {
+        dt_out[, pid := as.numeric(pid)]
+      }
+      dt_out$n <- NULL
+
+      data.table::setcolorder(dt_out, c("pid", "k", "draw"))
 
       if (human_readable_covariates) {
         logger::log_info("Recovering covariate names")
@@ -348,24 +353,21 @@ biokinetics <- R6::R6Class(
       dt_out
     },
     #' @description Process the model results into a data table of titre values over time.
-    #' @return A data.table containing titre values at time points. If summarise = TRUE, columns are t, me, lo, hi,
-    #' titre_type, and a column for each covariate in the hierarchical model. If summarise = FALSE, columns are t, .draw
-    #' t0_pop, tp_pop, ts_pop, m1_pop, m2_pop, m3_pop, beta_t0, beta_tp, beta_ts, beta_m1, beta_m2, beta_m3, mu
+    #' @return A data.table containing titre values at time points. If summarise = TRUE, columns are time_since_last_exp,
+    #' me, lo, hi, titre_type, and a column for each covariate in the hierarchical model. If summarise = FALSE, columns are
+    #' time_since_last_exp, .draw, t0_pop, tp_pop, ts_pop, m1_pop, m2_pop, m3_pop, beta_t0, beta_tp, beta_ts, beta_m1, beta_m2, beta_m3, mu
     #' titre_type and a column for each covariate in the hierarchical model. See the data vignette for details:
     #' \code{vignette("data", package = "epikinetics")}
-    #' @param time_type One of 'relative' or 'absolute'. Default 'relative'.
     #' @param t_max Integer. Maximum number of time points to include.
     #' @param summarise Boolean. Default TRUE. If TRUE, summarises over draws from posterior parameter distributions to
     #' return 0.025, 0.5 and 0.975 quantiles, labelled lo, me and hi, respectively. If FALSE returns values for individual
     #' draws from posterior parameter distributions.
     #' @param n_draws Integer. Maximum number of samples to include. Default 2500.
     simulate_population_trajectories = function(
-      time_type = "relative",
       t_max = 150,
       summarise = TRUE,
       n_draws = 2500) {
       private$check_fitted()
-      validate_time_type(time_type)
       validate_numeric(t_max)
       validate_logical(summarise)
       validate_numeric(n_draws)
@@ -378,12 +380,6 @@ biokinetics <- R6::R6Class(
 
       dt_out <- private$recover_covariate_names(dt_sum)
 
-      if (time_type == "absolute") {
-        logger::log_info("Converting to absolute time")
-        dt_out[, date := private$data[, unique(min(date))] + t,
-                 by = c(private$all_formula_vars, "titre_type")]
-      }
-
       dt_out <- dt_out[
         , lapply(.SD, function(x) if (is.factor(x)) forcats::fct_drop(x) else x)]
 
@@ -445,14 +441,14 @@ biokinetics <- R6::R6Class(
     #' @description Simulate individual trajectories from the model. This is
     #' computationally expensive and may take a while to run if n_draws is large.
     #' @return A data.table. If summarise = TRUE columns are calendar_date, titre_type, me, lo, hi, time_shift.
-    #' If summarise = FALSE, columns are pid, draw, t, mu, titre_type, exposure_date, calendar_date, time_shift
-    #' and a column for each covariate in the hierarchical model. See the data vignette for details:
+    #' If summarise = FALSE, columns are pid, draw, time_since_last_exp, mu, titre_type, exposure_day, calendar_day, time_shift
+    #' and a column for each covariate in the regression model. See the data vignette for details:
     #' \code{vignette("data", package = "epikinetics")}.
     #' @param summarise Boolean. If TRUE, average the individual trajectories to get lo, me and
     #' hi values for the population, disaggregated by titre type. If FALSE return the indidivudal trajectories.
     #' Default TRUE.
     #' @param n_draws Integer. Maximum number of samples to draw. Default 2500.
-    #' @param time_shift Integer. Number of days to adjust the exposure date by. Default 0.
+    #' @param time_shift Integer. Number of days to adjust the exposure day by. Default 0.
     simulate_individual_trajectories = function(
       summarise = TRUE,
       n_draws = 2500,
@@ -470,52 +466,59 @@ biokinetics <- R6::R6Class(
       # Calculating the maximum time each individual has data for after the
       # exposure of interest
       dt_max_dates <- private$data[
-        , .(t_max = max(t_since_last_exp)), by = .(pid)]
+        , .(t_max = max(t_since_last_exp)), by = "pid"]
 
       # A very small number of individuals have bleeds on the same day or a few days
       # after their recorded exposure dates, resulting in very short trajectories.
       # Adding a 50 day buffer to any individuals with less than or equal to 50 days
       # of observations after their focal exposure
-      dt_max_dates <- dt_max_dates[t_max <= 50, t_max := 50, by = .(pid)]
+      dt_max_dates <- dt_max_dates[t_max <= 50, t_max := 50, by = "pid"]
 
       # Merging the parameter draws with the maximum time data.table
       dt_params_ind <- merge(dt_params_ind, dt_max_dates, by = "pid")
 
-      dt_params_ind_trim <- dt_params_ind[, .SD[draw %in% 1:n_draws], by = pid]
+      # convert original pid to numeric pid
+      dt_params_ind[, pid := private$pid_lookup[pid]]
 
       # Running the C++ code to simulate trajectories for each parameter sample
       # for each individual
       logger::log_info("Simulating individual trajectories")
-      dt_params_ind_traj <- biokinetics_simulate_trajectories(dt_params_ind_trim)
+      dt_params_ind_traj <- biokinetics_simulate_trajectories(dt_params_ind)
 
       dt_params_ind_traj <- data.table::setDT(convert_log_scale_inverse_cpp(
         dt_params_ind_traj, vars_to_transform = "mu"))
 
+      # convert numeric pid to original pid
+      dt_params_ind_traj[, pid := names(private$pid_lookup)[pid]]
+      if (is.numeric(private$data$pid)) {
+        dt_params_ind_traj[, pid := as.numeric(pid)]
+      }
+
       logger::log_info("Recovering covariate names")
       dt_params_ind_traj <- private$recover_covariate_names(dt_params_ind_traj)
 
-      logger::log_info(paste("Calculating exposure dates. Adjusting exposures by", time_shift, "days"))
+      logger::log_info(paste("Calculating exposure days. Adjusting exposures by", time_shift, "days"))
       dt_lookup <- private$data[, .(
-        exposure_date = min(last_exp_date) - time_shift),
+        exposure_day = min(last_exp_day) - time_shift),
                                   by = c(private$all_formula_vars, "pid")]
 
       dt_out <- merge(dt_params_ind_traj, dt_lookup, by = "pid")
 
       dt_out[
-        , calendar_date := exposure_date + t,
+        , calendar_day := exposure_day + time_since_last_exp,
           by = c(private$all_formula_vars, "pid", "titre_type")]
 
       if (summarise) {
         logger::log_info("Resampling")
         dt_out <- dt_out[
           !is.nan(mu), .(pop_mu_sum = mean(mosaic::resample(mu))),
-          by = c("calendar_date", "draw", "titre_type")]
+          by = c("calendar_day", "draw", "titre_type")]
 
         logger::log_info("Summarising into population quantiles")
         dt_out <- summarise_draws(
           dt_out,
           column_name = "pop_mu_sum",
-          by = c("calendar_date", "titre_type"))
+          by = c("calendar_day", "titre_type"))
       }
 
       dt_out[, time_shift := time_shift]

diff --git a/R/utils.R b/R/utils.R
@@ -30,6 +30,23 @@ convert_log_scale_inverse <- function(dt_in, vars_to_transform) {
   dt_out
 }
 
+#' @title Combine titre data and infection data into biokinetics input format.
+#'
+#' @description The biokinetics class requires a single data table with titre readings
+#' and last exposure times for all individuals. If you have exposure times and titre readings for the same
+#' set of individuals in separate files, this function will combine them into a single data.table.
+#' @return A data.table with all columns required by the biokinetics model.
+#' @param dat_sero data.table containing titre values in the format required by biokinetics. See data vignette: \code{vignette("data", package = "epikinetics")}.
+#' @param dat_inf data.table containing exposure days and person ids corresponding to those in dat_sero. By default the exposure days are expected in a column called 'day'.
+#' @param exposure_column Default 'day'. The name of the column containing exposure days. These can be integers or dates.
+#' @export
+add_exposure_data <- function(dat_sero, dat_inf, exposure_column = 'day') {
+  validate_required_cols(dat_sero, required_cols = c("pid", "day", "titre_type", "value"))
+  validate_required_cols(dat_inf, required_cols = c("pid", exposure_column))
+  dat_inf[, "last_exp_day" := max(get(exposure_column)), by  = "pid"]
+  merge(dat_sero, dat_inf[, c("pid", "last_exp_day")], by = "pid", allow.cartesian=TRUE)
+}
+
 summarise_draws <- function(dt_in, column_name, by = by) {
   # Declare variables to suppress notes when compiling package
   # https://github.com/Rdatatable/data.table/issues/850#issuecomment-259466153
@@ -72,3 +89,11 @@ build_covariate_lookup_table <- function(data, design_matrix, all_formula_vars)
   dt_out[, p_name := NULL]
   dt_out
 }
+
+build_pid_lookup <- function(data) {
+  pids <- unique(data$pid)
+  ids <- seq_along(pids)
+  pid_lookup <- ids
+  names(pid_lookup) <- pids
+  pid_lookup
+}
diff --git a/R/validation.R b/R/validation.R
@@ -30,8 +30,7 @@ validate_formula_vars <- function(formula_vars, data) {
   }
 }
 
-validate_required_cols <- function(dat) {
-  required_cols <- c("pid", "date", "last_exp_date", "titre_type", "value", "censored")
+validate_required_cols <- function(dat, required_cols = c("pid", "day", "last_exp_day", "titre_type", "value")) {
   missing_cols <- required_cols[!(required_cols %in% colnames(dat))]
   if (length(missing_cols) > 0) {
     stop(paste("Missing required columns:",

diff --git a/R/zzz.R b/R/zzz.R
@@ -32,7 +32,6 @@
     bin_stan <- file.path(libname, pkgname, "bin", "stan")
     source_path <- file.path(libname, pkgname, "src", "stan")
     fs::dir_copy(path = source_path, new_path = bin_stan, overwrite = TRUE)
-    message(fs::dir_ls(bin))
     instantiate::stan_package_compile(
       models = instantiate::stan_package_model_files(path = bin_stan),
       cpp_options = list(stan_threads = TRUE),

diff --git a/README.md b/README.md
@@ -83,7 +83,7 @@ devtools::test()
 
 For snapshot testing of stan model outputs, we need the outputs to be exactly 
 reproducible. As well as setting a seed, this requires the machine environment 
-to be exactly the same, so we run these inside a Docker container, via a bash script:
+to be exactly the same, so on CI we run these inside a Docker container, via a bash script:
 
 ```{shell}
 ./tests/snapshots/test-snapshots

diff --git a/inst/ba2_full.rds b/inst/ba2_full.rds
@@ -1,4 +1,4 @@
-pid,date,last_exp_date,titre_type,value,censored,infection_history,last_vax_type,exp_num
+pid,day,last_exp_day,titre_type,value,censored,infection_history,last_vax_type,exp_num
 1,2022-01-31,2021-12-26,Delta,2560,1,Infection naive,BNT162b2,5
 1,2022-01-31,2021-12-26,BA.1,1841.32228,0,Infection naive,BNT162b2,5
 1,2022-01-31,2021-12-26,BA.2,786.8526775,0,Infection naive,BNT162b2,5

diff --git a/inst/delta_full.rds b/inst/delta_full.rds
@@ -1,4 +1,4 @@
-pid,date,last_exp_date,titre_type,value,censored,infection_history,last_vax_type,exp_num
+pid,day,last_exp_day,titre_type,value,censored,infection_history,last_vax_type,exp_num
 1,2021-03-10,2021-03-08,Ancestral,175.9349878,0,Infection naive,BNT162b2,2
 1,2021-04-15,2021-03-08,Ancestral,607.57499,0,Infection naive,BNT162b2,2
 1,2021-07-08,2021-03-08,Ancestral,179.0462942,0,Infection naive,BNT162b2,2

diff --git a/inst/xbb_full.rds b/inst/xbb_full.rds
@@ -1,4 +1,4 @@
-pid,date,last_exp_date,titre_type,value,censored,infection_history,last_vax_type,exp_num
+pid,day,last_exp_day,titre_type,value,censored,infection_history,last_vax_type,exp_num
 1,2022-12-07,2022-11-16,BA.5,2560,1,Previously infected (Omicron),BNT162b2+BA1,7
 1,2022-12-07,2022-11-16,BQ.1.1,2560,1,Previously infected (Omicron),BNT162b2+BA1,7
 1,2022-12-07,2022-11-16,XBB,2560,1,Previously infected (Omicron),BNT162b2+BA1,7

diff --git a/man/add_exposure_data.Rd b/man/add_exposure_data.Rd