Skip to content

Commit

Permalink
Merge pull request #16 from seroanalytics/relative_dates
Browse files Browse the repository at this point in the history
Support relative dates and non-numeric pids
  • Loading branch information
hillalex authored Oct 15, 2024
2 parents badf860 + 988e67a commit d74abf2
Show file tree
Hide file tree
Showing 30 changed files with 2,765 additions and 256 deletions.
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
Package: epikinetics
Title: Antibody and Cycle Threshold Kinetics Modelling
Title: Biomarker Kinetics Modelling
Version: 0.0.0.9000
Authors@R: c(person("Timothy", "Russell", email = "[email protected]", role = c("aut")),
person("Alex", "Hill", email = "[email protected]", role = c("ctb", "cre")))
Description: What the package does (one paragraph).
person("Alex", "Hill", email = "[email protected]", role = c("aut", "cre")))
Description: Fit kinetic curves to biomarker data, using a Bayesian hierarchical model
License: GPL (>= 3)
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
Expand All @@ -25,6 +25,7 @@ Additional_repositories:
https://mc-stan.org/r-packages/
SystemRequirements: CmdStan (https://mc-stan.org/users/interfaces/cmdstan)
Suggests:
dplyr,
ggplot2,
knitr,
lubridate,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(add_exposure_data)
export(biokinetics)
export(biokinetics_priors)
export(convert_log_scale_inverse)
Expand Down
97 changes: 50 additions & 47 deletions R/biokinetics.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ biokinetics <- R6::R6Class(
preds_sd = NULL,
data = NULL,
covariate_formula = NULL,
time_type = NULL,
fitted = NULL,
stan_input_data = NULL,
model = NULL,
all_formula_vars = NULL,
design_matrix = NULL,
covariate_lookup_table = NULL,
pid_lookup = NULL,
check_fitted = function() {
if (is.null(private$fitted)) {
stop("Model has not been fitted yet. Call 'fit' before calling this function.")
Expand Down Expand Up @@ -71,19 +71,26 @@ biokinetics <- R6::R6Class(
private$design_matrix,
private$all_formula_vars)
},
build_pid_lookup = function() {
private$pid_lookup <- build_pid_lookup(private$data)
},
recover_covariate_names = function(dt) {
# Declare variables to suppress notes when compiling package
# https://github.com/Rdatatable/data.table/issues/850#issuecomment-259466153
titre_type <- NULL

titre_types <- as.factor(unique(private$data$titre_type))

dt_titre_lookup <- data.table(
k = 1:private$data[, length(unique(titre_type))],
titre_type = private$data[, unique(titre_type)])
k = as.numeric(titre_types),
titre_type = levels(titre_types)
)

dt_out <- dt[dt_titre_lookup, on = "k"][, `:=`(k = NULL)]
if ("p" %in% colnames(dt)) {
dt_out <- dt_out[private$covariate_lookup_table, on = "p", nomatch = NULL][, `:=`(p = NULL)]
}
data.table::setnames(dt_out, "t", "time_since_last_exp", skip_absent=TRUE)
dt_out
},
summarise_pop_fit = function(time_range,
Expand Down Expand Up @@ -155,11 +162,11 @@ biokinetics <- R6::R6Class(
dt_out
},
prepare_stan_data = function() {
pid <- value <- censored <- titre_type_num <- titre_type <- obs_id <- t_since_last_exp <- t_since_min_date <- NULL
pid <- value <- censored <- titre_type_num <- titre_type <- obs_id <- t_since_last_exp <- NULL
stan_data <- list(
N = private$data[, .N],
N_events = private$data[, data.table::uniqueN(pid)],
id = private$data[, pid],
id = private$data[, private$pid_lookup[pid]],
value = private$data[, value],
censored = private$data[, censored],
titre_type = private$data[, titre_type_num],
Expand All @@ -172,12 +179,7 @@ biokinetics <- R6::R6Class(
cens_lo_idx = private$data[censored == -2, obs_id],
cens_hi_idx = private$data[censored == 1, obs_id])

if (private$time_type == "relative") {
stan_data$t <- private$data[, t_since_last_exp]
} else {
stan_data$t <- private$data[, t_since_min_date]
}

stan_data$t <- private$data[, t_since_last_exp]
stan_data$X <- private$design_matrix
stan_data$P <- ncol(private$design_matrix)

Expand Down Expand Up @@ -217,19 +219,15 @@ biokinetics <- R6::R6Class(
#' @param covariate_formula Formula specifying linear regression model. Note all variables in the formula
#' will be treated as categorical variables. Default ~0.
#' @param preds_sd Standard deviation of predictor coefficients. Default 0.25.
#' @param time_type One of 'relative' or 'absolute'. Default 'relative'.
initialize = function(priors = biokinetics_priors(),
data = NULL,
file_path = NULL,
covariate_formula = ~0,
preds_sd = 0.25,
time_type = "relative") {
preds_sd = 0.25) {
validate_priors(priors)
private$priors <- priors
validate_numeric(preds_sd)
private$preds_sd <- preds_sd
validate_time_type(time_type)
private$time_type <- time_type
validate_formula(covariate_formula)
private$covariate_formula <- covariate_formula
private$all_formula_vars <- all.vars(covariate_formula)
Expand All @@ -252,14 +250,14 @@ biokinetics <- R6::R6Class(
logger::log_info("Preparing data for stan")
private$data <- convert_log_scale(private$data, "value")
private$data[, `:=`(titre_type_num = as.numeric(as.factor(titre_type)),
obs_id = seq_len(.N))]
if (time_type == "relative") {
private$data[, t_since_last_exp := as.integer(date - last_exp_date, units = "days")]
} else {
private$data[, t_since_min_date := as.integer(date - min(date), units = "days")]
obs_id = seq_len(.N),
t_since_last_exp = as.integer(day - last_exp_day, units = "days"))]
if (!("censored" %in% colnames(private$data))) {
private$data$censored <- 0
}
private$construct_design_matrix()
private$build_covariate_lookup_table()
private$build_pid_lookup()
private$prepare_stan_data()
logger::log_info("Retrieving compiled model")
private$model <- instantiate::stan_package_model(
Expand Down Expand Up @@ -338,8 +336,15 @@ biokinetics <- R6::R6Class(
logger::log_info("Extracting parameters")
dt_out <- private$extract_parameters(params, n_draws)

data.table::setcolorder(dt_out, c("n", "k", ".draw"))
data.table::setnames(dt_out, c("n", ".draw"), c("pid", "draw"))
data.table::setnames(dt_out, ".draw", "draw")

dt_out[, pid := names(private$pid_lookup)[n]]
if (is.numeric(private$data$pid)) {
dt_out[, pid := as.numeric(pid)]
}
dt_out$n <- NULL

data.table::setcolorder(dt_out, c("pid", "k", "draw"))

if (human_readable_covariates) {
logger::log_info("Recovering covariate names")
Expand All @@ -348,24 +353,21 @@ biokinetics <- R6::R6Class(
dt_out
},
#' @description Process the model results into a data table of titre values over time.
#' @return A data.table containing titre values at time points. If summarise = TRUE, columns are t, me, lo, hi,
#' titre_type, and a column for each covariate in the hierarchical model. If summarise = FALSE, columns are t, .draw
#' t0_pop, tp_pop, ts_pop, m1_pop, m2_pop, m3_pop, beta_t0, beta_tp, beta_ts, beta_m1, beta_m2, beta_m3, mu
#' @return A data.table containing titre values at time points. If summarise = TRUE, columns are time_since_last_exp,
#' me, lo, hi, titre_type, and a column for each covariate in the hierarchical model. If summarise = FALSE, columns are
#' time_since_last_exp, .draw, t0_pop, tp_pop, ts_pop, m1_pop, m2_pop, m3_pop, beta_t0, beta_tp, beta_ts, beta_m1, beta_m2, beta_m3, mu
#' titre_type and a column for each covariate in the hierarchical model. See the data vignette for details:
#' \code{vignette("data", package = "epikinetics")}
#' @param time_type One of 'relative' or 'absolute'. Default 'relative'.
#' @param t_max Integer. Maximum number of time points to include.
#' @param summarise Boolean. Default TRUE. If TRUE, summarises over draws from posterior parameter distributions to
#' return 0.025, 0.5 and 0.975 quantiles, labelled lo, me and hi, respectively. If FALSE returns values for individual
#' draws from posterior parameter distributions.
#' @param n_draws Integer. Maximum number of samples to include. Default 2500.
simulate_population_trajectories = function(
time_type = "relative",
t_max = 150,
summarise = TRUE,
n_draws = 2500) {
private$check_fitted()
validate_time_type(time_type)
validate_numeric(t_max)
validate_logical(summarise)
validate_numeric(n_draws)
Expand All @@ -378,12 +380,6 @@ biokinetics <- R6::R6Class(

dt_out <- private$recover_covariate_names(dt_sum)

if (time_type == "absolute") {
logger::log_info("Converting to absolute time")
dt_out[, date := private$data[, unique(min(date))] + t,
by = c(private$all_formula_vars, "titre_type")]
}

dt_out <- dt_out[
, lapply(.SD, function(x) if (is.factor(x)) forcats::fct_drop(x) else x)]

Expand Down Expand Up @@ -445,14 +441,14 @@ biokinetics <- R6::R6Class(
#' @description Simulate individual trajectories from the model. This is
#' computationally expensive and may take a while to run if n_draws is large.
#' @return A data.table. If summarise = TRUE columns are calendar_date, titre_type, me, lo, hi, time_shift.
#' If summarise = FALSE, columns are pid, draw, t, mu, titre_type, exposure_date, calendar_date, time_shift
#' and a column for each covariate in the hierarchical model. See the data vignette for details:
#' If summarise = FALSE, columns are pid, draw, time_since_last_exp, mu, titre_type, exposure_day, calendar_day, time_shift
#' and a column for each covariate in the regression model. See the data vignette for details:
#' \code{vignette("data", package = "epikinetics")}.
#' @param summarise Boolean. If TRUE, average the individual trajectories to get lo, me and
#' hi values for the population, disaggregated by titre type. If FALSE return the indidivudal trajectories.
#' Default TRUE.
#' @param n_draws Integer. Maximum number of samples to draw. Default 2500.
#' @param time_shift Integer. Number of days to adjust the exposure date by. Default 0.
#' @param time_shift Integer. Number of days to adjust the exposure day by. Default 0.
simulate_individual_trajectories = function(
summarise = TRUE,
n_draws = 2500,
Expand All @@ -470,52 +466,59 @@ biokinetics <- R6::R6Class(
# Calculating the maximum time each individual has data for after the
# exposure of interest
dt_max_dates <- private$data[
, .(t_max = max(t_since_last_exp)), by = .(pid)]
, .(t_max = max(t_since_last_exp)), by = "pid"]

# A very small number of individuals have bleeds on the same day or a few days
# after their recorded exposure dates, resulting in very short trajectories.
# Adding a 50 day buffer to any individuals with less than or equal to 50 days
# of observations after their focal exposure
dt_max_dates <- dt_max_dates[t_max <= 50, t_max := 50, by = .(pid)]
dt_max_dates <- dt_max_dates[t_max <= 50, t_max := 50, by = "pid"]

# Merging the parameter draws with the maximum time data.table
dt_params_ind <- merge(dt_params_ind, dt_max_dates, by = "pid")

dt_params_ind_trim <- dt_params_ind[, .SD[draw %in% 1:n_draws], by = pid]
# convert original pid to numeric pid
dt_params_ind[, pid := private$pid_lookup[pid]]

# Running the C++ code to simulate trajectories for each parameter sample
# for each individual
logger::log_info("Simulating individual trajectories")
dt_params_ind_traj <- biokinetics_simulate_trajectories(dt_params_ind_trim)
dt_params_ind_traj <- biokinetics_simulate_trajectories(dt_params_ind)

dt_params_ind_traj <- data.table::setDT(convert_log_scale_inverse_cpp(
dt_params_ind_traj, vars_to_transform = "mu"))

# convert numeric pid to original pid
dt_params_ind_traj[, pid := names(private$pid_lookup)[pid]]
if (is.numeric(private$data$pid)) {
dt_params_ind_traj[, pid := as.numeric(pid)]
}

logger::log_info("Recovering covariate names")
dt_params_ind_traj <- private$recover_covariate_names(dt_params_ind_traj)

logger::log_info(paste("Calculating exposure dates. Adjusting exposures by", time_shift, "days"))
logger::log_info(paste("Calculating exposure days. Adjusting exposures by", time_shift, "days"))
dt_lookup <- private$data[, .(
exposure_date = min(last_exp_date) - time_shift),
exposure_day = min(last_exp_day) - time_shift),
by = c(private$all_formula_vars, "pid")]

dt_out <- merge(dt_params_ind_traj, dt_lookup, by = "pid")

dt_out[
, calendar_date := exposure_date + t,
, calendar_day := exposure_day + time_since_last_exp,
by = c(private$all_formula_vars, "pid", "titre_type")]

if (summarise) {
logger::log_info("Resampling")
dt_out <- dt_out[
!is.nan(mu), .(pop_mu_sum = mean(mosaic::resample(mu))),
by = c("calendar_date", "draw", "titre_type")]
by = c("calendar_day", "draw", "titre_type")]

logger::log_info("Summarising into population quantiles")
dt_out <- summarise_draws(
dt_out,
column_name = "pop_mu_sum",
by = c("calendar_date", "titre_type"))
by = c("calendar_day", "titre_type"))
}

dt_out[, time_shift := time_shift]
Expand Down
25 changes: 25 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,23 @@ convert_log_scale_inverse <- function(dt_in, vars_to_transform) {
dt_out
}

#' @title Combine titre data and infection data into biokinetics input format.
#'
#' @description The biokinetics class requires a single data table with titre readings
#' and last exposure times for all individuals. If you have exposure times and titre readings for the same
#' set of individuals in separate files, this function will combine them into a single data.table.
#' @return A data.table with all columns required by the biokinetics model.
#' @param dat_sero data.table containing titre values in the format required by biokinetics. See data vignette: \code{vignette("data", package = "epikinetics")}.
#' @param dat_inf data.table containing exposure days and person ids corresponding to those in dat_sero. By default the exposure days are expected in a column called 'day'.
#' @param exposure_column Default 'day'. The name of the column containing exposure days. These can be integers or dates.
#' @export
add_exposure_data <- function(dat_sero, dat_inf, exposure_column = 'day') {
validate_required_cols(dat_sero, required_cols = c("pid", "day", "titre_type", "value"))
validate_required_cols(dat_inf, required_cols = c("pid", exposure_column))
dat_inf[, "last_exp_day" := max(get(exposure_column)), by = "pid"]
merge(dat_sero, dat_inf[, c("pid", "last_exp_day")], by = "pid", allow.cartesian=TRUE)
}

summarise_draws <- function(dt_in, column_name, by = by) {
# Declare variables to suppress notes when compiling package
# https://github.com/Rdatatable/data.table/issues/850#issuecomment-259466153
Expand Down Expand Up @@ -72,3 +89,11 @@ build_covariate_lookup_table <- function(data, design_matrix, all_formula_vars)
dt_out[, p_name := NULL]
dt_out
}

build_pid_lookup <- function(data) {
pids <- unique(data$pid)
ids <- seq_along(pids)
pid_lookup <- ids
names(pid_lookup) <- pids
pid_lookup
}
3 changes: 1 addition & 2 deletions R/validation.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ validate_formula_vars <- function(formula_vars, data) {
}
}

validate_required_cols <- function(dat) {
required_cols <- c("pid", "date", "last_exp_date", "titre_type", "value", "censored")
validate_required_cols <- function(dat, required_cols = c("pid", "day", "last_exp_day", "titre_type", "value")) {
missing_cols <- required_cols[!(required_cols %in% colnames(dat))]
if (length(missing_cols) > 0) {
stop(paste("Missing required columns:",
Expand Down
1 change: 0 additions & 1 deletion R/zzz.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
bin_stan <- file.path(libname, pkgname, "bin", "stan")
source_path <- file.path(libname, pkgname, "src", "stan")
fs::dir_copy(path = source_path, new_path = bin_stan, overwrite = TRUE)
message(fs::dir_ls(bin))
instantiate::stan_package_compile(
models = instantiate::stan_package_model_files(path = bin_stan),
cpp_options = list(stan_threads = TRUE),
Expand Down
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,22 +75,27 @@ a few minutes.

## Testing

Most tests are run with
To run all tests locally:

```{r}
devtools::test()
```

Some tests are skipped on CI to avoid exorbitantly long build times, but this means
it is important to run all tests locally at least once before merging a pull request.

For snapshot testing of stan model outputs, we need the outputs to be exactly
reproducible. As well as setting a seed, this requires the machine environment
to be exactly the same, so we run these inside a Docker container, via a bash script:
to be exactly the same, so on CI we run these inside a Docker container, via a bash script:

```{shell}
./tests/snapshots/test-snapshots
```

This involves recompiling the model, so takes a while to run.

Note that

## Docker
To build a Docker image, run `docker/build`.
To push a new image to Dockerhub, `docker/push`. An image is built and pushed
Expand Down
2 changes: 1 addition & 1 deletion inst/ba2_full.rds
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
pid,date,last_exp_date,titre_type,value,censored,infection_history,last_vax_type,exp_num
pid,day,last_exp_day,titre_type,value,censored,infection_history,last_vax_type,exp_num
1,2022-01-31,2021-12-26,Delta,2560,1,Infection naive,BNT162b2,5
1,2022-01-31,2021-12-26,BA.1,1841.32228,0,Infection naive,BNT162b2,5
1,2022-01-31,2021-12-26,BA.2,786.8526775,0,Infection naive,BNT162b2,5
Expand Down
2 changes: 1 addition & 1 deletion inst/delta_full.rds
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
pid,date,last_exp_date,titre_type,value,censored,infection_history,last_vax_type,exp_num
pid,day,last_exp_day,titre_type,value,censored,infection_history,last_vax_type,exp_num
1,2021-03-10,2021-03-08,Ancestral,175.9349878,0,Infection naive,BNT162b2,2
1,2021-04-15,2021-03-08,Ancestral,607.57499,0,Infection naive,BNT162b2,2
1,2021-07-08,2021-03-08,Ancestral,179.0462942,0,Infection naive,BNT162b2,2
Expand Down
2 changes: 1 addition & 1 deletion inst/xbb_full.rds
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
pid,date,last_exp_date,titre_type,value,censored,infection_history,last_vax_type,exp_num
pid,day,last_exp_day,titre_type,value,censored,infection_history,last_vax_type,exp_num
1,2022-12-07,2022-11-16,BA.5,2560,1,Previously infected (Omicron),BNT162b2+BA1,7
1,2022-12-07,2022-11-16,BQ.1.1,2560,1,Previously infected (Omicron),BNT162b2+BA1,7
1,2022-12-07,2022-11-16,XBB,2560,1,Previously infected (Omicron),BNT162b2+BA1,7
Expand Down
Loading

0 comments on commit d74abf2

Please sign in to comment.