Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support relative dates and non-numeric pids #16

Merged
merged 11 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Additional_repositories:
https://mc-stan.org/r-packages/
SystemRequirements: CmdStan (https://mc-stan.org/users/interfaces/cmdstan)
Suggests:
dplyr,
ggplot2,
knitr,
lubridate,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(add_exposure_data)
export(biokinetics)
export(biokinetics_priors)
export(convert_log_scale_inverse)
Expand Down
97 changes: 50 additions & 47 deletions R/biokinetics.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ biokinetics <- R6::R6Class(
preds_sd = NULL,
data = NULL,
covariate_formula = NULL,
time_type = NULL,
fitted = NULL,
stan_input_data = NULL,
model = NULL,
all_formula_vars = NULL,
design_matrix = NULL,
covariate_lookup_table = NULL,
pid_lookup = NULL,
check_fitted = function() {
if (is.null(private$fitted)) {
stop("Model has not been fitted yet. Call 'fit' before calling this function.")
Expand Down Expand Up @@ -71,19 +71,26 @@ biokinetics <- R6::R6Class(
private$design_matrix,
private$all_formula_vars)
},
build_pid_lookup = function() {
private$pid_lookup <- build_pid_lookup(private$data)
},
recover_covariate_names = function(dt) {
# Declare variables to suppress notes when compiling package
# https://github.com/Rdatatable/data.table/issues/850#issuecomment-259466153
titre_type <- NULL

titre_types <- as.factor(unique(private$data$titre_type))

dt_titre_lookup <- data.table(
k = 1:private$data[, length(unique(titre_type))],
titre_type = private$data[, unique(titre_type)])
k = as.numeric(titre_types),
titre_type = levels(titre_types)
)

dt_out <- dt[dt_titre_lookup, on = "k"][, `:=`(k = NULL)]
if ("p" %in% colnames(dt)) {
dt_out <- dt_out[private$covariate_lookup_table, on = "p", nomatch = NULL][, `:=`(p = NULL)]
}
data.table::setnames(dt_out, "t", "time_since_last_exp", skip_absent=TRUE)
dt_out
},
summarise_pop_fit = function(time_range,
Expand Down Expand Up @@ -155,11 +162,11 @@ biokinetics <- R6::R6Class(
dt_out
},
prepare_stan_data = function() {
pid <- value <- censored <- titre_type_num <- titre_type <- obs_id <- t_since_last_exp <- t_since_min_date <- NULL
pid <- value <- censored <- titre_type_num <- titre_type <- obs_id <- t_since_last_exp <- NULL
stan_data <- list(
N = private$data[, .N],
N_events = private$data[, data.table::uniqueN(pid)],
id = private$data[, pid],
id = private$data[, private$pid_lookup[pid]],
value = private$data[, value],
censored = private$data[, censored],
titre_type = private$data[, titre_type_num],
Expand All @@ -172,12 +179,7 @@ biokinetics <- R6::R6Class(
cens_lo_idx = private$data[censored == -2, obs_id],
cens_hi_idx = private$data[censored == 1, obs_id])

if (private$time_type == "relative") {
stan_data$t <- private$data[, t_since_last_exp]
} else {
stan_data$t <- private$data[, t_since_min_date]
}

stan_data$t <- private$data[, t_since_last_exp]
stan_data$X <- private$design_matrix
stan_data$P <- ncol(private$design_matrix)

Expand Down Expand Up @@ -217,19 +219,15 @@ biokinetics <- R6::R6Class(
#' @param covariate_formula Formula specifying linear regression model. Note all variables in the formula
#' will be treated as categorical variables. Default ~0.
#' @param preds_sd Standard deviation of predictor coefficients. Default 0.25.
#' @param time_type One of 'relative' or 'absolute'. Default 'relative'.
initialize = function(priors = biokinetics_priors(),
data = NULL,
file_path = NULL,
covariate_formula = ~0,
preds_sd = 0.25,
time_type = "relative") {
preds_sd = 0.25) {
validate_priors(priors)
private$priors <- priors
validate_numeric(preds_sd)
private$preds_sd <- preds_sd
validate_time_type(time_type)
private$time_type <- time_type
validate_formula(covariate_formula)
private$covariate_formula <- covariate_formula
private$all_formula_vars <- all.vars(covariate_formula)
Expand All @@ -252,14 +250,14 @@ biokinetics <- R6::R6Class(
logger::log_info("Preparing data for stan")
private$data <- convert_log_scale(private$data, "value")
private$data[, `:=`(titre_type_num = as.numeric(as.factor(titre_type)),
obs_id = seq_len(.N))]
if (time_type == "relative") {
private$data[, t_since_last_exp := as.integer(date - last_exp_date, units = "days")]
} else {
private$data[, t_since_min_date := as.integer(date - min(date), units = "days")]
obs_id = seq_len(.N),
t_since_last_exp = as.integer(day - last_exp_day, units = "days"))]
if (!("censored" %in% colnames(private$data))) {
private$data$censored <- 0
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also make censored an optional column (default to all uncensored)

}
private$construct_design_matrix()
private$build_covariate_lookup_table()
private$build_pid_lookup()
private$prepare_stan_data()
logger::log_info("Retrieving compiled model")
private$model <- instantiate::stan_package_model(
Expand Down Expand Up @@ -338,8 +336,15 @@ biokinetics <- R6::R6Class(
logger::log_info("Extracting parameters")
dt_out <- private$extract_parameters(params, n_draws)

data.table::setcolorder(dt_out, c("n", "k", ".draw"))
data.table::setnames(dt_out, c("n", ".draw"), c("pid", "draw"))
data.table::setnames(dt_out, ".draw", "draw")

dt_out[, pid := names(private$pid_lookup)[n]]
if (is.numeric(private$data$pid)) {
dt_out[, pid := as.numeric(pid)]
}
dt_out$n <- NULL

data.table::setcolorder(dt_out, c("pid", "k", "draw"))

if (human_readable_covariates) {
logger::log_info("Recovering covariate names")
Expand All @@ -348,24 +353,21 @@ biokinetics <- R6::R6Class(
dt_out
},
#' @description Process the model results into a data table of titre values over time.
#' @return A data.table containing titre values at time points. If summarise = TRUE, columns are t, me, lo, hi,
#' titre_type, and a column for each covariate in the hierarchical model. If summarise = FALSE, columns are t, .draw
#' t0_pop, tp_pop, ts_pop, m1_pop, m2_pop, m3_pop, beta_t0, beta_tp, beta_ts, beta_m1, beta_m2, beta_m3, mu
#' @return A data.table containing titre values at time points. If summarise = TRUE, columns are time_since_last_exp,
#' me, lo, hi, titre_type, and a column for each covariate in the hierarchical model. If summarise = FALSE, columns are
#' time_since_last_exp, .draw, t0_pop, tp_pop, ts_pop, m1_pop, m2_pop, m3_pop, beta_t0, beta_tp, beta_ts, beta_m1, beta_m2, beta_m3, mu
#' titre_type and a column for each covariate in the hierarchical model. See the data vignette for details:
#' \code{vignette("data", package = "epikinetics")}
#' @param time_type One of 'relative' or 'absolute'. Default 'relative'.
#' @param t_max Integer. Maximum number of time points to include.
#' @param summarise Boolean. Default TRUE. If TRUE, summarises over draws from posterior parameter distributions to
#' return 0.025, 0.5 and 0.975 quantiles, labelled lo, me and hi, respectively. If FALSE returns values for individual
#' draws from posterior parameter distributions.
#' @param n_draws Integer. Maximum number of samples to include. Default 2500.
simulate_population_trajectories = function(
time_type = "relative",
t_max = 150,
summarise = TRUE,
n_draws = 2500) {
private$check_fitted()
validate_time_type(time_type)
validate_numeric(t_max)
validate_logical(summarise)
validate_numeric(n_draws)
Expand All @@ -378,12 +380,6 @@ biokinetics <- R6::R6Class(

dt_out <- private$recover_covariate_names(dt_sum)

if (time_type == "absolute") {
logger::log_info("Converting to absolute time")
dt_out[, date := private$data[, unique(min(date))] + t,
by = c(private$all_formula_vars, "titre_type")]
}

dt_out <- dt_out[
, lapply(.SD, function(x) if (is.factor(x)) forcats::fct_drop(x) else x)]

Expand Down Expand Up @@ -445,14 +441,14 @@ biokinetics <- R6::R6Class(
#' @description Simulate individual trajectories from the model. This is
#' computationally expensive and may take a while to run if n_draws is large.
#' @return A data.table. If summarise = TRUE columns are calendar_date, titre_type, me, lo, hi, time_shift.
#' If summarise = FALSE, columns are pid, draw, t, mu, titre_type, exposure_date, calendar_date, time_shift
#' and a column for each covariate in the hierarchical model. See the data vignette for details:
#' If summarise = FALSE, columns are pid, draw, time_since_last_exp, mu, titre_type, exposure_day, calendar_day, time_shift
#' and a column for each covariate in the regression model. See the data vignette for details:
#' \code{vignette("data", package = "epikinetics")}.
#' @param summarise Boolean. If TRUE, average the individual trajectories to get lo, me and
#' hi values for the population, disaggregated by titre type. If FALSE return the indidivudal trajectories.
#' Default TRUE.
#' @param n_draws Integer. Maximum number of samples to draw. Default 2500.
#' @param time_shift Integer. Number of days to adjust the exposure date by. Default 0.
#' @param time_shift Integer. Number of days to adjust the exposure day by. Default 0.
simulate_individual_trajectories = function(
summarise = TRUE,
n_draws = 2500,
Expand All @@ -470,52 +466,59 @@ biokinetics <- R6::R6Class(
# Calculating the maximum time each individual has data for after the
# exposure of interest
dt_max_dates <- private$data[
, .(t_max = max(t_since_last_exp)), by = .(pid)]
, .(t_max = max(t_since_last_exp)), by = "pid"]

# A very small number of individuals have bleeds on the same day or a few days
# after their recorded exposure dates, resulting in very short trajectories.
# Adding a 50 day buffer to any individuals with less than or equal to 50 days
# of observations after their focal exposure
dt_max_dates <- dt_max_dates[t_max <= 50, t_max := 50, by = .(pid)]
dt_max_dates <- dt_max_dates[t_max <= 50, t_max := 50, by = "pid"]

# Merging the parameter draws with the maximum time data.table
dt_params_ind <- merge(dt_params_ind, dt_max_dates, by = "pid")

dt_params_ind_trim <- dt_params_ind[, .SD[draw %in% 1:n_draws], by = pid]
# convert original pid to numeric pid
dt_params_ind[, pid := private$pid_lookup[pid]]

# Running the C++ code to simulate trajectories for each parameter sample
# for each individual
logger::log_info("Simulating individual trajectories")
dt_params_ind_traj <- biokinetics_simulate_trajectories(dt_params_ind_trim)
dt_params_ind_traj <- biokinetics_simulate_trajectories(dt_params_ind)

dt_params_ind_traj <- data.table::setDT(convert_log_scale_inverse_cpp(
dt_params_ind_traj, vars_to_transform = "mu"))

# convert numeric pid to original pid
dt_params_ind_traj[, pid := names(private$pid_lookup)[pid]]
if (is.numeric(private$data$pid)) {
dt_params_ind_traj[, pid := as.numeric(pid)]
}

logger::log_info("Recovering covariate names")
dt_params_ind_traj <- private$recover_covariate_names(dt_params_ind_traj)

logger::log_info(paste("Calculating exposure dates. Adjusting exposures by", time_shift, "days"))
logger::log_info(paste("Calculating exposure days. Adjusting exposures by", time_shift, "days"))
dt_lookup <- private$data[, .(
exposure_date = min(last_exp_date) - time_shift),
exposure_day = min(last_exp_day) - time_shift),
by = c(private$all_formula_vars, "pid")]

dt_out <- merge(dt_params_ind_traj, dt_lookup, by = "pid")

dt_out[
, calendar_date := exposure_date + t,
, calendar_day := exposure_day + time_since_last_exp,
by = c(private$all_formula_vars, "pid", "titre_type")]

if (summarise) {
logger::log_info("Resampling")
dt_out <- dt_out[
!is.nan(mu), .(pop_mu_sum = mean(mosaic::resample(mu))),
by = c("calendar_date", "draw", "titre_type")]
by = c("calendar_day", "draw", "titre_type")]

logger::log_info("Summarising into population quantiles")
dt_out <- summarise_draws(
dt_out,
column_name = "pop_mu_sum",
by = c("calendar_date", "titre_type"))
by = c("calendar_day", "titre_type"))
}

dt_out[, time_shift := time_shift]
Expand Down
25 changes: 25 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,23 @@ convert_log_scale_inverse <- function(dt_in, vars_to_transform) {
dt_out
}

#' @title Combine titre data and infection data into biokinetics input format.
#'
#' @description The biokinetics class requires a single data table with titre readings
#' and last exposure times for all individuals. If you have exposure times and titre readings for the same
#' set of individuals in separate files, this function will combine them into a single data.table.
#' @return A data.table with all columns required by the biokinetics model.
#' @param dat_sero data.table containing titre values in the format required by biokinetics. See data vignette: \code{vignette("data", package = "epikinetics")}.
#' @param dat_inf data.table containing exposure days and person ids corresponding to those in dat_sero. By default the exposure days are expected in a column called 'day'.
#' @param exposure_column Default 'day'. The name of the column containing exposure days. These can be integers or dates.
#' @export
add_exposure_data <- function(dat_sero, dat_inf, exposure_column = 'day') {
validate_required_cols(dat_sero, required_cols = c("pid", "day", "titre_type", "value"))
validate_required_cols(dat_inf, required_cols = c("pid", exposure_column))
dat_inf[, "last_exp_day" := max(get(exposure_column)), by = "pid"]
merge(dat_sero, dat_inf[, c("pid", "last_exp_day")], by = "pid", allow.cartesian=TRUE)
}

summarise_draws <- function(dt_in, column_name, by = by) {
# Declare variables to suppress notes when compiling package
# https://github.com/Rdatatable/data.table/issues/850#issuecomment-259466153
Expand Down Expand Up @@ -72,3 +89,11 @@ build_covariate_lookup_table <- function(data, design_matrix, all_formula_vars)
dt_out[, p_name := NULL]
dt_out
}

build_pid_lookup <- function(data) {
pids <- unique(data$pid)
ids <- seq_along(pids)
pid_lookup <- ids
names(pid_lookup) <- pids
pid_lookup
}
3 changes: 1 addition & 2 deletions R/validation.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ validate_formula_vars <- function(formula_vars, data) {
}
}

validate_required_cols <- function(dat) {
required_cols <- c("pid", "date", "last_exp_date", "titre_type", "value", "censored")
validate_required_cols <- function(dat, required_cols = c("pid", "day", "last_exp_day", "titre_type", "value")) {
missing_cols <- required_cols[!(required_cols %in% colnames(dat))]
if (length(missing_cols) > 0) {
stop(paste("Missing required columns:",
Expand Down
1 change: 0 additions & 1 deletion R/zzz.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
bin_stan <- file.path(libname, pkgname, "bin", "stan")
source_path <- file.path(libname, pkgname, "src", "stan")
fs::dir_copy(path = source_path, new_path = bin_stan, overwrite = TRUE)
message(fs::dir_ls(bin))
instantiate::stan_package_compile(
models = instantiate::stan_package_model_files(path = bin_stan),
cpp_options = list(stan_threads = TRUE),
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ devtools::test()

For snapshot testing of stan model outputs, we need the outputs to be exactly
reproducible. As well as setting a seed, this requires the machine environment
to be exactly the same, so we run these inside a Docker container, via a bash script:
to be exactly the same, so on CI we run these inside a Docker container, via a bash script:

```{shell}
./tests/snapshots/test-snapshots
Expand Down
2 changes: 1 addition & 1 deletion inst/ba2_full.rds
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
pid,date,last_exp_date,titre_type,value,censored,infection_history,last_vax_type,exp_num
pid,day,last_exp_day,titre_type,value,censored,infection_history,last_vax_type,exp_num
1,2022-01-31,2021-12-26,Delta,2560,1,Infection naive,BNT162b2,5
1,2022-01-31,2021-12-26,BA.1,1841.32228,0,Infection naive,BNT162b2,5
1,2022-01-31,2021-12-26,BA.2,786.8526775,0,Infection naive,BNT162b2,5
Expand Down
2 changes: 1 addition & 1 deletion inst/delta_full.rds
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
pid,date,last_exp_date,titre_type,value,censored,infection_history,last_vax_type,exp_num
pid,day,last_exp_day,titre_type,value,censored,infection_history,last_vax_type,exp_num
1,2021-03-10,2021-03-08,Ancestral,175.9349878,0,Infection naive,BNT162b2,2
1,2021-04-15,2021-03-08,Ancestral,607.57499,0,Infection naive,BNT162b2,2
1,2021-07-08,2021-03-08,Ancestral,179.0462942,0,Infection naive,BNT162b2,2
Expand Down
2 changes: 1 addition & 1 deletion inst/xbb_full.rds
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
pid,date,last_exp_date,titre_type,value,censored,infection_history,last_vax_type,exp_num
pid,day,last_exp_day,titre_type,value,censored,infection_history,last_vax_type,exp_num
1,2022-12-07,2022-11-16,BA.5,2560,1,Previously infected (Omicron),BNT162b2+BA1,7
1,2022-12-07,2022-11-16,BQ.1.1,2560,1,Previously infected (Omicron),BNT162b2+BA1,7
1,2022-12-07,2022-11-16,XBB,2560,1,Previously infected (Omicron),BNT162b2+BA1,7
Expand Down
23 changes: 23 additions & 0 deletions man/add_exposure_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading