From 75c60f5476c70e78f1726c198bcd6940defad380 Mon Sep 17 00:00:00 2001 From: "alex.hill@gmail.com" Date: Thu, 17 Oct 2024 13:53:01 +0100 Subject: [PATCH] support log data --- R/biokinetics.R | 35 +++++++++++++++++++++------- src/stan/antibody_kinetics_main.stan | 2 +- vignettes/data.Rmd | 19 ++++++++------- 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/R/biokinetics.R b/R/biokinetics.R index 041fa4b..2afbad5 100644 --- a/R/biokinetics.R +++ b/R/biokinetics.R @@ -8,6 +8,7 @@ biokinetics <- R6::R6Class( "biokinetics", cloneable = FALSE, private = list( + scale = NULL, priors = NULL, preds_sd = NULL, data = NULL, @@ -55,7 +56,7 @@ biokinetics <- R6::R6Class( }, construct_design_matrix = function() { var <- pid <- NULL - dt_design_matrix <- private$data[, .SD, .SDcols = private$all_formula_vars, by = pid] |> + dt_design_matrix <- as.character(private$data[, .SD, .SDcols = private$all_formula_vars, by = pid]) |> unique() # Build the full design matrix using model.matrix @@ -211,12 +212,14 @@ biokinetics <- R6::R6Class( #' @param priors Object of type \link[epikinetics]{biokinetics_priors}. Default biokinetics_priors(). #' @param covariate_formula Formula specifying linear regression model. Note all variables in the formula #' will be treated as categorical variables. Default ~0. - #' @param preds_sd Standard deviation of predictor coefficients. Default 0.25. + #' @param scale One of "log" or "natural". Default "natural". Is provided data on a log or a natural scale? If on a natural scale it + #' will be converted to a log scale for model fitting. initialize = function(priors = biokinetics_priors(), data = NULL, file_path = NULL, covariate_formula = ~0, - preds_sd = 0.25) { + preds_sd = 0.25, + scale = "natural") { validate_priors(priors) private$priors <- priors validate_numeric(preds_sd) @@ -224,6 +227,7 @@ biokinetics <- R6::R6Class( validate_formula(covariate_formula) private$covariate_formula <- covariate_formula private$all_formula_vars <- all.vars(covariate_formula) + private$scale <- scale if (is.null(data) && is.null(file_path)) { stop("One of 'data' or 'file_path' must be provided") } @@ -241,7 +245,9 @@ biokinetics <- R6::R6Class( validate_required_cols(private$data) validate_formula_vars(private$all_formula_vars, private$data) logger::log_info("Preparing data for stan") - private$data <- convert_log_scale(private$data, "value") + if (scale == "natural") { + private$data <- convert_log_scale(private$data, "value") + } private$data[, `:=`(obs_id = seq_len(.N), t_since_last_exp = as.integer(day - last_exp_day, units = "days"))] if (!("censored" %in% colnames(private$data))) { @@ -263,6 +269,9 @@ biokinetics <- R6::R6Class( get_stan_data = function() { private$stan_input_data }, + get_covariate_lookup_table = function() { + private$covariate_lookup_table + }, #' @description Fit the model and return CmdStanMCMC fitted model object. #' @return A CmdStanMCMC fitted model object: #' @param ... Named arguments to the `sample()` method of CmdStan model. @@ -376,6 +385,10 @@ biokinetics <- R6::R6Class( dt_out <- dt_out[ , lapply(.SD, function(x) if (is.factor(x)) forcats::fct_drop(x) else x)] + if (private$scale == "log") { + return(dt_out) + } + if (summarise) { dt_out <- convert_log_scale_inverse( dt_out, vars_to_transform = c("me", "lo", "hi")) @@ -416,8 +429,10 @@ biokinetics <- R6::R6Class( logger::log_info("Recovering covariate names") dt_peak_switch <- private$recover_covariate_names(dt_peak_switch) - dt_peak_switch <- convert_log_scale_inverse( - dt_peak_switch, vars_to_transform = c("mu_0", "mu_p", "mu_s")) + if (private$scale == "natural") { + dt_peak_switch <- convert_log_scale_inverse( + dt_peak_switch, vars_to_transform = c("mu_0", "mu_p", "mu_s")) + } logger::log_info("Calculating medians") dt_peak_switch[ @@ -476,10 +491,12 @@ biokinetics <- R6::R6Class( # Running the C++ code to simulate trajectories for each parameter sample # for each individual logger::log_info("Simulating individual trajectories") - dt_params_ind_traj <- biokinetics_simulate_trajectories(dt_params_ind) + dt_params_ind_traj <- data.table::setDT(biokinetics_simulate_trajectories(dt_params_ind)) - dt_params_ind_traj <- data.table::setDT(convert_log_scale_inverse_cpp( - dt_params_ind_traj, vars_to_transform = "mu")) + if (private$scale == "natural") { + dt_params_ind_traj <- convert_log_scale_inverse_cpp( + dt_params_ind_traj, vars_to_transform = "mu") + } # convert numeric pid to original pid dt_params_ind_traj[, pid := names(private$pid_lookup)[pid]] diff --git a/src/stan/antibody_kinetics_main.stan b/src/stan/antibody_kinetics_main.stan index 390acda..808fc88 100644 --- a/src/stan/antibody_kinetics_main.stan +++ b/src/stan/antibody_kinetics_main.stan @@ -32,7 +32,7 @@ data { array[N] int titre_type; // Titre type for each observation vector[N] t; // Time for each observation vector[N] value; // Observed titre values - array[N] int censored; // Censoring indicator: -2 for lo, -1 for me 1 for upper, 0 for none + array[N] int censored; // Censoring indicator: -1 for lo, 1 for upper, 0 for none // Indices for different censoring scenarios int N_uncens; // number of uncensored observations diff --git a/vignettes/data.Rmd b/vignettes/data.Rmd index e3a5320..3b03a51 100644 --- a/vignettes/data.Rmd +++ b/vignettes/data.Rmd @@ -19,19 +19,22 @@ knitr::opts_chunk$set( The model requires time series data about individual titre readings, along with last exposure times. Times can be relative (e.g. day of study) or absolute (i.e. precise calendar dates). The full list of required columns is as follows: -| name | type | description | required | -|--------------|----------------------|-----------------------------------------------------------------------------------------------------------------| -------- | -| pid | numeric or character | Unique identifier to identify a person across observations | T| -| day | integer or date | The day of the observation. Can be a date or an integer representing a relative day of study | T| -| last_exp_day | integer or date | The most recent day on which the person was exposed. Must be of the same type as the 'day' column | T| -| titre_type | character | Name of the titre or biomarker | T| -| value | numeric | Titre value | T| +| name | type | description | required | +|--------------|----------------------|--------------------------------------------------------------------------------------------| -------- | +| pid | numeric or character | Unique identifier to identify a person across observations | T| +| day | integer or date | The day of the observation. Can be a date or an integer representing a relative day of study | T| +| last_exp_day | integer or date | The most recent day on which the person was exposed. Must be of the same type as the 'day' column | T| +| titre_type | character | Name of the titre or biomarker | T| +| value | numeric | Titre value | T| | censored | -1, 0 or 1 | Optional column. Whether this observation should be treated as censored: -1 for lower, 1 for upper, 0 for none. | F| It can also contain further columns for any covariates to be included in the model. The data files installed with this package have additional columns infection_history, last_vax_type, and exp_num. The model also accepts a covariate formula to define the regression model. The variables in the formula must correspond to column names in the dataset. Note that all variables will be treated as **categorical variables**; that is, converted to factors regardless of their input type. +Note also that the `value` column is assumed to be on a natural scale by default, and will be converted to a log scale for model fitting. If your data +is already on a log scale, you must pass the `log=TRUE` argument when initialising the biokinetics class. See [biokinetics](../reference/biokinetics.html). + ## Example ```{r} @@ -45,7 +48,7 @@ After fitting a model, a [CmdStanMCMC](https://mc-stan.org/cmdstanr/reference/Cm that users who are already familiar with `cmdstanr` are free to do what they want with the fitted model. **Important!** -**The data provided to `biokinetics$new` is converted to a base2 log scale before inference is performed. This means that if working +**If you provide data on a natural scale, it will be converted to a base2 log scale before inference is performed. This means that if working directly with the fitted `CmdStanMCMC` all values will be on this scale. The package provides a helper function for converting back to the original scale: [convert_log_scale_inverse](../reference/convert_log_scale_inverse.html).**