Skip to content

Commit

Permalink
Merge pull request #19 from seroanalytics/log_data
Browse files Browse the repository at this point in the history
Support input data on log scale
  • Loading branch information
hillalex authored Oct 17, 2024
2 parents bc4da25 + 3fe300a commit 8155ae7
Show file tree
Hide file tree
Showing 14 changed files with 129 additions and 42 deletions.
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
export(add_exposure_data)
export(biokinetics)
export(biokinetics_priors)
export(convert_log_scale_inverse)
export(convert_log2_scale_inverse)
importFrom(R6,R6Class)
importFrom(data.table,":=")
importFrom(data.table,.BY)
Expand Down
36 changes: 28 additions & 8 deletions R/biokinetics.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ biokinetics <- R6::R6Class(
"biokinetics",
cloneable = FALSE,
private = list(
scale = NULL,
priors = NULL,
preds_sd = NULL,
data = NULL,
Expand Down Expand Up @@ -212,18 +213,22 @@ biokinetics <- R6::R6Class(
#' @param covariate_formula Formula specifying linear regression model. Note all variables in the formula
#' will be treated as categorical variables. Default ~0.
#' @param preds_sd Standard deviation of predictor coefficients. Default 0.25.
#' @param scale One of "log" or "natural". Default "natural". Is provided data on a log or a natural scale? If on a natural scale it
#' will be converted to a log scale for model fitting.
initialize = function(priors = biokinetics_priors(),
data = NULL,
file_path = NULL,
covariate_formula = ~0,
preds_sd = 0.25) {
preds_sd = 0.25,
scale = "natural") {
validate_priors(priors)
private$priors <- priors
validate_numeric(preds_sd)
private$preds_sd <- preds_sd
validate_formula(covariate_formula)
private$covariate_formula <- covariate_formula
private$all_formula_vars <- all.vars(covariate_formula)
private$scale <- scale
if (is.null(data) && is.null(file_path)) {
stop("One of 'data' or 'file_path' must be provided")
}
Expand All @@ -241,7 +246,9 @@ biokinetics <- R6::R6Class(
validate_required_cols(private$data)
validate_formula_vars(private$all_formula_vars, private$data)
logger::log_info("Preparing data for stan")
private$data <- convert_log_scale(private$data, "value")
if (scale == "natural") {
private$data <- convert_log2_scale(private$data, "value")
}
private$data[, `:=`(obs_id = seq_len(.N),
t_since_last_exp = as.integer(day - last_exp_day, units = "days"))]
if (!("censored" %in% colnames(private$data))) {
Expand All @@ -263,6 +270,11 @@ biokinetics <- R6::R6Class(
get_stan_data = function() {
private$stan_input_data
},
#' @description View the mapping of human readable covariate names to the model variable p.
#' @return A data.table mapping the model variable p to human readable covariates.
get_covariate_lookup_table = function() {
private$covariate_lookup_table
},
#' @description Fit the model and return CmdStanMCMC fitted model object.
#' @return A CmdStanMCMC fitted model object: <https://mc-stan.org/cmdstanr/reference/CmdStanMCMC.html>
#' @param ... Named arguments to the `sample()` method of CmdStan model.
Expand Down Expand Up @@ -376,11 +388,15 @@ biokinetics <- R6::R6Class(
dt_out <- dt_out[
, lapply(.SD, function(x) if (is.factor(x)) forcats::fct_drop(x) else x)]

if (private$scale == "log") {
return(dt_out)
}

if (summarise) {
dt_out <- convert_log_scale_inverse(
dt_out <- convert_log2_scale_inverse(
dt_out, vars_to_transform = c("me", "lo", "hi"))
} else {
dt_out <- convert_log_scale_inverse(
dt_out <- convert_log2_scale_inverse(
dt_out, vars_to_transform = "mu")
}
dt_out
Expand Down Expand Up @@ -416,8 +432,10 @@ biokinetics <- R6::R6Class(
logger::log_info("Recovering covariate names")
dt_peak_switch <- private$recover_covariate_names(dt_peak_switch)

dt_peak_switch <- convert_log_scale_inverse(
dt_peak_switch, vars_to_transform = c("mu_0", "mu_p", "mu_s"))
if (private$scale == "natural") {
dt_peak_switch <- convert_log2_scale_inverse(
dt_peak_switch, vars_to_transform = c("mu_0", "mu_p", "mu_s"))
}

logger::log_info("Calculating medians")
dt_peak_switch[
Expand Down Expand Up @@ -478,8 +496,10 @@ biokinetics <- R6::R6Class(
logger::log_info("Simulating individual trajectories")
dt_params_ind_traj <- biokinetics_simulate_trajectories(dt_params_ind)

dt_params_ind_traj <- data.table::setDT(convert_log_scale_inverse_cpp(
dt_params_ind_traj, vars_to_transform = "mu"))
if (private$scale == "natural") {
dt_params_ind_traj <- data.table::setDT(convert_log2_scale_inverse_cpp(
dt_params_ind_traj, vars_to_transform = "mu"))
}

# convert numeric pid to original pid
dt_params_ind_traj[, pid := names(private$pid_lookup)[pid]]
Expand Down
4 changes: 2 additions & 2 deletions R/cpp11.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Generated by cpp11: do not edit by hand

convert_log_scale_inverse_cpp <- function(dt, vars_to_transform) {
.Call(`_epikinetics_convert_log_scale_inverse_cpp`, dt, vars_to_transform)
convert_log2_scale_inverse_cpp <- function(dt, vars_to_transform) {
.Call(`_epikinetics_convert_log2_scale_inverse_cpp`, dt, vars_to_transform)
}

simulate_trajectories_cpp <- function(person_params) {
Expand Down
10 changes: 5 additions & 5 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
convert_log_scale <- function(
convert_log2_scale <- function(
dt_in, vars_to_transform = "titre",
simplify_limits = TRUE) {

Expand All @@ -14,14 +14,14 @@ convert_log_scale <- function(

#' @title Invert base 2 log scale conversion
#'
#' @description User provided data is converted to a base 2 log scale before model fitting. This
#' function reverses that transformation. This function does not modify the provided data.table in-place,
#' but returns a transformed copy.
#' @description Natural scale data is converted to a base 2 log scale before model fitting. This
#' function reverses that transformation and may be useful if working directly with fitted parameters.
#' This function does not modify the provided data.table in-place, but returns a transformed copy.
#' @return A data.table, identical to the input data but with specified columns transformed.
#' @param dt_in data.table containing data to be transformed from base 2 log to natural scale.
#' @param vars_to_transform Names of columns to apply the transformation to.
#' @export
convert_log_scale_inverse <- function(dt_in, vars_to_transform) {
convert_log2_scale_inverse <- function(dt_in, vars_to_transform) {
dt_out <- data.table::copy(dt_in)
for (var in vars_to_transform) {
# Reverse the log2 transformation and multiplication by 5.
Expand Down
20 changes: 19 additions & 1 deletion man/biokinetics.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/convert_log_scale_inverse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
using namespace cpp11;

[[cpp11::register]]
cpp11::data_frame convert_log_scale_inverse_cpp(cpp11::writable::list dt,
cpp11::data_frame convert_log2_scale_inverse_cpp(cpp11::writable::list dt,
cpp11::strings vars_to_transform) {

for (int i = 0; i < vars_to_transform.size(); i++) {
Expand Down
10 changes: 5 additions & 5 deletions src/cpp11.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
#include <R_ext/Visibility.h>

// convert_log_scale_inverse.cpp
cpp11::data_frame convert_log_scale_inverse_cpp(cpp11::writable::list dt, cpp11::strings vars_to_transform);
extern "C" SEXP _epikinetics_convert_log_scale_inverse_cpp(SEXP dt, SEXP vars_to_transform) {
cpp11::data_frame convert_log2_scale_inverse_cpp(cpp11::writable::list dt, cpp11::strings vars_to_transform);
extern "C" SEXP _epikinetics_convert_log2_scale_inverse_cpp(SEXP dt, SEXP vars_to_transform) {
BEGIN_CPP11
return cpp11::as_sexp(convert_log_scale_inverse_cpp(cpp11::as_cpp<cpp11::decay_t<cpp11::writable::list>>(dt), cpp11::as_cpp<cpp11::decay_t<cpp11::strings>>(vars_to_transform)));
return cpp11::as_sexp(convert_log2_scale_inverse_cpp(cpp11::as_cpp<cpp11::decay_t<cpp11::writable::list>>(dt), cpp11::as_cpp<cpp11::decay_t<cpp11::strings>>(vars_to_transform)));
END_CPP11
}
// simulate_trajectories.cpp
Expand All @@ -22,8 +22,8 @@ extern "C" SEXP _epikinetics_simulate_trajectories_cpp(SEXP person_params) {

extern "C" {
static const R_CallMethodDef CallEntries[] = {
{"_epikinetics_convert_log_scale_inverse_cpp", (DL_FUNC) &_epikinetics_convert_log_scale_inverse_cpp, 2},
{"_epikinetics_simulate_trajectories_cpp", (DL_FUNC) &_epikinetics_simulate_trajectories_cpp, 1},
{"_epikinetics_convert_log2_scale_inverse_cpp", (DL_FUNC) &_epikinetics_convert_log2_scale_inverse_cpp, 2},
{"_epikinetics_simulate_trajectories_cpp", (DL_FUNC) &_epikinetics_simulate_trajectories_cpp, 1},
{NULL, NULL, 0}
};
}
Expand Down
2 changes: 1 addition & 1 deletion src/stan/antibody_kinetics_main.stan
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ data {
array[N] int<lower=1, upper=K> titre_type; // Titre type for each observation
vector[N] t; // Time for each observation
vector[N] value; // Observed titre values
array[N] int<lower=-2, upper=1> censored; // Censoring indicator: -2 for lo, -1 for me 1 for upper, 0 for none
array[N] int<lower=-1, upper=1> censored; // Censoring indicator: -1 for lo, 1 for upper, 0 for none

// Indices for different censoring scenarios
int N_uncens; // number of uncensored observations
Expand Down
8 changes: 4 additions & 4 deletions tests/testthat/test-convert-log-scale.R
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
test_that("Can convert to and from log scale in R", {
inputs <- data.table::fread(test_path("testdata", "testdata.csv"))
log_inputs <- convert_log_scale(inputs, "me", simplify_limits = FALSE)
unlog_inputs <- convert_log_scale_inverse(log_inputs, "me")
log_inputs <- convert_log2_scale(inputs, "me", simplify_limits = FALSE)
unlog_inputs <- convert_log2_scale_inverse(log_inputs, "me")

expect_equal(inputs, unlog_inputs)
})

test_that("Can convert from log scale in R", {
inputs <- data.table::fread(test_path("testdata", "testdata.csv"))
res <- convert_log_scale_inverse(
res <- convert_log2_scale_inverse(
inputs, vars_to_transform = c("me", "lo"))

expect_equal(res$me, 5 * 2^inputs$me)
Expand All @@ -19,7 +19,7 @@ test_that("Can convert from log scale in R", {

test_that("Can convert from log scale in Cpp", {
inputs <- data.table::fread(test_path("testdata", "testdata.csv"))
rescpp <- convert_log_scale_inverse_cpp(
rescpp <- convert_log2_scale_inverse_cpp(
inputs, vars_to_transform = c("me", "lo"))

expect_equal(rescpp$me, 5 * 2^(inputs$me))
Expand Down
14 changes: 14 additions & 0 deletions tests/testthat/test-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,17 @@ test_that("Can handle non-numeric pids", {
stan_data <- mod$get_stan_data()
expect_equivalent(stan_data$id, ids)
})

test_that("Natural scale data is converted to log scale for stan", {
dat <- data.table::fread(system.file("delta_full.rds", package = "epikinetics"))
mod <- biokinetics$new(data = dat)
stan_data <- mod$get_stan_data()
expect_equivalent(stan_data$value, convert_log2_scale(dat, "value")$value)
})

test_that("Log scale data is passed directly to stan", {
dat <- data.table::fread(system.file("delta_full.rds", package = "epikinetics"))
mod <- biokinetics$new(data = dat, scale = "log")
stan_data <- mod$get_stan_data()
expect_equivalent(stan_data$value, dat$value)
})
16 changes: 16 additions & 0 deletions tests/testthat/test-simulate-individual-trajectories.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,19 @@ test_that("Exposure dates are brought forward by time_shift days", {
expect_equal(trajectories$mu, trajectories_shifted$mu)
expect_true(all(as.numeric(difftime(trajectories$exposure_date, trajectories_shifted$exposure_date, units = "days")) == 75))
})

test_that("Natural scale data is returned on natural scale", {
mod <- biokinetics$new(file_path = system.file("delta_full.rds", package = "epikinetics"),
covariate_formula = ~0 + infection_history, scale = "natural")
mod$fit()
trajectories <- mod$simulate_individual_trajectories(summarise = TRUE, n_draws = 10)
expect_false(all(trajectories$me < 10))
})

test_that("Log scale data is returned on log scale", {
mod <- biokinetics$new(file_path = system.file("delta_full.rds", package = "epikinetics"),
covariate_formula = ~0 + infection_history, scale = "log")
mod$fit()
trajectories <- mod$simulate_individual_trajectories(summarise = TRUE, n_draws = 10)
expect_true(all(trajectories$me < 10))
})
16 changes: 16 additions & 0 deletions tests/testthat/test-simulate-population-trajectories.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,19 @@ test_that("Only times up to t_max are returned", {
trajectories <- mod$simulate_population_trajectories(summarise = TRUE, t_max = 10)
expect_true(all(trajectories$t <= 10))
})

test_that("Natural scale data is returned on natural scale", {
mod <- biokinetics$new(file_path = system.file("delta_full.rds", package = "epikinetics"),
covariate_formula = ~0 + infection_history, scale = "natural")
mod$fit()
trajectories <- mod$simulate_population_trajectories(summarise = TRUE, t_max = 10)
expect_false(all(trajectories$me < 10))
})

test_that("Log scale data is returned on log scale", {
mod <- biokinetics$new(file_path = system.file("delta_full.rds", package = "epikinetics"),
covariate_formula = ~0 + infection_history, scale = "log")
mod$fit()
trajectories <- mod$simulate_population_trajectories(summarise = TRUE, t_max = 10)
expect_true(all(trajectories$me < 10))
})
19 changes: 11 additions & 8 deletions vignettes/data.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,22 @@ knitr::opts_chunk$set(
The model requires time series data about individual titre readings, along with last exposure times. Times can be relative (e.g. day of study) or absolute (i.e. precise calendar dates).
The full list of required columns is as follows:

| name | type | description | required |
|--------------|----------------------|-----------------------------------------------------------------------------------------------------------------| -------- |
| pid | numeric or character | Unique identifier to identify a person across observations | T|
| day | integer or date | The day of the observation. Can be a date or an integer representing a relative day of study | T|
| last_exp_day | integer or date | The most recent day on which the person was exposed. Must be of the same type as the 'day' column | T|
| titre_type | character | Name of the titre or biomarker | T|
| value | numeric | Titre value | T|
| name | type | description | required |
|--------------|----------------------|--------------------------------------------------------------------------------------------| -------- |
| pid | numeric or character | Unique identifier to identify a person across observations | T|
| day | integer or date | The day of the observation. Can be a date or an integer representing a relative day of study | T|
| last_exp_day | integer or date | The most recent day on which the person was exposed. Must be of the same type as the 'day' column | T|
| titre_type | character | Name of the titre or biomarker | T|
| value | numeric | Titre value | T|
| censored | -1, 0 or 1 | Optional column. Whether this observation should be treated as censored: -1 for lower, 1 for upper, 0 for none. | F|

It can also contain further columns for any covariates to be included in the model. The data files installed with this package have additional columns infection_history, last_vax_type, and exp_num.

The model also accepts a covariate formula to define the regression model. The variables in the formula must correspond to column names in the dataset. Note that all variables will be treated as **categorical variables**; that is, converted to factors regardless of their input type.

Note also that the `value` column is assumed to be on a natural scale by default, and will be converted to a log scale for model fitting. If your data
is already on a log scale, you must pass the `log=TRUE` argument when initialising the biokinetics class. See [biokinetics](../reference/biokinetics.html).

## Example

```{r}
Expand All @@ -45,7 +48,7 @@ After fitting a model, a [CmdStanMCMC](https://mc-stan.org/cmdstanr/reference/Cm
that users who are already familiar with `cmdstanr` are free to do what they want with the fitted model.

**Important!**
**The data provided to `biokinetics$new` is converted to a base2 log scale before inference is performed. This means that if working
**If you provide data on a natural scale, it will be converted to a base2 log scale before inference is performed. This means that if working
directly with the fitted `CmdStanMCMC` all values will be on this scale. The package provides a helper function for converting
back to the original scale: [convert_log_scale_inverse](../reference/convert_log_scale_inverse.html).**

Expand Down

0 comments on commit 8155ae7

Please sign in to comment.