optionally truncate values

seroanalytics · Nov 9, 2024 · 0cbe811 · 0cbe811
1 parent 93ad627
commit 0cbe811
Show file tree

Hide file tree

Showing 4 changed files with 100 additions and 82 deletions.
diff --git a/R/biokinetics.R b/R/biokinetics.R
@@ -237,31 +237,35 @@ biokinetics <- R6::R6Class(
     }
   ),
   public = list(
-        #' @description Initialise the kinetics model.
-        #' @return An epikinetics::biokinetics object.
-        #' @param data Optional data table of model inputs. One of data or file must be provided. See the data vignette
-        #' for required columns: \code{vignette("data", package = "epikinetics")}.
-        #' @param file_path Optional file path to model inputs in CSV format. One of data or file must be provided.
-        #' @param priors Object of type \link[epikinetics]{biokinetics_priors}. Default biokinetics_priors().
-        #' @param covariate_formula Formula specifying linear regression model. Note all variables in the formula
-        #' will be treated as categorical variables. Default ~0.
-        #' @param preds_sd Standard deviation of predictor coefficients. Default 0.25.
-        #' @param scale One of "log" or "natural". Default "natural". Is provided data on a log or a natural scale? If on a natural scale it
-        #' will be converted to a log scale for model fitting.
-        #' @param upper_detection_limit Optional upper detection limit of the titre used. This is needed to construct a likelihood for upper censored
-        #' values, so only needs to be provided if you have such values in the dataset. If not provided, the model will default
-        #' to using the largest value in the dataset as the upper detection limit.
-        #' @param lower_detection_limit Optional lower detection limit of the titre used. This is needed to construct a likelihood for lower censored
-        #' values, so only needs to be provided if you have such values in the dataset. If not provided, the model will default
-        #' to using the smallest value in the dataset as the lower detection limit.
+    #' @description Initialise the kinetics model.
+    #' @return An epikinetics::biokinetics object.
+    #' @param data Optional data table of model inputs. One of data or file must be provided. See the data vignette
+    #' for required columns: \code{vignette("data", package = "epikinetics")}.
+    #' @param file_path Optional file path to model inputs in CSV format. One of data or file must be provided.
+    #' @param priors Object of type \link[epikinetics]{biokinetics_priors}. Default biokinetics_priors().
+    #' @param covariate_formula Formula specifying linear regression model. Note all variables in the formula
+    #' will be treated as categorical variables. Default ~0.
+    #' @param preds_sd Standard deviation of predictor coefficients. Default 0.25.
+    #' @param scale One of "log" or "natural". Default "natural". Is provided data on a log or a natural scale? If on a natural scale it
+    #' will be converted to a log scale for model fitting.
+    #' @param upper_detection_limit Optional upper detection limit of the titre used. This is needed to construct a likelihood for upper censored
+    #' values, so only needs to be provided if you have such values in the dataset. If not provided, the model will default
+    #' to using the largest value in the dataset as the upper detection limit.
+    #' @param lower_detection_limit Optional lower detection limit of the titre used. This is needed to construct a likelihood for lower censored
+    #' values, so only needs to be provided if you have such values in the dataset. If not provided, the model will default
+    #' to using the smallest value in the dataset as the lower detection limit.
+    #' @param truncate_upper Logical. Whether values greater than the upper detection limit should be truncated, i.e. set to the same value as the limit. Default TRUE.
+    #' @param truncate_lower Logical. Whether values smaller than the lower detection limit should be truncated, i.e. set to the same value as the limit. Default TRUE.
     initialize = function(priors = biokinetics_priors(),
                           data = NULL,
                           file_path = NULL,
                           covariate_formula = ~0,
                           preds_sd = 0.25,
                           scale = "natural",
                           upper_detection_limit = NULL,
-                          lower_detection_limit = NULL) {
+                          lower_detection_limit = NULL,
+                          truncate_upper = TRUE,
+                          truncate_lower = TRUE) {
       validate_priors(priors)
       private$priors <- priors
       validate_numeric(preds_sd)
@@ -287,19 +291,24 @@ biokinetics <- R6::R6Class(
       validate_required_cols(private$data)
       validate_formula_vars(private$all_formula_vars, private$data)
       logger::log_info("Preparing data for stan")
-      # this is used to scale the data if natural scale data is provided
-      private$smallest_value <- min(private$data$value)
       private$get_upper_detection_limit(upper_detection_limit)
       private$get_lower_detection_limit(lower_detection_limit)
+      if (truncate_upper) {
+        private$data[, value := ifelse(value > private$upper_detection_limit, private$upper_detection_limit, value)]
+      }
+      if (truncate_lower) {
+        private$data[, value := ifelse(value < private$lower_detection_limit, private$lower_detection_limit, value)]
+      }
       private$data[, `:=`(obs_id = seq_len(.N),
                           time_since_last_exp = as.integer(day - last_exp_day, units = "days"),
-                          censored_lo = value <= private$lower_detection_limit,
-                          censored_hi = value >= private$upper_detection_limit)]
+                          censored_lo = value == private$lower_detection_limit,
+                          censored_hi = value == private$upper_detection_limit)]
       private$construct_design_matrix()
       private$build_covariate_lookup_table()
       private$build_pid_lookup()
       private$build_titre_type_lookup()
       if (scale == "natural") {
+        private$smallest_value <- min(private$data$value)
         private$data <- convert_log2_scale(private$data,
                                            smallest_value = private$smallest_value,
                                            vars_to_transform = "value")
@@ -311,12 +320,12 @@ biokinetics <- R6::R6Class(
         package = "epikinetics"
       )
     },
-        #' @description Plot the kinetics trajectory predicted by the model priors.
-        #' Note that this is on a log scale, regardless of whether the data was provided
-        #' on a log or a natural scale.
-        #' @return A ggplot2 object.
-        #' @param tmax Integer. The number of time points in each simulated trajectory. Default 150.
-        #' @param n_draws Integer. The number of trajectories to simulate. Default 2000.
+    #' @description Plot the kinetics trajectory predicted by the model priors.
+    #' Note that this is on a log scale, regardless of whether the data was provided
+    #' on a log or a natural scale.
+    #' @return A ggplot2 object.
+    #' @param tmax Integer. The number of time points in each simulated trajectory. Default 150.
+    #' @param n_draws Integer. The number of trajectories to simulate. Default 2000.
     plot_prior_predictive = function(tmax = 150,
                                      n_draws = 2000) {
       plot(private$priors,
@@ -326,41 +335,41 @@ biokinetics <- R6::R6Class(
            upper_detection_limit = private$stan_input_data$upper_detection_limit,
            lower_detection_limit = private$stan_input_data$lower_detection_limit)
     },
-        #' @description Plot model input data with a smoothing function. Note that
-        #' this plot is of the data as provided to the Stan model so is on a log scale,
-        #' regardless of whether data was provided on a log or a natural scale.
-        #' @param tmax Integer. Maximum time since last exposure to include. Default 150.
-        #' @return A ggplot2 object.
+    #' @description Plot model input data with a smoothing function. Note that
+    #' this plot is of the data as provided to the Stan model so is on a log scale,
+    #' regardless of whether data was provided on a log or a natural scale.
+    #' @param tmax Integer. Maximum time since last exposure to include. Default 150.
+    #' @return A ggplot2 object.
     plot_model_inputs = function(tmax = 150) {
       plot_sero_data(private$data,
                      tmax = tmax,
                      covariates = private$all_formula_vars,
                      upper_detection_limit = private$stan_input_data$upper_detection_limit,
                      lower_detection_limit = private$stan_input_data$lower_detection_limit)
     },
-        #' @description View the data that is passed to the stan model, for debugging purposes.
-        #' @return A list of arguments that will be passed to the stan model.
+    #' @description View the data that is passed to the stan model, for debugging purposes.
+    #' @return A list of arguments that will be passed to the stan model.
     get_stan_data = function() {
       private$stan_input_data
     },
-        #' @description View the mapping of human readable covariate names to the model variable p.
-        #' @return A data.table mapping the model variable p to human readable covariates.
+    #' @description View the mapping of human readable covariate names to the model variable p.
+    #' @return A data.table mapping the model variable p to human readable covariates.
     get_covariate_lookup_table = function() {
       private$covariate_lookup_table
     },
-        #' @description Fit the model and return CmdStanMCMC fitted model object.
-        #' @return A CmdStanMCMC fitted model object: <https://mc-stan.org/cmdstanr/reference/CmdStanMCMC.html>
-        #' @param ... Named arguments to the `sample()` method of CmdStan model.
-        #'   objects: <https://mc-stan.org/cmdstanr/reference/model-method-sample.html>
+    #' @description Fit the model and return CmdStanMCMC fitted model object.
+    #' @return A CmdStanMCMC fitted model object: <https://mc-stan.org/cmdstanr/reference/CmdStanMCMC.html>
+    #' @param ... Named arguments to the `sample()` method of CmdStan model.
+    #'   objects: <https://mc-stan.org/cmdstanr/reference/model-method-sample.html>
     fit = function(...) {
       logger::log_info("Fitting model")
       private$fitted <- private$model$sample(private$stan_input_data, ...)
       private$fitted
     },
-        #' @description Extract fitted population parameters
-        #' @return A data.table
-        #' @param n_draws Integer. Default 2000.
-        #' @param human_readable_covariates Logical. Default TRUE.
+    #' @description Extract fitted population parameters
+    #' @return A data.table
+    #' @param n_draws Integer. Default 2000.
+    #' @param human_readable_covariates Logical. Default TRUE.
     extract_population_parameters = function(n_draws = 2000,
                                              human_readable_covariates = TRUE) {
       private$check_fitted()
@@ -393,11 +402,11 @@ biokinetics <- R6::R6Class(
       }
       dt_out
     },
-        #' @description Extract fitted individual parameters
-        #' @return A data.table
-        #' @param n_draws Integer. Default 2000.
-        #' @param include_variation_params Logical
-        #' @param human_readable_covariates Logical. Default TRUE.
+    #' @description Extract fitted individual parameters
+    #' @return A data.table
+    #' @param n_draws Integer. Default 2000.
+    #' @param include_variation_params Logical
+    #' @param human_readable_covariates Logical. Default TRUE.
     extract_individual_parameters = function(n_draws = 2000,
                                              include_variation_params = TRUE,
                                              human_readable_covariates = TRUE) {
@@ -430,17 +439,17 @@ biokinetics <- R6::R6Class(
       }
       dt_out
     },
-        #' @description Process the model results into a data table of titre values over time.
-        #' @return A data.table containing titre values at time points. If summarise = TRUE, columns are time_since_last_exp,
-        #' me, lo, hi, titre_type, and a column for each covariate in the hierarchical model. If summarise = FALSE, columns are
-        #' time_since_last_exp, .draw, t0_pop, tp_pop, ts_pop, m1_pop, m2_pop, m3_pop, beta_t0, beta_tp, beta_ts, beta_m1, beta_m2, beta_m3, mu
-        #' titre_type and a column for each covariate in the hierarchical model. See the data vignette for details:
-        #' \code{vignette("data", package = "epikinetics")}
-        #' @param t_max Integer. Maximum number of time points to include.
-        #' @param summarise Boolean. Default TRUE. If TRUE, summarises over draws from posterior parameter distributions to
-        #' return 0.025, 0.5 and 0.975 quantiles, labelled lo, me and hi, respectively. If FALSE returns values for individual
-        #' draws from posterior parameter distributions.
-        #' @param n_draws Integer. Maximum number of samples to include. Default 2000.
+    #' @description Process the model results into a data table of titre values over time.
+    #' @return A data.table containing titre values at time points. If summarise = TRUE, columns are time_since_last_exp,
+    #' me, lo, hi, titre_type, and a column for each covariate in the hierarchical model. If summarise = FALSE, columns are
+    #' time_since_last_exp, .draw, t0_pop, tp_pop, ts_pop, m1_pop, m2_pop, m3_pop, beta_t0, beta_tp, beta_ts, beta_m1, beta_m2, beta_m3, mu
+    #' titre_type and a column for each covariate in the hierarchical model. See the data vignette for details:
+    #' \code{vignette("data", package = "epikinetics")}
+    #' @param t_max Integer. Maximum number of time points to include.
+    #' @param summarise Boolean. Default TRUE. If TRUE, summarises over draws from posterior parameter distributions to
+    #' return 0.025, 0.5 and 0.975 quantiles, labelled lo, me and hi, respectively. If FALSE returns values for individual
+    #' draws from posterior parameter distributions.
+    #' @param n_draws Integer. Maximum number of samples to include. Default 2000.
     simulate_population_trajectories = function(
       t_max = 150,
       summarise = TRUE,
@@ -485,11 +494,11 @@ biokinetics <- R6::R6Class(
 
       dt_out
     },
-        #' @description Process the stan model results into a data.table.
-        #' @return A data.table of peak and set titre values. Columns are tire_type, mu_p, mu_s, rel_drop_me, mu_p_me,
-        #' mu_s_me, and a column for each covariate. See the data vignette for details:
-        #' \code{vignette("data", package = "epikinetics")}
-        #' @param n_draws Integer. Maximum number of samples to include. Default 2000.
+    #' @description Process the stan model results into a data.table.
+    #' @return A data.table of peak and set titre values. Columns are tire_type, mu_p, mu_s, rel_drop_me, mu_p_me,
+    #' mu_s_me, and a column for each covariate. See the data vignette for details:
+    #' \code{vignette("data", package = "epikinetics")}
+    #' @param n_draws Integer. Maximum number of samples to include. Default 2000.
     population_stationary_points = function(n_draws = 2000) {
       private$check_fitted()
       validate_numeric(n_draws)
@@ -535,17 +544,17 @@ biokinetics <- R6::R6Class(
         mu_s_me = quantile(mu_s, 0.5)),
           by = c(private$all_formula_vars, "titre_type")]
     },
-        #' @description Simulate individual trajectories from the model. This is
-        #' computationally expensive and may take a while to run if n_draws is large.
-        #' @return A data.table. If summarise = TRUE columns are calendar_date, titre_type, me, lo, hi, time_shift.
-        #' If summarise = FALSE, columns are pid, draw, time_since_last_exp, mu, titre_type, exposure_day, calendar_day, time_shift
-        #' and a column for each covariate in the regression model. See the data vignette for details:
-        #' \code{vignette("data", package = "epikinetics")}.
-        #' @param summarise Boolean. If TRUE, average the individual trajectories to get lo, me and
-        #' hi values for the population, disaggregated by titre type. If FALSE return the indidivudal trajectories.
-        #' Default TRUE.
-        #' @param n_draws Integer. Maximum number of samples to draw. Default 2000.
-        #' @param time_shift Integer. Number of days to adjust the exposure day by. Default 0.
+    #' @description Simulate individual trajectories from the model. This is
+    #' computationally expensive and may take a while to run if n_draws is large.
+    #' @return A data.table. If summarise = TRUE columns are calendar_date, titre_type, me, lo, hi, time_shift.
+    #' If summarise = FALSE, columns are pid, draw, time_since_last_exp, mu, titre_type, exposure_day, calendar_day, time_shift
+    #' and a column for each covariate in the regression model. See the data vignette for details:
+    #' \code{vignette("data", package = "epikinetics")}.
+    #' @param summarise Boolean. If TRUE, average the individual trajectories to get lo, me and
+    #' hi values for the population, disaggregated by titre type. If FALSE return the indidivudal trajectories.
+    #' Default TRUE.
+    #' @param n_draws Integer. Maximum number of samples to draw. Default 2000.
+    #' @param time_shift Integer. Number of days to adjust the exposure day by. Default 0.
     simulate_individual_trajectories = function(
       summarise = TRUE,
       n_draws = 2000,

diff --git a/R/plot.R b/R/plot.R
@@ -21,7 +21,7 @@ plot.biokinetics_priors <- function(x,
 
   # Declare variables to suppress notes when compiling package
   # https://github.com/Rdatatable/data.table/issues/850#issuecomment-259466153
-  t0 <- tp <- ts <- m1 <- m2 <- m3 <- censored <- NULL
+  t0 <- tp <- ts <- m1 <- m2 <- m3 <- NULL
   time_since_last_exp <- me <- lo <- hi <- value <- mu <- NULL
 
   if (!is.null(data)) {
@@ -81,7 +81,7 @@ plot_sero_data <- function(data,
   data <- data[time_since_last_exp <= tmax,]
   # Declare variables to suppress notes when compiling package
   # https://github.com/Rdatatable/data.table/issues/850#issuecomment-259466153
-  time_since_last_exp <- value <- titre_type <- censored <- NULL
+  time_since_last_exp <- value <- titre_type <- NULL
 
   plot <- ggplot(data) +
     geom_point(aes(x = time_since_last_exp, y = value, colour = titre_type),

diff --git a/man/biokinetics.Rd b/man/biokinetics.Rd
diff --git a/tests/testthat/test-plots.R b/tests/testthat/test-plots.R
@@ -41,13 +41,16 @@ test_that("Prior predictions from model are the same", {
 
 test_that("Can plot input data", {
   data <- data.table::fread(system.file("delta_full.rds", package = "epikinetics"))
-  mod <- biokinetics$new(data = data, lower_detection_limit = 40)
+  mod <- biokinetics$new(data = data,
+                         lower_detection_limit = 40,
+                         truncate_lower = FALSE)
   suppressWarnings({plot <- mod$plot_model_inputs()})
   vdiffr::expect_doppelganger("inputdata", plot)
 
   mod <- biokinetics$new(data = data,
                          lower_detection_limit = 40,
-                         covariate_formula = ~0 + infection_history)
+                         covariate_formula = ~0 + infection_history,
+                         truncate_lower = FALSE)
   suppressWarnings({plot <- mod$plot_model_inputs()})
   vdiffr::expect_doppelganger("inputdata_covariates", plot)
 })