From 6f0eb316a055d6d62402142d305364b2636120f8 Mon Sep 17 00:00:00 2001
From: Chirag Kumar <kzs9@cdc.gov>
Date: Sun, 15 Dec 2024 22:42:08 +0000
Subject: [PATCH] documentation and tests

---
 .gitignore                        |   2 +
 docs/natural-history-inputs.md    |  28 +++
 docs/viral-load-infectiousness.md |  23 ---
 src/natural_history_manager.rs    | 280 +++++++++++++++++++++++++++---
 src/transmission_manager.rs       |  14 +-
 tests/data/empty.csv              |   1 +
 tests/data/three_columns.csv      |   2 +
 7 files changed, 297 insertions(+), 53 deletions(-)
 create mode 100644 docs/natural-history-inputs.md
 delete mode 100644 docs/viral-load-infectiousness.md
 create mode 100644 tests/data/empty.csv
 create mode 100644 tests/data/three_columns.csv

diff --git a/.gitignore b/.gitignore
index c4c7a68..8792043 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,6 +40,8 @@
 !input/people_test.csv
 !input/gi_trajectories.csv
 !tests/data/gi_trajectory.csv
+!tests/data/three_columns.csv
+!tests/data/empty.csv
 
 #####
 # Python
diff --git a/docs/natural-history-inputs.md b/docs/natural-history-inputs.md
new file mode 100644
index 0000000..f7b0002
--- /dev/null
+++ b/docs/natural-history-inputs.md
@@ -0,0 +1,28 @@
+# Infectiousness over time
+
+## Overview
+
+We provide a way for reading in a user-specified infectiousness over time distribution (generation interval)
+and appropriately scheduling infection attempts based on the distribution. The user provides an input file
+that contains samples from the cumulative distribution function (CDF) of the generation interval (GI) over
+time at a specified $\Delta t$, describing the fraction of an individual's infectiousness that has passed
+by a given time. The input data are assumed to have a format where the columns represent the times since
+the infection attempt (so starting at $t = 0$) and the entries in each row describe the value of the GI
+CDF. Each row represents a potential trajectory of the GI CDF.
+
+People are assigned a trajectory number (row number) when they are infected. This allows for each person
+to have a different GI CDF if each of the trajectories are different. However, that trajectory number will
+be used for also drawing the person's other natural history characteristics, such as their symptom onset
+and improvement times or viral load trajectory. This allows easily encoding correlation between natural
+history parameters (the user provides input CSVs where the first row in each CSV is from a joint sample
+of GI, symptom onset, symptom improvement, etc.) or allowing each of the parameters to be independent.
+
+## Assumptions
+1. There are no requirements on the number of trajectories fed to the model. Trajectory numbers are assigned
+to people uniformly and randomly. However, this means that an individual who is reinfected could have the exact
+same infectiousness trajectory as their last infection.
+2. There must be the same number of parameter sets for each parameter provided as an input CSV. For now, we are focusing
+only on GI, but we will soon expand our work to also include symptom onset and symptom improvement times.
+3. We have not yet crossed the barrier of how to separately treat individuals who are asymptomatic only. Are their
+GIs drawn from a separate CSV? Should their $R_i$ just be multiplied by a scalar? Part of the reason we are deferring
+this decision is because our previous isolation guidance work focused only on symptomatic individuals.
diff --git a/docs/viral-load-infectiousness.md b/docs/viral-load-infectiousness.md
deleted file mode 100644
index e31503d..0000000
--- a/docs/viral-load-infectiousness.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# Infectiousness over time
-
-## Overview
-
-We assume that infectiousness over time varies with the viral load, similar to how we modeled infectiouness over time in our isolation guidance modeling response.
-For now, we assume that infectiousness is proportional the log of the viral load (i.e., our base assumption in our isolation guidance work),
-but we anticipate expanding to allow for other functional relationships, potentially even calibrating the functional form in our model.
-
-For each person, we draw a set of viral load parameters and associated symptom improvement time from our isolation guidance posteriors.
-We append to that a valid time to symptom onset modeled as described in Park et al., (2023) PNAS because our viral load parameters are all relative to the time of symptom onset.
-For our ABM, we need to know infectiousness over time from when an agent gets infected, so we need an agent's incubation period to appropriately shift the triangle VL curve to be relative to when an agent was infected.
-
-## Assumptions
-1. We assume that there is no infectiousness below a log VL of 0.
-2. We draw random samples from the set of triangle VL parameters for each agent, not requiring that there be as many parameter sets as there are agents.
-However, for now, the parameter sets are person-specific.
-We do this to enable being able to get the same parameter set for a person in multiple modules (we need symptom onset and improvement times in the health status module),
-but this also means that for now an agent who is reinfected will have the exact same generation interval as in their previous infection unless we account for the number of previous infections when sampling parameter sets.
-3. We do not allow for agents to be infectious before they are infected, and we do not truncate the triangle VL curve for any agents.
-Because transmission starts `peak_time - proliferation_time`  days before symptom onset, this puts a constraint on the individual's incubation period.
-We draw symptom onset times from the incubation period specified in Park et al., (2023) PNAS, and we constrain the drawn times to enforce that symptom onset happens after enough of the proliferation period has elapsed to ensure the agent experiences their full infectious course after getting infected.
-4. We have not yet crossed the barrier of how to model transmission from asymptomatic individuals, considering that our VL model was fit to only symptomatic individuals.
-One option is to still use the same model of infection but change the mean number of infection attempts such individuals have.
diff --git a/src/natural_history_manager.rs b/src/natural_history_manager.rs
index 0aad013..9750fdc 100644
--- a/src/natural_history_manager.rs
+++ b/src/natural_history_manager.rs
@@ -1,3 +1,5 @@
+use std::{fmt::Display, path::PathBuf, str::FromStr};
+
 use ixa::{
     define_data_plugin, define_person_property, define_person_property_with_default, define_rng,
     Context, ContextGlobalPropertiesExt, ContextPeopleExt, ContextRandomExt, IxaError, PersonId,
@@ -18,40 +20,97 @@ define_data_plugin!(
     NaturalHistoryParameters::default()
 );
 
-/// Read in input natural history parameter inputs and assemble valid parameter sets.
+/// Read in input natural history parameters from CSVs, validate them as valid inputs,
+/// and store them for later querying through the `ContextNaturalHistoryExt` trait.
 pub fn init(context: &mut Context) -> Result<(), IxaError> {
     // Read in the generation interval trajectories from a CSV file.
     let path = &context
         .get_global_property_value(Parameters)
         .unwrap()
         .gi_trajectories_file;
-    let mut vec_trajectories = Vec::new();
+    let gi_trajectories = read_arbitrary_column_csv::<f64>(path)?;
+    // Check that the trajectories are valid inverse CDFs.
+    check_valid_cdf(&gi_trajectories, "GI")?;
+
+    let natural_history_container = context.get_data_container_mut(NaturalHistory);
+    natural_history_container.gi_trajectories = gi_trajectories;
+    Ok(())
+}
+
+/// Read in a CSV file with an arbitrary number of columns that presumably represent a series.
+/// File should have a header, but the header is ignored. Returns a vector of the series (vectors)
+/// of type `T`.
+fn read_arbitrary_column_csv<T: FromStr>(path: &PathBuf) -> Result<Vec<Vec<T>>, IxaError>
+where
+    <T as FromStr>::Err: Display,
+{
+    let mut trajectories = Vec::new();
     let mut reader = csv::Reader::from_path(path)?;
 
     for result in reader.records() {
         let record = result?;
-        let gi = record
+        let trajectory = record
             .iter()
-            .map(|x| x.parse::<f64>().unwrap())
-            .collect::<Vec<f64>>();
-        vec_trajectories.push(gi);
+            .map(|x| {
+                x.parse::<T>()
+                    .map_err(|e| IxaError::IxaError(e.to_string()))
+            })
+            .collect::<Result<Vec<T>, _>>()?;
+        trajectories.push(trajectory);
     }
+    // The way that we've configured the CSV reader means that it will raise errors for an empty row, given that
+    // the preceding rows had data (it also raises an error for a row with a different number of columns).
+    // But, if all the rows are empty, it won't raise an error automatically.
+    if trajectories.is_empty() {
+        return Err(IxaError::IxaError(format!(
+            "No data found in file: {}",
+            path.display()
+        )));
+    }
+    Ok(trajectories)
+}
 
-    let natural_history_container = context.get_data_container_mut(NaturalHistory);
-    natural_history_container.gi_trajectories = vec_trajectories;
-    Ok(())
+/// A series of checks that ensure that trajectory in a vector of trajectories are a valid CSV.
+fn check_valid_cdf(trajectories: &[Vec<f64>], debug_parameter_name: &str) -> Result<(), IxaError> {
+    trajectories
+        .iter()
+        .try_for_each(|x: &Vec<f64>| -> Result<(), IxaError> {
+            if x.iter().any(|&y| !(0.0..=1.0).contains(&y)) {
+                return Err(IxaError::IxaError(format!(
+                    "{debug_parameter_name} CDF trajectories are not in the range [0, 1]."
+                )));
+            }
+            if x.windows(2).any(|w| w[0] > w[1]) {
+                return Err(IxaError::IxaError(format!(
+                    "{debug_parameter_name} CDF trajectories are not strictly increasing."
+                )));
+            }
+            // If we've made it this far, we know that if the first value is 1.0, all the rest are 1.0 too, and that's bad.
+            #[allow(clippy::float_cmp)]
+            if x[0] == 1.0 {
+                return Err(IxaError::IxaError(format!(
+                    "{debug_parameter_name} CDF trajectories cannot start at 1.0."
+                )));
+            }
+            Ok(())
+        })
 }
 
 define_person_property_with_default!(NaturalHistoryIdx, Option<usize>, None);
 
+/// Provides a way to interact with natural history parameters. This includes setting a natural history
+/// index for a person at the beginning of their infection and querying their natural history parameters
+/// (ex., generation interval) throughout their infection.
 pub trait ContextNaturalHistoryExt {
-    /// Set a person property that is a random index that refers to a natural history parameter set
-    /// (generation interval, symptom onset time, symptom improvement time, viral load, etc.).
+    /// Set the person property `NaturalHistoryIdx` that refers to the index of a natural history parameter set
+    /// (generation interval, symptom onset time, symptom improvement time, viral load, etc.) that will be used
+    /// throughout this person's infection. Indeces are chosen uniformly and randomly.
     fn set_natural_history_idx(&mut self, person_id: PersonId);
-    /// Estimate the inverse generation interval (i.e., time since infection at which an infection attempt happens)
-    /// from a CDF value (i.e., a value on 0 to 1 that represents the fraction of an individual's infectiousness
-    /// that has passed) for a given person based on their generation interval trajectory. Uses linear interpolation
-    /// to estimate the time from the discrete CDF samples.
+
+    /// Estimate the value of the inverse generation interval (i.e., time since infection at which an infection
+    /// attempt happens) from a CDF value (i.e., a value on 0 to 1 that represents the fraction of an individual's
+    /// infectiousness that has passed) for a given person based on their set generation interval trajectory. Uses
+    /// linear interpolation to estimate the time from the discrete CDF samples.
     fn evaluate_inverse_generation_interval(
         &self,
         person_id: PersonId,
@@ -76,13 +135,15 @@ impl ContextNaturalHistoryExt for Context {
         gi_cdf_value_unif: f64,
     ) -> f64 {
         // Let's first deal with the corner case -- the person is experiencing their first infection attempt.
-        // Linear interpolation will fail because it would try to query a value of the CDF smaller than 0.0.
-        // Instead, we return 0.0. This is the sensible value because it means that the person has
-        // experienced none of their infectiousness at the start of their infection.
+        // In this case, gi_cdf_value_unif will be 0.0. There are no points below 0.0 in a CDF, so interpolation
+        // numerically will fairl. Instead, we return 0.0. This is the obvious value because it means that the person
+        // has experienced none of their infectiousness at the start of their infection. It also ensures that if
+        // GI CDF is 0.0 for some time after the start of infection, inverse_gi(\epsilon) - inverse_gi(0) = c > 0
+        // even as \epsilon -> 0, which properly reproduces a CDF where an individual is not infectious immediately.
         if gi_cdf_value_unif == 0.0 {
             return 0.0;
         }
-        // Grab the GI trajectory for this person.
+        // Grab the set GI trajectory for this person.
         let gi_index = self
             .get_person_property(person_id, NaturalHistoryIdx)
             .expect("No GI index set. Has this person been infected?");
@@ -90,6 +151,7 @@ impl ContextNaturalHistoryExt for Context {
             .get_data_container(NaturalHistory)
             .expect("Natural history data container not initialized.");
         let gi_trajectory = &natural_history_container.gi_trajectories[gi_index];
+        // Set up what we need for interpolation.
         let dt = self
             .get_global_property_value(Parameters)
             .unwrap()
@@ -117,21 +179,25 @@ impl ContextNaturalHistoryExt for Context {
 mod test {
     use std::path::PathBuf;
 
-    use ixa::{Context, ContextGlobalPropertiesExt, ContextPeopleExt, ContextRandomExt};
+    use ixa::{Context, ContextGlobalPropertiesExt, ContextPeopleExt, ContextRandomExt, IxaError};
+    use statrs::distribution::{ContinuousCDF, Exp};
 
     use crate::{
-        natural_history_manager::init,
+        natural_history_manager::{check_valid_cdf, init},
         parameters::{Parameters, ParametersValues},
+        utils::ContextUtilsExt,
     };
 
-    use super::{ContextNaturalHistoryExt, NaturalHistoryIdx};
+    use super::{
+        read_arbitrary_column_csv, ContextNaturalHistoryExt, NaturalHistory, NaturalHistoryIdx,
+    };
 
     fn setup() -> Context {
         let params = ParametersValues {
             max_time: 10.0,
             seed: 42,
             r_0: 2.5,
-            gi_trajectories_dt: 0.1,
+            gi_trajectories_dt: 0.02,
             report_period: 1.0,
             synth_population_file: PathBuf::from("."),
             gi_trajectories_file: PathBuf::from("./tests/data/gi_trajectory.csv"),
@@ -144,6 +210,115 @@ mod test {
         context
     }
 
+    #[test]
+    fn test_empty_csv() {
+        let e = read_arbitrary_column_csv::<f64>(&PathBuf::from("./tests/data/empty.csv")).err();
+        match e {
+            Some(IxaError::IxaError(msg)) => {
+                assert_eq!(
+                    msg,
+                    "No data found in file: ./tests/data/empty.csv".to_string()
+                );
+            }
+            Some(ue) => panic!(
+                "Expected an error that file should be empty. Instead got {:?}",
+                ue.to_string()
+            ),
+            None => panic!("Expected an error. Instead, read empty file with no errors."),
+        }
+    }
+
+    #[test]
+    fn test_automatic_column_number_detection() {
+        let v = read_arbitrary_column_csv::<f64>(&PathBuf::from("./tests/data/three_columns.csv"))
+            .unwrap();
+        assert_eq!(v.len(), 1);
+        assert_eq!(v[0].len(), 3);
+    }
+
+    #[test]
+    fn test_input_out_of_cdf_range() {
+        let bad_cdf = vec![vec![0.0, 0.5, 1.1]];
+        let e = check_valid_cdf(&bad_cdf, "test").err();
+        match e {
+            Some(IxaError::IxaError(msg)) => {
+                assert_eq!(msg, "test CDF trajectories are not in the range [0, 1].".to_string());
+            }
+            Some(ue) => panic!(
+                "Expected an error that CDF values are not in the range [0, 1]. Instead got {:?}",
+                ue.to_string()
+            ),
+            None => panic!("Expected an error that CDF values are out of range. Instead, CDF validation passed with no error."),
+        }
+    }
+
+    #[test]
+    fn test_cdf_input_decreases() {
+        let bad_cdf = vec![vec![0.0, 0.5, 0.4]];
+        let e = check_valid_cdf(&bad_cdf, "test").err();
+        match e {
+            Some(IxaError::IxaError(msg)) => {
+                assert_eq!(msg, "test CDF trajectories are not strictly increasing.".to_string());
+            }
+            Some(ue) => panic!(
+                "Expected an error that CDF values are not strictly increasing. Instead got {:?}",
+                ue.to_string()
+            ),
+            None => panic!("Expected an error that CDF values are not strictly increasing. Instead, CDF validation passed with no error."),
+        }
+    }
+
+    #[test]
+    fn test_cdf_input_flat() {
+        let allowable_cdf = vec![vec![0.0, 0.0, 0.0]];
+        check_valid_cdf(&allowable_cdf, "test").unwrap();
+    }
+
+    #[test]
+    fn test_cdf_input_all_ones() {
+        let bad_cdf = vec![vec![1.0, 1.0, 1.0]];
+        let e = check_valid_cdf(&bad_cdf, "test").err();
+        match e {
+            Some(IxaError::IxaError(msg)) => {
+                assert_eq!(msg, "test CDF trajectories cannot start at 1.0.".to_string());
+            }
+            Some(ue) => panic!(
+                "Expected an error that CDF values cannot start at 1.0. Instead got {:?}",
+                ue.to_string()
+            ),
+            None => panic!("Expected an error that CDF values cannot start at 1.0. Instead, CDF validation passed with no error."),
+        }
+    }
+
+    #[test]
+    #[allow(clippy::cast_precision_loss)]
+    fn test_natural_history_init() {
+        // Check that the trajectory at this index is the toy GI we fed in -- CDF of exponential distribution
+        // with rate 1.
+        let mut context = setup();
+        init(&mut context).unwrap();
+        let gi_trajectory = &context
+            .get_data_container(NaturalHistory)
+            .unwrap()
+            .gi_trajectories;
+        assert_eq!(gi_trajectory.len(), 1);
+        let cdf = |x| Exp::new(1.0).unwrap().cdf(x);
+        let dt = context
+            .get_global_property_value(Parameters)
+            .unwrap()
+            .gi_trajectories_dt;
+        let expected_trajectory = (0..gi_trajectory[0].len())
+            .map(|x| cdf(x as f64 * dt))
+            .collect::<Vec<f64>>();
+        let diff = gi_trajectory[0]
+            .iter()
+            .zip(expected_trajectory.iter())
+            .map(|(x, y)| (x - y).abs())
+            .collect::<Vec<f64>>();
+        // Small differences due to R vs Rust's algorithm for calculating the value of the CDF at the tails.
+        assert!(diff.iter().all(|&x| x < f64::EPSILON));
+    }
+
     #[test]
     fn test_set_natural_history_idx() {
         let mut context = setup();
@@ -153,17 +328,72 @@ mod test {
         let gi_index = context.get_person_property(person_id, NaturalHistoryIdx);
         match gi_index {
             Some(0) => (),
-            Some(idx) => panic!("Wrong GI index set. Should be 1, but is {idx}."),
+            Some(idx) => panic!("Wrong GI index set. Should be 0, but is {idx}."),
             None => panic!("Should not panic. NH index should be set."),
         }
     }
 
     #[test]
+    #[allow(clippy::float_cmp)]
     fn test_evaluate_inverse_generation_interval() {
         let mut context = setup();
         init(&mut context).unwrap();
+        let dt = context
+            .get_global_property_value(Parameters)
+            .unwrap()
+            .gi_trajectories_dt;
         let person_id = context.add_person(()).unwrap();
         context.set_natural_history_idx(person_id);
-        context.evaluate_inverse_generation_interval(person_id, 0.5);
+        // Check that a CDF value of 0.0 returns a time of 0.0.
+        assert_eq!(
+            context.evaluate_inverse_generation_interval(person_id, 0.0),
+            0.0
+        );
+        // Check some values of the inverse CDF.
+        let cdf = |x| Exp::new(1.0).unwrap().cdf(x);
+        // No interpolation required because we pick an integer increment of dt.
+        assert!(
+            (context.evaluate_inverse_generation_interval(person_id, cdf(10.0 * dt)) - 10.0 * dt)
+                .abs()
+                < f64::EPSILON
+        );
+        // Interpolation required.
+        // Test values for small, middle, and large dt.
+        assert!(
+            (context.evaluate_inverse_generation_interval(person_id, cdf(9.8 * dt))
+                - context.linear_interpolation(
+                    cdf(9.0 * dt),
+                    cdf(10.0 * dt),
+                    9.0 * dt,
+                    10.0 * dt,
+                    cdf(9.8 * dt)
+                ))
+            .abs()
+                < f64::EPSILON
+        );
+        assert!(
+            (context.evaluate_inverse_generation_interval(person_id, cdf(159.5 * dt))
+                - context.linear_interpolation(
+                    cdf(159.0 * dt),
+                    cdf(160.0 * dt),
+                    159.0 * dt,
+                    160.0 * dt,
+                    cdf(159.5 * dt)
+                ))
+            .abs()
+                < f64::EPSILON
+        );
+        assert!(
+            (context.evaluate_inverse_generation_interval(person_id, cdf(389.2 * dt))
+                - context.linear_interpolation(
+                    cdf(389.0 * dt),
+                    cdf(390.0 * dt),
+                    389.0 * dt,
+                    390.0 * dt,
+                    cdf(389.2 * dt)
+                ))
+            .abs()
+                < f64::EPSILON
+        );
     }
 }
diff --git a/src/transmission_manager.rs b/src/transmission_manager.rs
index 8c03fc3..21ead74 100644
--- a/src/transmission_manager.rs
+++ b/src/transmission_manager.rs
@@ -8,8 +8,10 @@ use ixa::{
 };
 use statrs::distribution::Poisson;
 
-use crate::{contact::ContextContactExt, population_loader::Alive};
-use crate::{natural_history_manager::ContextNaturalHistoryExt, parameters::Parameters};
+use crate::{
+    contact::ContextContactExt, natural_history_manager::ContextNaturalHistoryExt,
+    parameters::Parameters, population_loader::Alive,
+};
 
 // Define the possible infectious statuses for a person.
 // These states refer to the person's infectiousness at a given time
@@ -113,9 +115,9 @@ fn schedule_next_infection_attempt(
     // Get the next infection attempt time.
     let (next_infection_time_unif, time_until_next_infection_attempt_gi) = get_next_infection_time(
         context,
+        transmitter_id,
         num_infection_attempts_remaining,
         last_infection_time_unif,
-        transmitter_id,
     );
 
     // Schedule the infection attempt. The function `infection_attempt` (a) grabs a contact at
@@ -144,9 +146,9 @@ fn schedule_next_infection_attempt(
 /// and passes it through the inverse CDF of the generation interval to get the next infection time.
 fn get_next_infection_time(
     context: &mut Context,
+    transmitter_id: PersonId,
     num_infection_attempts_remaining: usize,
     last_infection_time_unif: f64,
-    transmitter_id: PersonId,
 ) -> (f64, f64) {
     // Draw the next uniform infection time using order statistics.
     // The first of n draws from U(0, 1) comes from Beta(1, n), so we pass a uniform
@@ -161,7 +163,6 @@ fn get_next_infection_time(
     // so that the next infection time is always greater than the last infection time.
     next_infection_time_unif =
         last_infection_time_unif + next_infection_time_unif * (1.0 - last_infection_time_unif);
-    assert!(next_infection_time_unif > last_infection_time_unif);
 
     let delta_time = context
         .evaluate_inverse_generation_interval(transmitter_id, next_infection_time_unif)
@@ -380,6 +381,9 @@ mod test {
             let mut context = setup(1.0);
             context.init_random(seed.into());
             let transmitter_id = context.add_person(()).unwrap();
+            // Since we manually trigger the `schedule_next_infection_attempt` chain,
+            // we need to assign a natural history index to the transmitter -- this would have been
+            // done in the `handle_infectious_status_change` function, but we do not call it explicitly here.
             context.set_natural_history_idx(transmitter_id);
             // Create a person who will be the only contact, but have them be dead so they can't be infected.
             // Instead, `get_contact` will return None.
diff --git a/tests/data/empty.csv b/tests/data/empty.csv
new file mode 100644
index 0000000..9b2cddf
--- /dev/null
+++ b/tests/data/empty.csv
@@ -0,0 +1 @@
+0,0.02,0.04
diff --git a/tests/data/three_columns.csv b/tests/data/three_columns.csv
new file mode 100644
index 0000000..79f1fa0
--- /dev/null
+++ b/tests/data/three_columns.csv
@@ -0,0 +1,2 @@
+0,0.02,0.04
+0,0.0198013266932447,0.03921056084767679