Issue 537: Fix making analysis dataframes and record priors (#535)

* Update changelog.md * Update make_model_priors.jl * Make `define_epiprob` more modular _make_epidata can also be reused elsewhere * restructure analysis dataframe func * fix make_prediction_dataframe_from_output And add simple unit test with committed test data * Revert "fix make_prediction_dataframe_from_output" This reverts commit 7e6238b. * Reapply "fix make_prediction_dataframe_from_output" This reverts commit 14d29b4.
CDCgov · Dec 9, 2024 · 8c05d03 · 8c05d03
1 parent 032457f
commit 8c05d03
Show file tree

Hide file tree

Showing 9 changed files with 141 additions and 52 deletions.
diff --git a/.gitignore b/.gitignore
@@ -393,3 +393,6 @@ EpiAware/docs/src/getting-started/tutorials/censored-obs.md
 EpiAware/docs/Manifest*.toml
 
 !benchmark/Manifest.toml
+
+# Test data
+!pipeline/test/analysis/test_data.jld2
diff --git a/manuscript/changelog.md b/manuscript/changelog.md
@@ -2,3 +2,37 @@
 
 ## 16th Aug 2024
 Changed the proposed latent processes to daily rather than weekly incremental changes.
+
+## 5th Dec 2024
+Changed priors to
+
+```julia
+"""
+Constructs and returns a dictionary of prior distributions for the latent model
+parameters. This is the default method.
+
+# Arguments
+- `pipeline`: An instance of the `AbstractEpiAwarePipeline` type.
+
+# Returns
+A dictionary containing the following prior distributions:
+- `"transformed_process_init_prior"`: A normal distribution with mean 0.0 and
+standard deviation 0.25.
+- `"std_prior"`: A half-normal distribution with standard deviation 0.25.
+- `"damp_param_prior"`: A beta distribution with shape parameters 0.5 and 0.5.
+
+"""
+function make_model_priors(pipeline::AbstractEpiAwarePipeline)
+    transformed_process_init_prior = Normal(0.0, 0.25)
+    std_prior = HalfNormal(0.025)
+    damp_param_prior = Beta(1, 9)
+    log_I0_prior = Normal(log(100.0), 1e-1)
+
+    return Dict(
+        "transformed_process_init_prior" => transformed_process_init_prior,
+        "std_prior" => std_prior,
+        "damp_param_prior" => damp_param_prior,
+        "log_I0_prior" => log_I0_prior
+    )
+end
+```
diff --git a/pipeline/src/analysis/make_prediction_dataframe_from_output.jl b/pipeline/src/analysis/make_prediction_dataframe_from_output.jl
@@ -3,7 +3,9 @@ Create a dataframe containing prediction results based on the given output and i
 
 # Arguments
 - `filename`: The name of the file.
-- `output`: The output data containing inference configuration, IGP model, and other information.
+- `output`: The output data containing inference configuration, IGP model, and other
+using Base: infer_effects
+    information.
 - `epi_datas`: The input data for the epidemiological model.
 - `qs`: An optional array of quantiles to calculate. Default is `[0.025, 0.5, 0.975]`.
 
@@ -12,53 +14,57 @@ A dataframe containing the prediction results.
 
 """
 function make_prediction_dataframe_from_output(
-        filename, output, epi_datas, pipelines; qs = [0.025, 0.5, 0.975])
-    #Get the scenario, IGP model, latent model and true mean GI
+        output, true_mean_gi; qs = [0.025, 0.25, 0.5, 0.75, 0.975],
+        transformation = oneexpy)
+    #Unpack the output
     inference_config = output["inference_config"]
-    igp_model = output["inference_config"].igp |> string
-    scenario = EpiAwarePipeline._get_scenario_from_filename(filename, pipelines)
-    latent_model = EpiAwarePipeline._get_latent_model_from_filename(filename)
-    true_mean_gi = EpiAwarePipeline._get_true_gi_mean_from_filename(filename)
+    forecasts = output["forecast_results"]
+    #Get the scenario, IGP model, latent model and true mean GI
+    igp_model = inference_config["igp"] |> igp_name -> split(igp_name, ".")[end]
+    scenario = inference_config["scenario"]
+    latent_model = inference_config["latent_model"]
+    used_gi_mean = inference_config["gi_mean"]
+    used_gi_std = inference_config["gi_std"]
+    (start_time, reference_time) = inference_config["tspan"] |>
+                                   tspan -> split(tspan, "_") |>
+                                            tspan -> (
+        parse(Int, tspan[1]), parse(Int, tspan[2]))
 
     #Get the quantiles for the targets across the gi mean scenarios
     #if Renewal model, then we use the underlying epi model
     #otherwise we use the epi datas to loop over different gi mean implications
-    used_epi_datas = igp_model == "Renewal" ? [output["epiprob"].epi_model.data] : epi_datas
+    used_gi_means = igp_model == "Renewal" ?
+                    [used_gi_mean] :
+                    make_gi_params(EpiAwareExamplePipeline())["gi_means"]
 
-    preds = nothing
-    try
-        preds = map(used_epi_datas) do epi_data
-            generate_quantiles_for_targets(output, epi_data, qs)
-        end
-        used_gi_means = igp_model == "Renewal" ?
-                        [EpiAwarePipeline._get_used_gi_mean_from_filename(filename)] :
-                        make_gi_params(EpiAwareExamplePipeline())["gi_means"]
+    used_epidatas = map(used_gi_means) do ḡ
+        _make_epidata(ḡ, used_gi_std; transformation = transformation)
+    end
+
+    preds = map(used_epidatas) do epi_data
+        generate_quantiles_for_targets(forecasts, epi_data, qs)
+    end
 
-        #Create the dataframe columnwise
-        df = mapreduce(vcat, preds, used_gi_means) do pred, used_gi_mean
-            mapreduce(vcat, keys(pred)) do target
-                target_mat = pred[target]
-                target_times = collect(1:size(target_mat, 1)) .+
-                               (inference_config.tspan[1] - 1)
-                _df = DataFrame(target_times = target_times)
-                _df[!, "Scenario"] .= scenario
-                _df[!, "IGP_Model"] .= igp_model
-                _df[!, "Latent_Model"] .= latent_model
-                _df[!, "True_GI_Mean"] .= true_mean_gi
-                _df[!, "Used_GI_Mean"] .= used_gi_mean
-                _df[!, "Reference_Time"] .= inference_config.tspan[2]
-                _df[!, "Target"] .= string(target)
-                # quantile predictions
-                for (j, q) in enumerate(qs)
-                    q_str = split(string(q), ".")[end]
-                    _df[!, "q_$(q_str)"] = target_mat[:, j]
-                end
-                return _df
+    #Create the dataframe columnwise
+    df = mapreduce(vcat, preds, used_gi_means) do pred, used_gi_mean
+        mapreduce(vcat, keys(pred)) do target
+            target_mat = pred[target]
+            target_times = collect(1:size(target_mat, 1)) .+ (start_time - 1)
+            _df = DataFrame(target_times = target_times)
+            _df[!, "Scenario"] .= scenario
+            _df[!, "IGP_Model"] .= igp_model
+            _df[!, "Latent_Model"] .= latent_model
+            _df[!, "True_GI_Mean"] .= true_mean_gi
+            _df[!, "Used_GI_Mean"] .= used_gi_mean
+            _df[!, "Reference_Time"] .= reference_time
+            _df[!, "Target"] .= string(target)
+            # quantile predictions
+            for (j, q) in enumerate(qs)
+                q_str = split(string(q), ".")[end]
+                _df[!, "q_$(q_str)"] = target_mat[:, j]
             end
+            return _df
         end
-        return df
-    catch
-        @warn "Error in generating quantiles for targets in file $filename"
-        return nothing
     end
+    return df
 end
diff --git a/pipeline/src/constructors/make_model_priors.jl b/pipeline/src/constructors/make_model_priors.jl
@@ -8,16 +8,18 @@ parameters. This is the default method.
 # Returns
 A dictionary containing the following prior distributions:
 - `"transformed_process_init_prior"`: A normal distribution with mean 0.0 and
-standard deviation 0.25.
-- `"std_prior"`: A half-normal distribution with standard deviation 0.25.
-- `"damp_param_prior"`: A beta distribution with shape parameters 0.5 and 0.5.
+standard deviation 0.025.
+- `"std_prior"`: A half-normal distribution with standard deviation 0.025.
+- `"damp_param_prior"`: A beta distribution with shape parameters 1 and 9.
+- `"log_I0_prior"`: A normal distribution with mean log(100.0) and standard
+deviation 1e-1.
 
 """
 function make_model_priors(pipeline::AbstractEpiAwarePipeline)
     transformed_process_init_prior = Normal(0.0, 0.25)
-    std_prior = HalfNormal(0.25)
-    damp_param_prior = Beta(0.5, 0.5)
-    log_I0_prior = Normal(log(100.0), 1e-5)
+    std_prior = HalfNormal(0.025)
+    damp_param_prior = Beta(1, 9)
+    log_I0_prior = Normal(log(100.0), 1e-1)
 
     return Dict(
         "transformed_process_init_prior" => transformed_process_init_prior,

diff --git a/pipeline/src/infer/define_epiprob.jl b/pipeline/src/infer/define_epiprob.jl
@@ -1,3 +1,28 @@
+"""
+Create an `EpiData` object based on the provided generation interval mean and standard
+deviation by assuming a gamma distribution.
+
+# Arguments
+- `gi_mean::Float64`: Mean of the generation interval.
+- `gi_std::Float64`: Standard deviation of the generation interval.
+- `transformation`: A transformation function to be applied
+    (default is `EpiAwarePipeline.oneexpy` a custom implementation of `exp`).
+
+# Returns
+- `model_data::EpiData`: An `EpiData` object containing the generation interval distribution
+    and transformation.
+
+"""
+function _make_epidata(gi_mean, gi_std; transformation = EpiAwarePipeline.oneexpy)
+    shape = (gi_mean / gi_std)^2
+    scale = gi_std^2 / gi_mean
+    gen_distribution = Gamma(shape, scale)
+
+    model_data = EpiData(
+        gen_distribution = gen_distribution, transformation = transformation)
+    return model_data
+end
+
 """
 Create an `EpiProblem` object based on the provided `InferenceConfig`.
 
@@ -9,12 +34,8 @@ Create an `EpiProblem` object based on the provided `InferenceConfig`.
 
 """
 function define_epiprob(config::InferenceConfig)
-    shape = (config.gi_mean / config.gi_std)^2
-    scale = config.gi_std^2 / config.gi_mean
-    gen_distribution = Gamma(shape, scale)
-
-    model_data = EpiData(
-        gen_distribution = gen_distribution, transformation = config.transformation)
+    model_data = _make_epidata(
+        config.gi_mean, config.gi_std; transformation = config.transformation)
     #Build the epidemiological model
     epi = config.igp(data = model_data, initialisation_prior = config.log_I0_prior)
 

diff --git a/pipeline/test/Project.toml b/pipeline/test/Project.toml
@@ -2,9 +2,11 @@
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
 AbstractMCMC = "80f14c24-f653-4e6a-9b94-39d6b0f70001"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
+DataFramesMeta = "1313f7d8-7da2-5740-9ea0-a2ca25f37964"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 DrWatson = "634d3b9d-ee7a-5ddf-bec9-22491ea816e1"
 EpiAware = "b2eeebe4-5992-4301-9193-7ebc9f62c855"
+JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
 MCMCChains = "c7f686f2-ff18-58e9-bc7b-31028e88f75d"
 RCall = "6f49c342-dc21-5d91-9882-a32aef131414"

diff --git a/pipeline/test/analysis/make_prediction_dataframe_from_output.jl b/pipeline/test/analysis/make_prediction_dataframe_from_output.jl
@@ -0,0 +1,20 @@
+@testset "test dataframe construct for one dataset" begin
+    using JLD2, DataFramesMeta
+    output = load(joinpath(@__DIR__(), "test_data.jld2"))
+    true_mean_gi = 10.0
+
+    df = make_prediction_dataframe_from_output(output, true_mean_gi)
+    @test !isempty(df)
+    @test "Scenario" in names(df)
+    @test "IGP_Model" in names(df)
+    @test "Latent_Model" in names(df)
+    @test "True_GI_Mean" in names(df)
+    @test "Used_GI_Mean" in names(df)
+    @test "Reference_Time" in names(df)
+    @test "Target" in names(df)
+    @test "q_025" in names(df)
+    @test "q_25" in names(df)
+    @test "q_5" in names(df)
+    @test "q_75" in names(df)
+    @test "q_975" in names(df)
+end
diff --git a/pipeline/test/analysis/test_analysis.jl b/pipeline/test/analysis/test_analysis.jl
@@ -0,0 +1 @@
+include("make_prediction_dataframe_from_output.jl")
diff --git a/pipeline/test/analysis/test_data.jld2 b/pipeline/test/analysis/test_data.jld2
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		include("make_prediction_dataframe_from_output.jl")