From 054c2efdd6de132f50ebaf5b59de7f295ffbc8ac Mon Sep 17 00:00:00 2001
From: Alexander Held <45009355+alexander-held@users.noreply.github.com>
Date: Fri, 1 Oct 2021 14:29:46 +0200
Subject: [PATCH] feat: utility to match fit results to arbitrary model (#288)

* added model_utils.match_fit_results function to match fit results to a different model
* switched from use of np.diag to np.diagflat
* added warning to model_utils.prediction for mismatch of parameter names in fit results and model
* updated pre-commit
---
 .pre-commit-config.yaml      |  2 +-
 README.md                    |  4 +-
 src/cabinetry/model_utils.py | 72 ++++++++++++++++++++++++++++++-
 tests/test_model_utils.py    | 83 +++++++++++++++++++++++++++++++++---
 4 files changed, 152 insertions(+), 9 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1bcebd5b..6486c5f4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -22,7 +22,7 @@ repos:
   - id: flake8
     additional_dependencies: [flake8-bugbear, flake8-import-order, flake8-print]
 - repo: https://github.com/asottile/pyupgrade
-  rev: v2.26.0
+  rev: v2.29.0
   hooks:
   - id: pyupgrade
     args: ["--py37-plus"]
diff --git a/README.md b/README.md
index 042d7ffc..949f9c57 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,9 @@
 [![CI status](https://github.com/scikit-hep/cabinetry/workflows/CI/badge.svg)](https://github.com/scikit-hep/cabinetry/actions?query=workflow%3ACI)
 [![Documentation Status](https://readthedocs.org/projects/cabinetry/badge/?version=latest)](https://cabinetry.readthedocs.io/en/latest/?badge=latest)
-[![codecov](https://codecov.io/gh/scikit-hep/cabinetry/branch/master/graph/badge.svg)](https://codecov.io/gh/scikit-hep/cabinetry)
+[![Codecov](https://codecov.io/gh/scikit-hep/cabinetry/branch/master/graph/badge.svg)](https://codecov.io/gh/scikit-hep/cabinetry)
 [![PyPI version](https://badge.fury.io/py/cabinetry.svg)](https://badge.fury.io/py/cabinetry)
-[![python version](https://img.shields.io/pypi/pyversions/cabinetry.svg)](https://pypi.org/project/cabinetry/)
+[![Python version](https://img.shields.io/pypi/pyversions/cabinetry.svg)](https://pypi.org/project/cabinetry/)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4742752.svg)](https://doi.org/10.5281/zenodo.4742752)
diff --git a/src/cabinetry/model_utils.py b/src/cabinetry/model_utils.py
index b62bb90f..b9ed58b1 100644
--- a/src/cabinetry/model_utils.py
+++ b/src/cabinetry/model_utils.py
@@ -269,7 +269,7 @@ def yield_stdev(
     labels = model.config.par_names()

     # continue with off-diagonal contributions if there are any
-    if np.count_nonzero(corr_mat - np.diag(np.ones_like(parameters))) > 0:
+    if np.count_nonzero(corr_mat - np.diagflat(np.ones_like(parameters))) > 0:
         # loop over pairs of parameters
         for i_par in range(model.config.npars):
             for j_par in range(model.config.npars):
@@ -336,6 +336,8 @@ def prediction(
         ModelPrediction: model, yields and uncertainties per bin and channel
     """
     if fit_results is not None:
+        if fit_results.labels != model.config.par_names():
+            log.warning("parameter names in fit results and model do not match")
         # fit results specified, so they are used
         param_values = fit_results.bestfit
         param_uncertainty = fit_results.uncertainty
@@ -484,3 +486,71 @@ def _filter_channels(
     )

     return filtered_channels
+
+
+def match_fit_results(model: pyhf.pdf.Model, fit_results: FitResults) -> FitResults:
+    """Matches results from a fit to a model by adding or removing parameters as needed.
+
+    If the fit results contain parameters that are missing in the model, these
+    parameters are not included in the returned fit results. If the fit results do not
+    include parameters used in the model, they are added to the fit results. The
+    best-fit values for such parameters are the Asimov values as returned by
+    ``asimov_parameters`` (initial parameter settings for unconstrained parameters),
+    and the associated uncertainties are those given by ``prefit_uncertainties`` (zero
+    uncertainty for unconstrained or fixed parameters). These parameters are
+    furthermore assumed to have no correlation with any other parameters. If required,
+    parameters are re-ordered to match the target model.
+
+    Args:
+        model (pyhf.pdf.Model): model to match fit results to
+        fit_results (FitResults): fit results to be updated in order to match the model
+
+    Returns:
+        FitResults: fit results matching the model
+    """
+    # start with the assumption that the provided fit results contain no relevant
+    # information for the target model at all, and initialize everything accordingly,
+    # then inject the information contained in the provided fit results and return the
+    # new fit results matching the target model at the end
+
+    bestfit = asimov_parameters(model)  # Asimov parameter values for target model
+    uncertainty = prefit_uncertainties(model)  # pre-fit uncertainties for target model
+    labels = model.config.par_names()  # labels for target model
+
+    # indices of parameters in current fit results, or None if they are missing
+    indices_for_corr: List[Optional[int]] = [None] * len(labels)
+
+    # loop over all required parameters
+    for target_idx, target_label in enumerate(labels):
+        # if a parameter is missing in the fit results, no further action is needed
+        # here, as all relevant objects have been initialized with that assumption
+        if target_label in fit_results.labels:
+            # fit results contain the parameter, find its position
+            idx_in_fit_results = fit_results.labels.index(target_label)
+            # override the pre-fit values with the actual fit results for the parameter
+            bestfit[target_idx] = fit_results.bestfit[idx_in_fit_results]
+            uncertainty[target_idx] = fit_results.uncertainty[idx_in_fit_results]
+            # update indices for correlation matrix reconstruction
+            indices_for_corr[target_idx] = idx_in_fit_results
+
+    # re-build the correlation matrix: start with a diagonal matrix (assuming the fit
+    # results contain no relevant information), then insert the values provided in the
+    # fit results
+    corr_mat = np.diagflat(np.ones_like(labels, dtype=float))
+    for i_target, i_orig in enumerate(indices_for_corr):
+        for j_target, j_orig in enumerate(indices_for_corr):
+            # i_target and j_target are positions in the matched correlation matrix,
+            # i_orig and j_orig are positions in the old matrix (None if the parameter
+            # is not part of the original fit results, in which case no update to the
+            # correlation matrix is needed)
+            if i_orig is not None and j_orig is not None:
+                corr_mat[i_target][j_target] = fit_results.corr_mat[i_orig][j_orig]
+
+    fit_results_matched = FitResults(
+        np.asarray(bestfit),
+        np.asarray(uncertainty),
+        labels,
+        corr_mat,
+        fit_results.best_twice_nll,
+        fit_results.goodness_of_fit,
+    )
+    return fit_results_matched
diff --git a/tests/test_model_utils.py b/tests/test_model_utils.py
index 4bb12425..86cc28e0 100644
--- a/tests/test_model_utils.py
+++ b/tests/test_model_utils.py
@@ -164,7 +164,7 @@ def test_yield_stdev(example_spec, example_spec_multibin):
     # pre-fit
     parameters = np.asarray([1.0, 1.0])
     uncertainty = np.asarray([0.0495665682, 0.0])
-    diag_corr_mat = np.diag([1.0, 1.0])
+    diag_corr_mat = np.diagflat([1.0, 1.0])
     total_stdev_bin, total_stdev_chan = model_utils.yield_stdev(
         model, parameters, uncertainty, diag_corr_mat
     )
@@ -213,7 +213,8 @@ def test_yield_stdev(example_spec, example_spec_multibin):
     "cabinetry.model_utils.prefit_uncertainties", return_value=([0.04956657, 0.0])
 )
 @mock.patch("cabinetry.model_utils.asimov_parameters", return_value=([1.0, 1.0]))
-def test_prediction(mock_asimov, mock_unc, mock_stdev, example_spec):
+def test_prediction(mock_asimov, mock_unc, mock_stdev, caplog, example_spec):
+    caplog.set_level(logging.DEBUG)
     model = pyhf.Workspace(example_spec).model()

     # pre-fit prediction
@@ -242,7 +243,7 @@ def test_prediction(mock_asimov, mock_unc, mock_stdev, example_spec):
     fit_results = FitResults(
         np.asarray([1.01, 1.1]),
         np.asarray([0.03, 0.1]),
-        [],
+        ["staterror_Signal-Region[0]", "Signal strength"],
         np.asarray([[1.0, 0.2], [0.2, 1.0]]),
         0.0,
     )
@@ -252,6 +253,9 @@ def test_prediction(mock_asimov, mock_unc, mock_stdev, example_spec):
     assert model_pred.total_stdev_model_bins == [[0.3]]  # from mock
     assert model_pred.total_stdev_model_channels == [0.3]  # from mock
     assert model_pred.label == "post-fit"
+    assert "parameter names in fit results and model do not match" not in [
+        rec.message for rec in caplog.records
+    ]

     assert mock_asimov.call_count == 1  # no new call
     assert mock_unc.call_count == 1  # no new call
@@ -266,9 +270,22 @@ def test_prediction(mock_asimov, mock_unc, mock_stdev, example_spec):
     )
     assert mock_stdev.call_args_list[1][1] == {}

-    # custom label
-    model_pred = model_utils.prediction(model, label="abc")
+    caplog.clear()
+
+    # custom prediction label, mismatch in parameter names
+    fit_results = FitResults(
+        np.asarray([1.01, 1.1]),
+        np.asarray([0.03, 0.1]),
+        ["a", "b"],
+        np.asarray([[1.0, 0.2], [0.2, 1.0]]),
+        0.0,
+    )
+    model_pred = model_utils.prediction(model, fit_results=fit_results, label="abc")
+    assert "parameter names in fit results and model do not match" in [
+        rec.message for rec in caplog.records
+    ]
     assert model_pred.label == "abc"
+    caplog.clear()


 def test_unconstrained_parameter_count(example_spec, example_spec_shapefactor):
@@ -342,3 +359,59 @@ def test__filter_channels(example_spec_multibin, caplog):
         "['region_1', 'region_2']" in [rec.message for rec in caplog.records]
     )
     caplog.clear()
+
+
+@mock.patch(
+    "cabinetry.model_utils.prefit_uncertainties",
+    side_effect=[[0.4, 0.5, 0.6], [0.4, 0.5], [0.4, 0.5, 0.6]],
+)
+@mock.patch(
+    "cabinetry.model_utils.asimov_parameters",
+    side_effect=[[4.0, 5.0, 6.0], [4.0, 5.0], [4.0, 5.0, 6.0]],
+)
+def test_match_fit_results(mock_pars, mock_uncs):
+    mock_model = mock.MagicMock()
+    fit_results = FitResults(
+        np.asarray([1.0, 2.0, 3.0]),
+        np.asarray([0.1, 0.2, 0.3]),
+        ["par_a", "par_b", "par_c"],
+        np.asarray([[1.0, 0.2, 0.5], [0.2, 1.0, 0.1], [0.5, 0.1, 1.0]]),
+        5.0,
+        0.1,
+    )
+
+    # remove par_a, flip par_b and par_c, add par_d
+    mock_model.config.par_names.return_value = ["par_c", "par_d", "par_b"]
+    matched_fit_res = model_utils.match_fit_results(mock_model, fit_results)
+    assert mock_pars.call_args_list == [((mock_model,), {})]
+    assert mock_uncs.call_args_list == [((mock_model,), {})]
+    assert np.allclose(matched_fit_res.bestfit, [3.0, 5.0, 2.0])
+    assert np.allclose(matched_fit_res.uncertainty, [0.3, 0.5, 0.2])
+    assert matched_fit_res.labels == ["par_c", "par_d", "par_b"]
+    assert np.allclose(
+        matched_fit_res.corr_mat, [[1.0, 0.0, 0.1], [0.0, 1.0, 0.0], [0.1, 0.0, 1.0]]
+    )
+    assert matched_fit_res.best_twice_nll == 5.0
+    assert matched_fit_res.goodness_of_fit == 0.1
+
+    # all parameters are new
+    mock_model.config.par_names.return_value = ["par_d", "par_e"]
+    matched_fit_res = model_utils.match_fit_results(mock_model, fit_results)
+    assert np.allclose(matched_fit_res.bestfit, [4.0, 5.0])
+    assert np.allclose(matched_fit_res.uncertainty, [0.4, 0.5])
+    assert matched_fit_res.labels == ["par_d", "par_e"]
+    assert np.allclose(matched_fit_res.corr_mat, [[1.0, 0.0], [0.0, 1.0]])
+    assert matched_fit_res.best_twice_nll == 5.0
+    assert matched_fit_res.goodness_of_fit == 0.1
+
+    # fit results already match the model exactly
+    mock_model.config.par_names.return_value = ["par_a", "par_b", "par_c"]
+    matched_fit_res = model_utils.match_fit_results(mock_model, fit_results)
+    assert np.allclose(matched_fit_res.bestfit, [1.0, 2.0, 3.0])
+    assert np.allclose(matched_fit_res.uncertainty, [0.1, 0.2, 0.3])
+    assert matched_fit_res.labels == ["par_a", "par_b", "par_c"]
+    assert np.allclose(
+        matched_fit_res.corr_mat, [[1.0, 0.2, 0.5], [0.2, 1.0, 0.1], [0.5, 0.1, 1.0]]
+    )
+    assert matched_fit_res.best_twice_nll == 5.0
+    assert matched_fit_res.goodness_of_fit == 0.1
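
For reference, a minimal usage sketch of the new model_utils.match_fit_results utility (not part of the patch above): the pyhf.simplemodels model, the parameter name "norm_bkg", and all numeric values are made up for illustration, and the FitResults import path may differ slightly depending on the cabinetry version. Matching fit results to the target model also avoids the parameter-name mismatch warning that this patch adds to model_utils.prediction.

import numpy as np
import pyhf

from cabinetry import model_utils
from cabinetry.fit import FitResults  # import path may vary between cabinetry versions

# a small stand-in model with parameters "mu" and "uncorr_bkguncrt[0]"
model = pyhf.simplemodels.uncorrelated_background(
    signal=[5.0], bkg=[50.0], bkg_uncertainty=[7.0]
)

# hypothetical results from a fit of a different model: "mu" is shared with the model
# above, "norm_bkg" is not part of it, and "uncorr_bkguncrt[0]" is missing here
fit_results = FitResults(
    np.asarray([1.2, 0.9]),                # best-fit parameter values
    np.asarray([0.3, 0.1]),                # parameter uncertainties
    ["mu", "norm_bkg"],                    # parameter names
    np.asarray([[1.0, 0.4], [0.4, 1.0]]),  # correlation matrix
    17.5,                                  # best-fit -2 log(likelihood)
)

# "norm_bkg" is dropped, "uncorr_bkguncrt[0]" is added with its Asimov value and
# pre-fit uncertainty, and everything is ordered according to the target model
matched = model_utils.match_fit_results(model, fit_results)
print(matched.labels)   # parameter names of the target model
print(matched.bestfit)  # "mu" carried over (1.2), Asimov value for the added parameter

# the matched results can then be used for a post-fit prediction of the target model
# without triggering the parameter-name mismatch warning added in this patch
model_pred = model_utils.prediction(model, fit_results=matched)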