From 91b1ee5453e2f5b7a6c15064e0462f36e033cf7e Mon Sep 17 00:00:00 2001
From: andreab1997
Date: Tue, 20 Feb 2024 15:55:15 +0100
Subject: [PATCH 01/14] Init analysis

---
 validphys2/src/validphys/kinematics.py | 46 ++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py
index 295bcd30fb..f4869367c4 100644
--- a/validphys2/src/validphys/kinematics.py
+++ b/validphys2/src/validphys/kinematics.py
@@ -97,6 +97,30 @@ def kinlimits(commondata, cuts, use_cuts, use_kinoverride: bool = True):
 
 all_kinlimits = collect(kinlimits, ('dataset_inputs',))
 
+def xq2_dataset_map(commondata, cuts, internal_multiclosure_dataset_loader,
+                    _internal_max_reps=None,
+                    _internal_min_reps=20):
+    """
+    Collect in a dictionary the information needed to place a dataset on the (x, Q^2) plane:
+    - the dataset name and process type
+    - the (x, Q^2) coordinates of each point
+    - the standard deviation of the multiclosure central deltas
+    - the mean of the multiclosure central deltas
+    - the xi estimator
+    """
+    xq2_map_obj = xq2map_with_cuts(commondata, cuts)
+    coords = xq2_map_obj[2]
+    central_deltas = fits_normed_dataset_central_delta(internal_multiclosure_dataset_loader)
+    std_devs = np.std(central_deltas, axis=0)
+    means = np.mean(central_deltas, axis=0)
+    xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader, False))
+    #import ipdb; ipdb.set_trace()
+    # DY observables have two (x, Q) points per experimental point, so duplicate the stats
+    if coords[0].shape[0] != std_devs.shape[0]:
+        std_devs = np.concatenate((std_devs, std_devs))
+        means = np.concatenate((means, means))
+        xi = np.concatenate((xi, xi))
+    return {'x_coords': coords[0], 'Q_coords': coords[1], 'std_devs': std_devs, 'name': commondata.name, 'process': commondata.process_type, 'means': means, 'xi': xi}
 
 @table
 def all_kinlimits_table(all_kinlimits, use_kinoverride: bool = True):
@@ -186,6 +210,28 @@ def xq2map_with_cuts(commondata, cuts, group_name=None):
         'data_input',
     ),
 )
+xq2_data_map = collect("xq2_dataset_map", ("data",))
+
+@figuregen
+def xq2_data_prcs_maps(xq2_data_map):
+    import matplotlib.pyplot as plt
+    import matplotlib.colors
+    from matplotlib.colors import ListedColormap
+    keys = ["std_devs", "xi"]
+    for elem in xq2_data_map:
+        for k in keys:
+            fig, ax = plotutils.subplots()
+            im = ax.scatter(elem['x_coords'], elem['Q_coords'],
+                            c=np.asarray(elem[k]),
+                            cmap='viridis',
+                            label=elem['name'])
+            fig.colorbar(im, label=k)
+            ax.set_xscale('log')
+            ax.set_yscale('log')
+            ax.set_xlabel('x')
+            ax.set_ylabel('$Q^2$ (GeV$^2$)')
+            ax.set_title(elem["name"] + " " + k)
+            yield fig
 
 def kinematics_table_notable(commondata, cuts, show_extra_labels: bool = False):
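The shape-alignment step inside xq2_dataset_map above can be exercised on its own: when a dataset maps every experimental point to two (x, Q^2) pairs (as for DY rapidity distributions), the per-point statistics are tiled so that they stay aligned with the coordinates. A minimal standalone sketch, with toy arrays standing in for the validphys objects:

import numpy as np

# two (x, Q2) pairs per data point, e.g. a DY observable with 2 points
x_coords = np.array([0.1, 0.2, 0.1, 0.2])
# one statistic per data point
std_devs = np.array([0.5, 0.7])

if x_coords.shape[0] != std_devs.shape[0]:
    # duplicate the per-point statistics so they match the doubled coordinates
    std_devs = np.concatenate((std_devs, std_devs))

assert x_coords.shape[0] == std_devs.shape[0]
print(std_devs)  # [0.5 0.7 0.5 0.7]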
From ab5d6d14a6ad1c1ebe6c440c9f31988422847f49 Mon Sep 17 00:00:00 2001
From: andreab1997
Date: Tue, 20 Feb 2024 16:15:05 +0100
Subject: [PATCH 02/14] Add other functions

---
 .../multiclosure_inconsistent.py | 362 ++++++++++++++++++
 .../multiclosure_inconsistent_output.py | 54 +++
 .../src/validphys/closuretest/multiclosure.py | 35 ++
 validphys2/src/validphys/kinematics.py | 7 +-
 4 files changed, 457 insertions(+), 1 deletion(-)
 create mode 100644 validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
 create mode 100644 validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
new file mode 100644
index 0000000000..4cb12f2d30
--- /dev/null
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -0,0 +1,362 @@
+"""
+closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+
+Module containing all of the statistical estimators which are
+averaged across multiple inconsistent fits. The actions
+in this module are used to produce results which are plotted in
+``multiclosure_inconsistent_output.py``
+
+"""
+
+import numpy as np
+from sklearn.decomposition import PCA
+from sklearn import preprocessing
+
+from validphys import covmats
+from validphys.calcutils import calc_chi2
+from validphys.results import ThPredictionsResult
+
+from reportengine import collect
+
+
+# Collectors over dataspecs, used to load several multiclosure fits at once.
+# Useful for the inconsistent closure test analysis.
+multi_dataset_loader = collect("internal_multiclosure_dataset_loader", ("dataspecs",))
+
+multi_dataset_fits_bias_replicas_variance_samples = collect(
+    "dataset_fits_bias_replicas_variance_samples", ("dataspecs",)
+)
+
+multi_fits_bootstrap_dataset_bias_variance = collect(
+    "fits_bootstrap_dataset_bias_variance", ("dataspecs",)
+)
+
+multi_bias_variance_resampling_dataset = collect("bias_variance_resampling_dataset", ("dataspecs",))
+
+multi_dataset_fits_bias_variance_samples_pca = collect(
+    "dataset_fits_bias_variance_samples_pca", ("dataspecs",)
+)
+
+
+def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.93):
+    """
+    Compute the principal components of the theory predictions replica matrix
+    (Ndat x Nrep feature matrix).
+
+    Parameters
+    ----------
+    dataset: (DataSetSpec, DataGroupSpec)
+        dataset for which the theory predictions and t0 covariance matrix
+        will be loaded. Note that due to the structure of `validphys` this
+        function can be overloaded to accept a DataGroupSpec.
+
+    fits_pdf: list
+        list of PDF objects produced from performing multiple closure test
+        fits. Each fit should have a different filterseed but the same
+        underlying law used to generate the pseudodata.
+
+    variancepdf: validphys.core.PDF
+        PDF object used to estimate the variance of the fits.
+
+    explained_variance_ratio: float, default is 0.93
+        fraction of the replica variance that the kept principal components
+        must explain.
+
+    Returns
+    -------
+    tuple
+        3-tuple:
+        - matrix of the principal components (PCs) of shape (N_pc, N_dat)
+        - reduced feature matrix, i.e., feature matrix projected onto PCs of shape (N_pc, N_rep)
+        - number of principal components kept
+
+    """
+    # fits_dataset_predictions = [
+    #     ThPredictionsResult.from_convolution(pdf, dataset) for pdf in fits_pdf
+    # ]
+
+    # dimensions here are (Nfits, Ndat, Nrep)
+    # reps = np.asarray([th.error_members for th in fits_dataset_predictions])
+
+    # reshape so as to get PCs from all the samples
+    # reps = reps.reshape(reps.shape[1], -1)
+
+    # get replicas from the variance fit, used to estimate the variance
+    reps = ThPredictionsResult.from_convolution(variancepdf, dataset).error_members
+
+    # rescale feature matrix
+    reps_scaled = reps  # preprocessing.scale(reps)
+
+    # choose the number of principal components (PCs) based on the explained variance ratio
+    n_comp = 1
+    for _ in range(reps.shape[0]):
+        pca = PCA(n_comp).fit(reps_scaled.T)
+        if np.sum(pca.explained_variance_ratio_) >= explained_variance_ratio:
+            break
+        n_comp += 1
+
+    # project feature matrix onto PCs
+    pc_reps = pca.components_ @ reps
+
+    return pca.components_, pc_reps, n_comp
+
+
+def principal_components_bias_variance_dataset(
+    internal_multiclosure_dataset_loader, principal_components_dataset
+):
+    """
+    Compute the bias and variance of the multiclosure fits in the space
+    spanned by the principal components of the replica predictions.
+
+    Returns
+    -------
+    tuple
+        (biases, variances, n_comp): biases and variances are arrays of shape
+        (Nfits,); both are None when fewer than two principal components are kept.
+    """
+
+    closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
+
+    reps = np.asarray([th.error_members for th in closures_th])
+
+    pc_basis, pc_reps, n_comp = principal_components_dataset
+
+    if n_comp <= 1:
+        return None, None, n_comp
+
+    # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc)
+    covmat_pdf = np.cov(pc_reps)
+    sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf)
+
+    # compute bias diff and project it onto the space spanned by the PCs
+    delta_bias = reps.mean(axis=2).T - law_th.central_value[:, np.newaxis]
+    # shape here is (Npc, Nfits)
+    delta_bias = pc_basis @ delta_bias
+
+    # compute biases, shape of biases is (Nfits)
+    biases = calc_chi2(sqrt_covmat_pdf, delta_bias)
+
+    variances = []
+    for i in range(reps.shape[0]):
+        diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True))
+        variances.append(np.mean(calc_chi2(sqrt_covmat_pdf, diffs)))
+
+    return biases, np.asarray(variances), n_comp
+
+
+principal_components_bias_variance_datasets = collect(
+    "principal_components_bias_variance_dataset", ("data",)
+)
+
+
+def principal_components_variance_distribution_dataset(
+    internal_multiclosure_dataset_loader, principal_components_dataset
+):
+    """
+    Compute, for each fit, the distribution over replicas of the variances
+    in the space spanned by the principal components.
+
+    Returns
+    -------
+    tuple
+        (variances, n_comp); variances is None when fewer than two principal
+        components are kept.
+    """
+
+    closures_th, _, _, _ = internal_multiclosure_dataset_loader
+
+    reps = np.asarray([th.error_members for th in closures_th])
+
+    pc_basis, pc_reps, n_comp = principal_components_dataset
+
+    if n_comp <= 1:
+        return None, n_comp
+    # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc)
+    covmat_pdf = np.cov(pc_reps)
+    sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf)
+
+    variances = []
+    for i in range(reps.shape[0]):
+        diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True))
+        variances.append([calc_chi2(sqrt_covmat_pdf, diffs)])
+
+    return np.asarray(variances), n_comp
+
+
+principal_components_variance_distribution_datasets = collect(
+    "principal_components_variance_distribution_dataset", ("data",)
+)
+
+
+def compute_num_components(covariance_matrix, threshold=0.99):
+    """
+    Compute the number of principal components to keep based on a desired
+    explained variance threshold.
+
+    Parameters
+    ----------
+    covariance_matrix : np.ndarray
+        2-D covariance matrix.
+    threshold : float
+        desired explained variance threshold (between 0 and 1).
+
+    Returns
+    -------
+    int
+        num_components: number of principal components to keep.
+
+    """
+    eig_val, eig_vec = np.linalg.eig(covariance_matrix)
+    idx = eig_val.argsort()[::-1]
+    eig_val = eig_val[idx]
+    eig_vec = eig_vec[:, idx]
+
+    cumulative_sum = np.cumsum(eig_val)
+    total_sum = np.sum(eig_val)
+    # argmin returns a 0-based index: keeping i + 1 eigenvalues gives the cumulative
+    # explained variance closest to the threshold
+    num_components = int(np.argmin(np.abs(cumulative_sum / total_sum - threshold))) + 1
+
+    return num_components
+
+
+def pca_covmat(X, num_components):
+    """
+    Given data X of shape (n, p), reduce its dimension to
+    (n, num_components) and return the covariance matrix
+    of the reduced data matrix.
+
+    Parameters
+    ----------
+    X : np.ndarray
+        data matrix of shape (n, p).
+    num_components : int
+        dimension of the reduced space.
+
+    Returns
+    -------
+    np.ndarray
+        covariance matrix of shape (num_components, num_components).
+    """
+    pca = PCA(num_components)
+    X_reduced = pca.fit_transform(X)
+    covariance = np.dot(X_reduced.T, X_reduced) / (X_reduced.shape[0] - 1)
+    return covariance
+
+
+def calc_chi2_pca(pdf_cov, diff, num_components):
+    """
+    Computes the chi2 between pdf_cov and diff by first projecting
+    them onto num_components PCs.
+
+    Parameters
+    ----------
+    pdf_cov: np.ndarray
+        covariance matrix to be diagonalised.
+    diff: np.ndarray
+        differences to be tested against the covariance matrix.
+    num_components: int
+        number of eigenvectors kept in the projection.
+
+    Returns
+    -------
+    float or np.ndarray depending on input
+
+    """
+    # Diagonalize the matrix
+    L, W = np.linalg.eig(pdf_cov)
+
+    # Sort the eigenvalues and eigenvectors from largest to smallest
+    idx = L.argsort()[::-1]
+    L = L[idx]
+    W = W[:, idx]
+
+    # Keep only the num_components largest eigenvectors
+    Wtilde = W[:, :num_components]
+
+    # Transform the initial covariance matrix
+    pdf_cov_pca = np.einsum("ij,jk->ik", np.einsum("ij,ik->jk", Wtilde, pdf_cov), Wtilde).real
+
+    # transform the data
+    diff_pca = diff.T @ Wtilde
+
+    try:
+        sqrt_pdf_cov_pca = covmats.sqrt_covmat(pdf_cov_pca)
+    except Exception as e:
+        raise ValueError(
+            f"PCA covariance matrix cannot be Cholesky decomposed, perhaps fewer than {num_components} PCs should be kept"
+        ) from e
+
+    return np.real(calc_chi2(sqrt_pdf_cov_pca, diff_pca.T))
+
+
+def dataset_fits_bias_variance_samples_pca(internal_multiclosure_dataset_loader, threshold=0.99):
+    """
+    Similar to `dataset_fits_bias_replicas_variance_samples`, but with the
+    covariance matrix reduced by PCA.
+
+    Returns
+    -------
+    tuple
+        3-tuple:
+        - biases: 1-D array of shape (Nfits,)
+        - variances: 1-D array of shape (Nfits,)
+        - n_eig: number of eigenvalues kept in PCA, i.e.,
+          ndata in the new, lower dimensional, space.
+        Note that we return Nfits values of the variance so that computing the
+        bias-variance ratio is straightforward.
+    """
+    closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
+
+    # The dimensions here are (fit, data point, replica)
+    reps = np.asarray([th.error_members for th in closures_th])
+
+    # take the mean across replicas - since we might have changed the number of replicas
+    centrals = reps.mean(axis=2)
+
+    # compute the PDF covariance matrix of the central samples
+    if centrals.shape[0] <= 1:
+        raise ValueError("Need more than one fit to compute the 'bias' PDF covariance matrix")
+
+    pdf_cov_bias = np.cov(centrals.T)
+
+    # find the number of (ordered) eigenvalues that explain a fraction `threshold`
+    # of the total variance (total sum of eigenvalues)
+    n_eig = compute_num_components(pdf_cov_bias, threshold)
+
+    # compute bias from the PCs
+    diffs_bias = centrals.T - law_th.central_value[:, np.newaxis]
+    biases = calc_chi2_pca(pdf_cov_bias, diffs_bias, n_eig)
+
+    # compute variances from the PCs
+    variances = []
+
+    # loop over fits to compute variances
+    for i in range(reps.shape[0]):
+        diffs_var = reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
+        pdf_cov_var = np.cov(reps[i, :, :])
+
+        variances.append(np.mean(calc_chi2_pca(pdf_cov_var, diffs_var, n_eig)))
+
+    return biases, np.asarray(variances), n_eig
+
+
+def expected_dataset_fits_bias_variance_samples_pca(dataset_fits_bias_variance_samples_pca):
+    """
+    Returns the expected values of bias and variance as well as the number of
+    principal components.
+    """
+    biases, variances, n_eig = dataset_fits_bias_variance_samples_pca
+    return np.mean(biases), np.mean(variances), n_eig
+
+
+expected_datasets_fits_bias_variance_samples_pca = collect(
+    "expected_dataset_fits_bias_variance_samples_pca", ("data",)
+)
+
+
+def dataset_fits_ratio_bias_variance_samples_pca(dataset_fits_bias_variance_samples_pca):
+    """Compute, fit by fit, the square root of the bias-variance ratio from the PCA samples."""
+    biases, variances, _ = dataset_fits_bias_variance_samples_pca
+    sqrt_ratios = np.sqrt(biases / variances)
+    return sqrt_ratios
+
+
+def dataset_fits_gaussian_parameters(internal_multiclosure_dataset_loader, threshold=0.99):
+    """
+    Returns the parameters of the multi-Gaussian distribution of the replicas
+    and of the central replicas.
+    """
+    closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
+
+    # The dimensions here are (fit, data point, replica)
+    reps = np.asarray([th.error_members for th in closures_th])
+
+    # take the mean across replicas - since we might have changed the number of replicas
+    centrals = reps.mean(axis=2)
+
+    centrals_covmat = np.cov(centrals.T)
+    centrals_covmat = pca_covmat(
+        centrals, num_components=compute_num_components(centrals_covmat, threshold)
+    )
+    mean_centrals = np.mean(centrals, axis=0)
+
+    replicas_covmat = 0
+    for i in range(reps.shape[0]):
+        # the per-fit covariance is only used to choose the number of components;
+        # keep it in its own variable so the accumulator is not overwritten
+        fit_covmat = np.cov(reps[i, :, :])
+        replicas_covmat += pca_covmat(
+            reps[i, :, :].T, num_components=compute_num_components(fit_covmat, threshold)
+        )
+    replicas_covmat /= reps.shape[0]
+    mean_replicas = np.mean(reps.reshape(reps.shape[1], -1), axis=1)
+
+    return mean_centrals, centrals_covmat, mean_replicas, replicas_covmat
\ No newline at end of file
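The projection implemented in calc_chi2_pca can be sanity-checked outside validphys: keeping all eigenvectors must reproduce the full chi2, since an orthogonal change of basis preserves it. A minimal sketch with a toy covariance matrix, using plain numpy in place of covmats.sqrt_covmat:

import numpy as np

rng = np.random.default_rng(0)
A = rng.normal(size=(5, 5))
cov = A @ A.T + 5 * np.eye(5)        # well-conditioned SPD "covariance"
diff = rng.normal(size=5)

chi2_full = diff @ np.linalg.inv(cov) @ diff

_, W = np.linalg.eigh(cov)           # symmetric case, so eigh is enough here
cov_pca = W.T @ cov @ W              # diagonal up to rounding
diff_pca = W.T @ diff
chi2_pca = diff_pca @ np.linalg.inv(cov_pca) @ diff_pca

print(np.isclose(chi2_full, chi2_pca))   # True: no information lost when all PCs are kept

Truncating W to fewer columns then gives the reduced chi2 used above, at the price of discarding the directions with the smallest eigenvalues.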
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
new file mode 100644
index 0000000000..2f057510ab
--- /dev/null
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -0,0 +1,54 @@
+import numpy as np
+import pandas as pd
+from scipy import stats
+
+from reportengine.table import table
+
+from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import principal_components_dataset
+
+@table
+def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset):
+    """
+    Tabulate, for each dataset, the expected bias, expected variance and their
+    ratio in the space of the kept principal components.
+    """
+    records = []
+    for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset):
+        biases, variances, n_comp = pc_bias_var_dataset
+
+        try:
+            bias = np.mean(biases)
+            variance = np.mean(variances)
+            rbv = bias / variance
+            sqrt_rbv = np.sqrt(bias / variance)
+            records.append(
+                dict(
+                    dataset=str(ds),
+                    dof=n_comp,
+                    bias=bias,
+                    variance=variance,
+                    ratio=rbv,
+                    ratio_sqrt=sqrt_rbv,
+                )
+            )
+
+        except TypeError:
+            # biases and variances are None when n_comp <= 1
+            records.append(
+                dict(
+                    dataset=str(ds),
+                    dof=n_comp,
+                    bias=np.nan,
+                    variance=np.nan,
+                    ratio=np.nan,
+                    ratio_sqrt=np.nan,
+                ))
+
+
+    df = pd.DataFrame.from_records(
+        records,
+        index="dataset",
+        columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"),
+    )
+    df.columns = ["dof", "bias", "variance", "ratio", "sqrt(ratio)"]
+    return df
\ No newline at end of file
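As a back-of-the-envelope check of the numbers this table should produce: if, for a consistent closure test, the per-fit biases and mean variances both follow a chi2 distribution with dof equal to the number of kept components, the expected ratio comes out close to one. A toy sketch:

import numpy as np

rng = np.random.default_rng(1)
dof = 10
biases = rng.chisquare(dof, size=30)       # one bias per fit
variances = rng.chisquare(dof, size=30)    # one mean variance per fit

ratio = np.mean(biases) / np.mean(variances)
print(ratio, np.sqrt(ratio))               # both close to 1 for a consistent setup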
diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py
index 5b545569e2..64d26c4d69 100644
--- a/validphys2/src/validphys/closuretest/multiclosure.py
+++ b/validphys2/src/validphys/closuretest/multiclosure.py
@@ -147,6 +147,41 @@ def fits_dataset_bias_variance(
         variances.append(np.mean(calc_chi2(sqrtcov, diffs)))
     return biases, np.asarray(variances), len(law_th)
 
+@check_multifit_replicas
+def fits_normed_dataset_central_delta(
+    internal_multiclosure_dataset_loader,
+    _internal_max_reps=None,
+    _internal_min_reps=20):
+    """
+    For each fit, calculate the difference between the central expectation value and the true value,
+    normalised by the standard deviation of the replicas around the central value (different
+    for each fit but expected to vary only a little). Each central expectation value is
+    expected to be Gaussian-distributed around the true value set by the fakepdf.
+    """
+    closures_th, law_th, exp_cov, sqrtcov = internal_multiclosure_dataset_loader
+    # The dimensions here are (fit, data point, replica)
+    #import ipdb; ipdb.set_trace()
+    reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th])
+    # One could mask here some reps in order to avoid redundancy of information
+    #TODO
+
+    n_data = len(law_th)
+    n_fits = np.shape(reps)[0]
+    deltas = []
+    # there are n_fits PDF covariances, one per fit
+    # (a flag could be added here to eliminate problematic datasets)
+    for i in range(n_fits):
+        bias_diffs = np.mean(reps[i], axis=1) - law_th.central_value
+        # bias_diffs has shape (n_obs)
+        sigmas = np.sqrt(np.var(reps[i], axis=1))
+        # sigmas has shape (n_obs)
+        bias_diffs = bias_diffs / sigmas
+
+        deltas.append(bias_diffs.tolist())
+    # biases.shape = (n_fits, n_obs_cut/uncut)
+    # variances.shape = (n_fits, n_obs_cut/uncut, reps)
+    #import ipdb; ipdb.set_trace()
+    return np.asarray(deltas)
 
 def expected_dataset_bias_variance(fits_dataset_bias_variance):
     """For a given dataset calculate the expected bias and variance across fits
diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py
index f4869367c4..faf5acdbdb 100644
--- a/validphys2/src/validphys/kinematics.py
+++ b/validphys2/src/validphys/kinematics.py
@@ -13,8 +13,14 @@
 from reportengine import collect
 from reportengine.checks import check_positive
 from reportengine.table import table
+from validphys import plotutils
 from validphys import plotoptions
 from validphys.core import CutsPolicy
+from validphys.closuretest import fits_normed_dataset_central_delta
+from validphys.closuretest import dataset_xi
+from validphys.closuretest import dataset_replica_and_central_diff
+
+from reportengine.figure import figuregen
 
 log = logging.getLogger(__name__)
 
@@ -114,7 +120,6 @@ def xq2_dataset_map(commondata, cuts, internal_multiclosure_dataset_loader,
     std_devs = np.std(central_deltas, axis=0)
     means = np.mean(central_deltas, axis=0)
     xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader, False))
-    #import ipdb; ipdb.set_trace()
     # DY observables have two (x, Q) points per experimental point, so duplicate the stats
     if coords[0].shape[0] != std_devs.shape[0]:
         std_devs = np.concatenate((std_devs, std_devs))
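The statistic added to multiclosure.py above is easy to probe with synthetic inputs: when the fluctuation of each fit's central value matches the replica spread (bias equal to variance), the normalised deltas come out approximately standard normal. A toy sketch, with arrays standing in for the validphys loaders:

import numpy as np

rng = np.random.default_rng(2)
n_fits, n_obs, n_reps = 10, 50, 100
truth = np.zeros(n_obs)

# fit-level shifts of the central value, same size as the replica spread
shifts = rng.normal(size=(n_fits, n_obs))
reps = shifts[:, :, None] + rng.normal(size=(n_fits, n_obs, n_reps))

deltas = (reps.mean(axis=2) - truth) / reps.std(axis=2)
print(deltas.shape)   # (10, 50)
print(deltas.std())   # ~1 when bias and variance match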
From f9c5ade4a7525d81401455849152f3e5ba8b6890 Mon Sep 17 00:00:00 2001
From: andreab1997
Date: Tue, 20 Feb 2024 16:33:18 +0100
Subject: [PATCH 03/14] Add other functions again

---
 .../multiclosure_inconsistent_output.py | 292 +++++++++++++++++-
 1 file changed, 290 insertions(+), 2 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 2f057510ab..33310361d1 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -1,11 +1,87 @@
-import numpy as np
+"""
+multiclosure_inconsistent_output
+
+Module containing the actions which produce some output in validphys
+reports, i.e. figures or tables for (inconsistent) multiclosure
+estimators in the space of data
+
+"""
 import pandas as pd
+import numpy as np
 from scipy import stats
 
+from reportengine.figure import figure, figuregen
 from reportengine.table import table
 
+from validphys import plotutils
 from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import principal_components_dataset
 
+
+@figuregen
+def plot_bias_distribution_datasets(principal_components_bias_variance_datasets, each_dataset):
+    """
+    For each dataset, plot the distribution over fits of the biases computed in
+    the space of the kept principal components, compared to a chi2 distribution
+    with dof equal to the number of components.
+    """
+    for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset):
+        biases, variances, n_comp = pc_bias_var_dataset
+
+        try:
+            sqrt_rbv = np.sqrt(np.mean(biases) / np.mean(variances))
+            vals = np.linspace(0, 3 * n_comp, 100)
+            chi2_pdf = stats.chi2.pdf(vals, df=n_comp)
+            chi2_cdf = lambda x: stats.chi2.cdf(x, df=n_comp)
+            pvalue_ks = stats.kstest(biases, chi2_cdf).pvalue
+            fig, ax = plotutils.subplots()
+            ax.hist(biases, density=True, bins='auto', label=f"Bias {str(ds)}, p={pvalue_ks:.3f}")
+            ax.plot(vals, chi2_pdf, label=f"chi2, dof={n_comp}")
+            ax.plot([], [], 'ro', label=f"sqrt(Rbv) = {sqrt_rbv:.2f}")
+
+            ax.legend()
+
+            yield fig
+
+        except TypeError:
+            # biases and variances are None when n_comp <= 1
+            fig, ax = plotutils.subplots()
+            ax.plot([], [], label=f"Dataset: {str(ds)}")
+            ax.legend()
+            yield fig
+
+
+@figuregen
+def plot_variance_distribution_datasets(
+    principal_components_variance_distribution_datasets, each_dataset
+):
+    """
+    For each dataset and each fit, plot the distribution over replicas of the
+    variances computed in the space of the kept principal components, compared
+    to a chi2 distribution with dof equal to the number of components.
+    """
+
+    for pc_var_dataset, ds in zip(
+        principal_components_variance_distribution_datasets, each_dataset
+    ):
+        variances, n_comp = pc_var_dataset
+        try:
+            vals = np.linspace(0, 3 * n_comp, 100)
+            chi2_pdf = stats.chi2.pdf(vals, df=n_comp)
+            chi2_cdf = lambda x: stats.chi2.cdf(x, df=n_comp)
+
+            for i, var_fit in enumerate(variances):
+                pvalue_ks = stats.kstest(var_fit[0], chi2_cdf).pvalue
+
+                fig, ax = plotutils.subplots()
+                ax.hist(
+                    var_fit[0],
+                    density=True,
+                    bins='auto',
+                    label=f"Variance {str(ds)}, p={pvalue_ks:.3f}",
+                )
+                ax.plot(vals, chi2_pdf, label=f"chi2, dof={n_comp}")
+                ax.set_title(f"Fit {i}")
+                ax.legend()
+
+                yield fig
+        except TypeError:
+            # variances are None when n_comp <= 1
+            fig, ax = plotutils.subplots()
+            yield fig
+
 @table
 def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset):
     """
@@ -51,4 +127,216 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea
         columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"),
     )
     df.columns = ["dof", "bias", "variance", "ratio", "sqrt(ratio)"]
-    return df
\ No newline at end of file
+    return df
+
+
+@table
+def datasets_bias_variance_pca_table(
+    expected_datasets_fits_bias_variance_samples_pca, each_dataset
+):
+    """For each dataset calculate the expected bias and expected variance computed by
+    first projecting the covariance matrix into a lower dimensional space through PCA.
+
+    """
+    records = []
+    for ds, (bias, var, ndata) in zip(
+        each_dataset, expected_datasets_fits_bias_variance_samples_pca
+    ):
+        records.append(
+            dict(
+                dataset=str(ds),
+                ndata=ndata,
+                bias=bias,
+                variance=var,
+                ratio=bias / var,
+                ratio_sqrt=np.sqrt(bias / var),
+            )
+        )
+    df = pd.DataFrame.from_records(
+        records,
+        index="dataset",
+        columns=("dataset", "ndata", "bias", "variance", "ratio", "ratio_sqrt"),
+    )
+    df.columns = ["ndata", "bias", "variance", "ratio", "sqrt(ratio)"]
+    return df
+
+
+@figure
+def plot_sqrt_ratio_distribution_pca(dataset_fits_ratio_bias_variance_samples_pca):
+    """Histogram of the per-fit sqrt(bias/variance) ratios computed with the
+    PCA-reduced PDF covariance matrix."""
+    sqrt_ratios = dataset_fits_ratio_bias_variance_samples_pca
+    fig, ax = plotutils.subplots()
+    ax.hist(sqrt_ratios, bins='auto', density=True, alpha=0.5, label="sqrt_ratio")
+    ax.legend()
+    return fig
+
+
+@figure
+def plot_variance_distribution_multi(multi_dataset_fits_bias_replicas_variance_samples, dataspecs):
+    """
+    Histogram of the distribution of the variances (k), defined as the
+    distance between the central replica and replica (k) in units of the
+    experimental covariance matrix.
+    If more than one group of dataspecs is defined (e.g. consistent and
+    inconsistent fits), a comparison of these is plotted.
+    """
+    fig, ax = plotutils.subplots()
+    for (_, variance_dist, _), spec in zip(
+        multi_dataset_fits_bias_replicas_variance_samples, dataspecs
+    ):
+        label = spec['speclabel']
+
+        ax.hist(variance_dist, bins='auto', density=True, alpha=0.5, label=label)
+
+    ax.legend()
+    return fig
+
+
+@figure
+def plot_variance_distribution_pca_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs):
+    """
+    Like `plot_variance_distribution_multi`, but for the variance computed with the
+    covariance matrix predicted by the PDFs (and reduced with PCA).
+    """
+    return plot_variance_distribution_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs)
+
+
+@figure
+def plot_bias_distribution_multi(multi_dataset_fits_bias_replicas_variance_samples, dataspecs):
+    """
+    Histogram of the distribution of the biases (l), defined as the
+    distance between the central replica and the underlying law in units of the
+    experimental covariance matrix.
+    If more than one group of dataspecs is defined (e.g. consistent and
+    inconsistent fits), a comparison of these is plotted.
+    """
+    fig, ax = plotutils.subplots()
+    for (bias_dist, _, _), spec in zip(
+        multi_dataset_fits_bias_replicas_variance_samples, dataspecs
+    ):
+        label = spec['speclabel']
+
+        ax.hist(bias_dist, bins='auto', density=True, alpha=0.5, label=label)
+
+    ax.legend()
+    return fig
+
+
+@figure
+def plot_bias_distribution_pca_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs):
+    """
+    Like `plot_bias_distribution_multi`, but for the bias computed with the
+    covariance matrix predicted by the PDFs (and reduced with PCA).
+    """
+    return plot_bias_distribution_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs)
+
+
+@figure
+def plot_sqrt_ratio_distribution_pca_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs):
+    """
+    Histogram of the distribution of the sqrt ratio of bias and variance computed with
+    the estimated "PDF" covariance matrix (reduced with PCA).
+    If more than one group of dataspecs is defined (e.g. consistent and
+    inconsistent fits), a comparison of these is plotted.
+    """
+    fig, ax = plotutils.subplots()
+    ratios = []
+    labels = []
+    for (bias_dist, variance_dist, _), spec in zip(
+        multi_dataset_fits_bias_variance_samples_pca, dataspecs
+    ):
+        labels.append(spec['speclabel'])
+        ratios.append(np.sqrt(bias_dist / variance_dist))
+
+    ax.hist(ratios, bins='auto', density=True, label=labels)
+    ax.legend()
+    return fig
+
+
+@figure
+def plot_multi_bias_distribution_bootstrap(multi_fits_bootstrap_dataset_bias_variance, dataspecs):
+    """
+    Histogram of the distribution of the biases obtained by bootstrapping with
+    `fits_bootstrap_dataset_bias_variance` over the existing fits.
+
+    If more than one group of dataspecs is defined (e.g. consistent and
+    inconsistent fits), a comparison of these is plotted.
+    """
+    fig, ax = plotutils.subplots()
+    for (bias, _), spec in zip(multi_fits_bootstrap_dataset_bias_variance, dataspecs):
+        label = spec['speclabel']
+
+        ax.hist(bias, bins='auto', density=True, alpha=0.5, label=label)
+
+    ax.legend()
+    return fig
+
+
+@figure
+def plot_multi_ratio_bias_variance_distribution_bootstrap(
+    multi_fits_bootstrap_dataset_bias_variance, dataspecs
+):
+    """
+    Histogram of the bias-variance ratio distribution obtained by bootstrapping bias
+    and variance with `fits_bootstrap_dataset_bias_variance`.
+
+    If more than one group of dataspecs is defined (e.g. consistent and
+    inconsistent fits), a comparison of these is plotted.
+    """
+    fig, ax = plotutils.subplots()
+    for (bias, variance), spec in zip(multi_fits_bootstrap_dataset_bias_variance, dataspecs):
+        ratio = bias / variance
+        label = spec['speclabel']
+
+        ax.hist(ratio, bins='auto', density=True, alpha=0.5, label=label)
+
+    ax.legend()
+    return fig
+
+
+@figuregen
+def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf):
+    """
+    For each dataset, scan the explained variance ratio (EVR) and plot the L2
+    condition number of the PCA-reduced covariance matrix together with the
+    number of kept components (degrees of freedom).
+    """
+
+    evr_range = np.linspace(0.90, 0.995, 10)
+
+    for ds in each_dataset:
+        l2_cond = []
+        dof = []
+
+        for evr in evr_range:
+            _, pc_reps, n_comp = principal_components_dataset(
+                ds, fits_pdf, variancepdf, explained_variance_ratio=evr
+            )
+
+            covmat_pdf = np.cov(pc_reps)
+
+            # compute the condition number
+            l2_cond.append(np.linalg.cond(covmat_pdf))
+            dof.append(n_comp)
+
+        fig, ax1 = plotutils.subplots(figsize=(15, 4))
+        ax1.plot(evr_range, l2_cond, "b-o", label="condition number")
+        ax1.set_title(f"Dataset: {str(ds)}")
+        ax1.set_xlabel("EVR")
+        ax1.set_ylabel("Covariance Matrix Condition Number", color="b")
+        ax1.tick_params('y', color="b")
+
+        ax2 = ax1.twinx()
+
+        # plot the degrees of freedom on the right y-axis
+        ax2.plot(evr_range, dof, 'r-o', label="dof")
+        ax2.set_ylabel('Degrees of freedom', color="r")
+        ax2.tick_params('y', color="r")
+
+        # merge the legends of both axes
+        lines1, labels1 = ax1.get_legend_handles_labels()
+        lines2, labels2 = ax2.get_legend_handles_labels()
+        lines = lines1 + lines2
+        labels = labels1 + labels2
+        ax1.legend(lines, labels, loc='upper left')
+
+        yield fig
\ No newline at end of file
From 813f019697f35a2a95cfe120c69cb1db527a205e Mon Sep 17 00:00:00 2001
From: andreab1997
Date: Wed, 21 Feb 2024 11:49:50 +0100
Subject: [PATCH 04/14] Add parse_variancepdf

---
 validphys2/src/validphys/config.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py
index 0add44c70c..042739b73e 100644
--- a/validphys2/src/validphys/config.py
+++ b/validphys2/src/validphys/config.py
@@ -1036,6 +1036,9 @@ def produce_theoryid(self, theory):
         "Expected that key to be present.")
         return theory['theoryid']
     """
+    def parse_variancepdf(self, name):
+        """PDF set used to estimate the variance of the multiclosure fits."""
+        return self.parse_pdf(name)
 
     def produce_pdf_id(self, pdf) -> str:
         """Return a string containing the PDF's LHAPDF ID"""
         return pdf.name
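A hypothetical way to exercise the new actions end to end is through the validphys API; the fit and PDF set names below are placeholders, and the exact set of resource keys depends on the runcard machinery rather than on anything fixed by this series:

from validphys.api import API

# placeholders for real multiclosure fit names and a variance PDF set
df = API.table_bias_variance_datasets(
    fits=["mc_fit_01", "mc_fit_02"],
    fit="mc_fit_01",
    dataset_inputs={"from_": "fit"},
    use_cuts="fromfit",
    theory={"from_": "fit"},
    theoryid={"from_": "theory"},
    variancepdf="mc_variance_pdf",   # resolved by parse_variancepdf above
)
print(df)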
the t0 covmat.""" + return self.parse_pdf(name) def produce_pdf_id(self, pdf) -> str: """Return a string containing the PDF's LHAPDF ID""" From af4a08f063524a011ecd209fd86051f88a7a4ea3 Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Wed, 21 Feb 2024 15:11:04 +0100 Subject: [PATCH 09/14] Remove ipdbs and make sure multiclosure_inconsistent work --- validphys2/src/validphys/closuretest/__init__.py | 2 ++ validphys2/src/validphys/closuretest/multiclosure.py | 2 -- validphys2/src/validphys/config.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/validphys2/src/validphys/closuretest/__init__.py b/validphys2/src/validphys/closuretest/__init__.py index 02950f15d6..38bc1228c5 100644 --- a/validphys2/src/validphys/closuretest/__init__.py +++ b/validphys2/src/validphys/closuretest/__init__.py @@ -11,3 +11,5 @@ from validphys.closuretest.multiclosure_pdf_output import * from validphys.closuretest.multiclosure_preprocessing import * from validphys.closuretest.multiclosure_pseudodata import * +from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import * +from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent_output import * diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py index 64d26c4d69..ff71e3caf0 100644 --- a/validphys2/src/validphys/closuretest/multiclosure.py +++ b/validphys2/src/validphys/closuretest/multiclosure.py @@ -160,7 +160,6 @@ def fits_normed_dataset_central_delta( """ closures_th, law_th, exp_cov, sqrtcov = internal_multiclosure_dataset_loader # The dimentions here are (fit, data point, replica) - #import ipdb; ipdb.set_trace() reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th]) # One could mask here some reps in order to avoid redundancy of information #TODO @@ -180,7 +179,6 @@ def fits_normed_dataset_central_delta( deltas.append(bias_diffs.tolist()) # biases.shape = (n_fits, n_obs_cut/uncut) # variances.shape = (n_fits, n_obs_cut/uncut, reps) - #import ipdb; ipdb.set_trace() return np.asarray(deltas) def expected_dataset_bias_variance(fits_dataset_bias_variance): diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 6caf36531b..14b2123dc7 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -1041,7 +1041,7 @@ def produce_theoryid(self, theory): def parse_variancepdf(self, name): """PDF set used to generate the t0 covmat.""" return self.parse_pdf(name) - + def produce_pdf_id(self, pdf) -> str: """Return a string containing the PDF's LHAPDF ID""" return pdf.name From 0b1e1e4836d8c309bfb9bb1053473decd9e0c9fa Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Thu, 22 Feb 2024 11:39:43 +0100 Subject: [PATCH 10/14] Add multiclosure comparefits --- pyproject.toml | 1 + .../__init__.py | 3 + .../comparefits_template.md | 121 +++++++++ .../data.md | 5 + .../exponents.md | 15 ++ .../lumi.md | 9 + .../multiclosure_analysis.yaml | 244 ++++++++++++++++++ .../multiclosure_template.md | 14 + .../pca_template.md | 2 + .../pdf.md | 24 ++ .../single_point_template.md | 2 + .../src/validphys/scripts/vp_multiclosure.py | 159 ++++++++++++ 12 files changed, 599 insertions(+) create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/data.md create 
mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md create mode 100644 validphys2/src/validphys/scripts/vp_multiclosure.py diff --git a/pyproject.toml b/pyproject.toml index 73b5a6cda4..d1f148e07b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ wiki-upload = "validphys.scripts.wiki_upload:main" vp-get = "validphys.scripts.vp_get:main" vp-comparefits = "validphys.scripts.vp_comparefits:main" vp-fitrename = "validphys.scripts.vp_fitrename:main" +vp-multiclosure = "validphys.scripts.vp_multiclosure:main" vp-checktheory = "validphys.scripts.vp_checktheory:main" vp-rebuild-data = "validphys.scripts.vp_rebuild_data:main" vp-pdfrename = "validphys.scripts.vp_pdfrename:main" diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py b/validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py new file mode 100644 index 0000000000..9a64b5152a --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py @@ -0,0 +1,3 @@ +import pathlib + +template_path = pathlib.Path(__file__).with_name('multiclosure_analysis.yaml') diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md new file mode 100644 index 0000000000..bafb0f5fa5 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md @@ -0,0 +1,121 @@ +%NNPDF report comparing {@ current fit_id @} and {@ reference fit_id @} + +Summary +------- + +We are comparing: + + - {@ current fit @} (`{@ current fit_id @}`): {@ current description @} + - {@ reference fit @} (`{@ reference fit_id @}`): {@ reference description @} + + +{@ summarise_fits @} + + +t0 losses +--------- +{@ dataspecs::t0_info t0_chi2_info_table @} + +Theory covariance summary +------------------------- +{@summarise_theory_covmat_fits@} + +Dataset properties +------------------ +{@current fit_datasets_properties_table@} + +Distances +--------- +{@with Scales@} +### {@Scaletitle@} +{@with Normalize::Basespecs::PDFscalespecs::Distspecs@} +#### {@Basistitle@}, {@Xscaletitle@} +{@plot_pdfdistances@} +{@plot_pdfvardistances@} +{@endwith@} +{@endwith@} + +PDF arc-lengths +--------------- +{@Basespecs plot_arc_lengths@} + +Sum rules +--------- +{@with pdfs@} +### {@pdf@} + +#### Known sum rules + +{@sum_rules_table@} + +#### Unknown sum rules + +{@unknown_sum_rules_table@} + +{@endwith@} + +PDF plots +--------- +{@with Scales@} +[Plots at {@Scaletitle@}]({@pdf_report report@}) +{@endwith@} + +Luminosities +------------ +{@with Energies@} +[Plots at {@Energytitle@}]({@lumi_report report@}) +{@endwith@} + +Effective exponents +------------------- +[Detailed information]({@exponents_report report@}) + +Training lengths +---------------- +{@fits plot_training_length@} + +Training-validation +------------------- +{@fits 
plot_training_validation@} + +{@with DataGroups@} +$\chi^2$ by {@processed_metadata_group@} +---------------------------------------- +{@plot_fits_groups_data_chi2@} +{@endwith@} + + +$\chi^2$ by dataset +------------------- +### Plot +{@plot_fits_datasets_chi2@} +### Table +{@ProcessGroup fits_chi2_table(show_total=true)@} + + +{@with DataGroups@} +$\phi$ by {@processed_metadata_group@} +-------------------------------------- +{@plot_fits_groups_data_phi@} +{@endwith@} + +Dataset plots +------------- +{@with matched_datasets_from_dataspecs@} +[Plots for {@dataset_name@}]({@dataset_report report@}) +{@endwith@} + +Positivity +---------- +{@with matched_positivity_from_dataspecs@} +{@plot_dataspecs_positivity@} +{@endwith@} + +Dataset differences and cuts +---------------------------- +{@print_dataset_differences@} +{@print_different_cuts@} + +Code versions +------------- +{@fits_version_table@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/data.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/data.md new file mode 100644 index 0000000000..c99e62f3d4 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/data.md @@ -0,0 +1,5 @@ +% Data-theory comparison for {@dataset_name@} +# Absolute +{@plot_fancy_dataspecs@} +# Normalized +{@Datanorm plot_fancy_dataspecs@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md new file mode 100644 index 0000000000..9736e70775 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md @@ -0,0 +1,15 @@ +%NNPDF report comparing {@ current fit @} and {@ reference fit @} + +# Effective preprocessing information + +## Plots +### alpha exponent +{@current::basisfromfit plot_alpha_eff@} +### beta exponent +{@current::basisfromfit plot_beta_eff@} + +## Tables +{@with fits@} +### Next effective exponents for {@fit@} +{@effective_exponents_table@} +{@endwith@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md new file mode 100644 index 0000000000..a077a4a21f --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md @@ -0,0 +1,9 @@ +%NNPDF report comparing {@ current fit @} and {@ reference fit @} + +# Luminosity plots + +{@with Normalize@} +{@with lumi_channels@} +{@plot_lumi1d@} +{@endwith@} +{@endwith@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml new file mode 100644 index 0000000000..dd4a2b92a5 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml @@ -0,0 +1,244 @@ +meta: + title: Multiclosure report + author: AB + keywords: [inconsistencies, multiclosure] + +#COMPAREFITS SETTINGS + +positivity: + from_: fit + +description: + from_: fit + +dataset_inputs: + from_: fit + +t0_info: + - use_t0: True + datacuts: + from_: fit + t0pdfset: + from_: datacuts +compare_settings: + current: + fit: {id: "240210_mnc_dis_ict_lam02"} + pdf: {id: "240210_mnc_dis_ict_lam02", label: "Current Fit"} + theory: + from_: fit + theoryid: + from_: theory + speclabel: "Current Fit" + + reference: + fit: {id: "240210_mnc_dis_ict_lam04"} + pdf: {id: "240210_mnc_dis_ict_lam04", label: "Reference Fit" } + theory: + from_: fit + theoryid: + from_: theory 
+ speclabel: "Reference Fit" + + pdfs: + - from_: current + - from_: reference + + fits: + - from_: current + - from_: reference + + use_cuts: "fromfit" + use_weights_in_covmat: False + + Q: 1.651 + + Scales: + - Q: 1.651 + Scaletitle: "Q = 1.65 GeV" + - Q: 100 + Scaletitle: "Q = 100 GeV" + + PDFnormalize: + - Normtitle: Absolute + + - normalize_to: 2 + Normtitle: Ratio + + Basespecs: + - basis: CCBAR_ASYMM_FLAVOUR + Basistitle: Flavour basis + - basis: CCBAR_ASYMM + Basistitle: Evolution basis + + PDFscalespecs: + - xscale: log + Xscaletitle: Log + - xscale: linear + Xscaletitle: Linear + + Energies: + - sqrts: 13000 + Energytitle: "13 TeV" + + lumi_channels: + - gg + - gq + - qq + - qqbar + - uubar + - ddbar + - udbar + - dubar + + Distspecs: + - ymin: 0 + ymax: 20 + + pos_use_kin: True + + dataset_report: + meta: Null + template: data.md + + pdf_report: + meta: Null + template: pdf.md + + exponents_report: + meta: Null + template: exponents.md + + lumi_report: + meta: Null + template: lumi.md + + dataspecs: + - theoryid: + from_: current + pdf: + from_: current + fit: + from_: current + speclabel: + from_: current + + - theoryid: + from_: reference + pdf: + from_: reference + fit: + from_: reference + speclabel: + from_: reference + + Normalize: + normalize_to: 2 + + Datanorm: + normalize_to: data + + DataGroups: + - metadata_group: nnpdf31_process + - metadata_group: experiment + + ProcessGroup: + metadata_group: nnpdf31_process + +################################################################## +#OTHER ANALYSIS SETTINGS + +lambdavalues: + - label: "LAMBDA02" + fit: 230124_dis_ict_lam02_fs_122996 + dataset_inputs: + from_: fit + theory: + from_: fit + theoryid: + from_: theory + use_t0: True + t0pdfset: 210223-mw-000_fakepdf + use_cuts: "internal" + explained_variance_ratio: 0.99 + variancepdf: 230708_mnc_dis_pt1_ct_1000 + fits: + - 230124_dis_ict_lam02_fs_122996 + - 230124_dis_ict_lam02_fs_152326 + - 230124_dis_ict_lam02_fs_196144 + - 230124_dis_ict_lam02_fs_200988 + - 230124_dis_ict_lam02_fs_257767 + - 230124_dis_ict_lam02_fs_31144 + - 230124_dis_ict_lam02_fs_374984 + - 230124_dis_ict_lam02_fs_383440 + - 230124_dis_ict_lam02_fs_394255 + - 230124_dis_ict_lam02_fs_431574 + - 230124_dis_ict_lam02_fs_498979 + - 230124_dis_ict_lam02_fs_504344 + - 230124_dis_ict_lam02_fs_52722 + - 230124_dis_ict_lam02_fs_54647 + - 230124_dis_ict_lam02_fs_562489 + - 230124_dis_ict_lam02_fs_57171 + - 230124_dis_ict_lam02_fs_661665 + - 230124_dis_ict_lam02_fs_683404 + - 230124_dis_ict_lam02_fs_752096 + - 230124_dis_ict_lam02_fs_760929 + - 230124_dis_ict_lam02_fs_786382 + - 230124_dis_ict_lam02_fs_834248 + - 230124_dis_ict_lam02_fs_868248 + - 230124_dis_ict_lam02_fs_897760 + - 230124_dis_ict_lam02_fs_916690 + - 230124_dis_ict_lam02_fs_966390 + - label: "LAMBDA04" + fit: 250124_dis_ict_lam04_fs_102661 + dataset_inputs: + from_: fit + theory: + from_: fit + theoryid: + from_: theory + use_t0: True + t0pdfset: 210223-mw-000_fakepdf + use_cuts: "internal" + explained_variance_ratio: 0.99 + variancepdf: 230708_mnc_dis_pt1_ct_1000 + fits: + - 250124_dis_ict_lam04_fs_102661 + - 250124_dis_ict_lam04_fs_166247 + - 250124_dis_ict_lam04_fs_194869 + - 250124_dis_ict_lam04_fs_235399 + - 250124_dis_ict_lam04_fs_259799 + - 250124_dis_ict_lam04_fs_280552 + - 250124_dis_ict_lam04_fs_362871 + - 250124_dis_ict_lam04_fs_38346 + - 250124_dis_ict_lam04_fs_392751 + - 250124_dis_ict_lam04_fs_394227 + - 250124_dis_ict_lam04_fs_473619 + - 250124_dis_ict_lam04_fs_475803 + - 250124_dis_ict_lam04_fs_5238 + - 
250124_dis_ict_lam04_fs_544453 + - 250124_dis_ict_lam04_fs_547187 + - 250124_dis_ict_lam04_fs_561201 + - 250124_dis_ict_lam04_fs_630491 + - 250124_dis_ict_lam04_fs_691489 + - 250124_dis_ict_lam04_fs_717426 + - 250124_dis_ict_lam04_fs_833288 + - 250124_dis_ict_lam04_fs_85029 + - 250124_dis_ict_lam04_fs_856511 + - 250124_dis_ict_lam04_fs_895365 + - 250124_dis_ict_lam04_fs_902027 + - 250124_dis_ict_lam04_fs_911964 + - 250124_dis_ict_lam04_fs_99031 + + +template: multiclosure_template.md +comparefits_report: + meta: Null + template: comparefits_template.md +single_point_report: + meta: Null + template: single_point_template.md +pca_report: + meta: Null + template: pca_template.md +actions_: + - report(main=true) diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md new file mode 100644 index 0000000000..7ad062bd09 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md @@ -0,0 +1,14 @@ +{@ with compare_settings @} +[Comparefits report]({@comparefits_report report@}) +{@ endwith @} + +Single data point analysis +-------------------------- +{@with lambdavalues@} +[Lambda value of {@label@}]({@single_point_report report@}) +{@endwith@} +Bias Variance Ratio Table +------------------------- +{@with lambdavalues@} +[Lambda value of {@label@}]({@pca_report report@}) +{@endwith@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md new file mode 100644 index 0000000000..6269098614 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md @@ -0,0 +1,2 @@ +# PCA analysis +{@table_bias_variance_datasets@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md new file mode 100644 index 0000000000..54cdc4bd87 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md @@ -0,0 +1,24 @@ +%NNPDF report comparing {@ current fit @} and {@ reference fit @} + +# PDF plots + +## PDF comparison +{@with PDFnormalize@} +### {@Normtitle@} +{@with Basespecs@} +#### {@Basistitle@} +{@with PDFscalespecs@} +##### {@Xscaletitle@} +{@plot_pdfs@} +{@endwith@} +{@endwith@} +{@endwith@} + +## PDF replicas +{@with Basespecs@} +#### {@Basistitle@} +{@with PDFscalespecs@} +##### {@Xscaletitle@} +{@plot_pdfreplicas@} +{@endwith@} +{@endwith@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md new file mode 100644 index 0000000000..8a8e5e8808 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md @@ -0,0 +1,2 @@ +# Single data point analysis +{@xq2_data_prcs_maps@} diff --git a/validphys2/src/validphys/scripts/vp_multiclosure.py b/validphys2/src/validphys/scripts/vp_multiclosure.py new file mode 100644 index 0000000000..5d1aa7013a --- /dev/null +++ b/validphys2/src/validphys/scripts/vp_multiclosure.py @@ -0,0 +1,159 @@ +import sys +import os +import logging + +#TODO: Look into making these lazy imports +import prompt_toolkit +from prompt_toolkit.completion import WordCompleter + +from reportengine.compat import yaml +from reportengine.colors import t + +from validphys.app import App +from 
validphys.loader import RemoteLoader +from validphys import compareinconsistentclosuretemplates +from validphys.promptutils import confirm, KeywordsWithCache + +log = logging.getLogger(__name__) + +CURRENT_FIT_LABEL_DEFAULT = "Current Fit" +REFERENCE_FIT_LABEL_DEFAULT = "Reference Fit" + + + +class CompareFitApp(App): + def add_positional_arguments(self, parser): + parser.add_argument( + 'current_fit', + default=None, + nargs='?', + help="The fit to produce the report for.", + ) + parser.add_argument( + 'reference_fit', + default=None, + nargs='?', + help="The fit to compare with.") + # Group together mandatory arguments that are not positional + mandatory = parser.add_argument_group("mandatory", "Mandatory command line arguments") + mandatory.add_argument( + '--title', help="The title that will be indexed with the report.") + mandatory.add_argument('--author', help="The author of the report.") + mandatory.add_argument( + '--keywords', nargs='+', help="keywords to index the report with.") + parser.add_argument( + '--current_fit_label', + nargs='?', + default=CURRENT_FIT_LABEL_DEFAULT, + help="The label for the fit that the report is being produced for.", + ) + parser.add_argument( + '--reference_fit_label', + nargs='?', + default=REFERENCE_FIT_LABEL_DEFAULT, + help="The label for the fit that is being compared to.") + parser.add_argument( + '--lambdasettings_path', + help="The path to the yaml file containing the settings for the lambdavalues report.") + + parser.set_defaults() + + def try_complete_args(self): + args = self.args + argnames = ( + 'current_fit', 'reference_fit', 'title', 'author', 'keywords') + optionalnames = ( + 'current_fit_label', 'reference_fit_label') + badargs = [argname for argname in argnames if not args[argname]] + bad = badargs + if bad: + sys.exit(f"The following arguments are required: {bad}") + texts = '\n'.join( + f' {argname.replace("_", " ").capitalize()}: {args[argname]}' + for argname in [*argnames, *optionalnames]) + log.info(f"Starting NNPDF fit comparison:\n{texts}") + + def get_commandline_arguments(self, cmdline=None): + args = super().get_commandline_arguments(cmdline) + # This is needed because the environment wants to know how to resolve + # the relative paths to find the templates. 
Best to have the template + # look as much as possible as a runcard passed from the command line + args['config_yml'] = compareinconsistentclosuretemplates.template_path + return args + + def complete_mapping(self): + args = self.args + autosettings = {} + shortsettings = {} + autosettings['meta'] = { + 'title': args['title'], + 'author': args['author'], + 'keywords': args['keywords'] + } + currentmap = {'id': args['current_fit'], 'label': args['current_fit_label']} + shortsettings['current'] = { + 'fit': currentmap, + 'pdf': currentmap, + 'theory': { + 'from_': 'fit' + }, + 'theoryid': { + 'from_': 'theory' + }, + 'speclabel': args['current_fit_label'] + } + refmap = {'id': args['reference_fit'], 'label': args['reference_fit_label']} + shortsettings['reference'] = { + 'fit': refmap, + 'pdf': refmap, + 'theory': { + 'from_': 'fit' + }, + 'theoryid': { + 'from_': 'theory' + }, + 'speclabel': args['reference_fit_label'] + } + autosettings["compare_settings"] = shortsettings + return autosettings + + def complete_lambdavaluesmapping(self): + args = self.args + # opening conf file + with open(args['lambdasettings_path']) as f: + settings = yaml.safe_load(f) + autosettings = {} + list_lambdas = [] + for lambdas in settings: + lambdasetting = {} + for set in ["variancepdf", "t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]: + lambdasetting[set] = lambdas[set] + list_lambdas.append(lambdasetting) + autosettings["lambdavalues"] = list_lambdas + return autosettings + + def get_config(self): + self.try_complete_args() + #No error handling here because this is our internal file + with open(self.args['config_yml']) as f: + #TODO: Ideally this would load round trip but needs + #to be fixed in reportengine. + c = yaml.safe_load(f) + complete_mapping = self.complete_mapping() + complete_mapping.update(self.complete_lambdavaluesmapping()) + c["meta"] = complete_mapping["meta"] + for lambdas, lambdas_mapping in zip(c["lambdavalues"], complete_mapping["lambdavalues"]): + for set in ["variancepdf", "t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]: + lambdas[set] = lambdas_mapping[set] + for set in ["current", "reference"]: + c["compare_settings"][set] = complete_mapping["compare_settings"][set] + return self.config_class(c, environment=self.environment) + + +def main(): + a = CompareFitApp() + a.main() + + +if __name__ == '__main__': + main() From 133747610dbde87088e3b31d18d84fb4f9007ee7 Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Mon, 26 Feb 2024 10:48:48 +0100 Subject: [PATCH 11/14] Add sklearn to pyproject --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index d1f148e07b..0562e82f22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,6 +97,7 @@ fiatlux = {version = "*", optional = true} # without lhapdf pdfflow = {version = "^1.2.1", optional = true} lhapdf-management = {version = "^0.5", optional = true} +scikit-learn = "^1.4.1" # Optional dependencies [tool.poetry.extras] From dfa13a2934d08677270a9c76403902fa8adcae9e Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Mon, 26 Feb 2024 11:33:48 +0100 Subject: [PATCH 12/14] Scikit added to conda --- conda-recipe/meta.yaml | 1 + pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index a13244afa0..318febc365 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -59,6 +59,7 @@ requirements: - fiatlux - frozendict # needed for caching of data loading - curio >=1.0 # reportengine 
uses it but it's not in its dependencies + - scikit-learn = "^1.4.1" test: requires: diff --git a/pyproject.toml b/pyproject.toml index 0562e82f22..99904644c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,7 @@ pandas = "<2" numpy = "*" validobj = "*" prompt_toolkit = "*" +scikit-learn = "^1.4.1" frozendict = "*" # validphys: needed for caching of data loading # Reportengine (and its dependencies) need to be installed in a bit more manual way reportengine = { git = "https://github.com/NNPDF/reportengine", rev = "3bb2b1d"} @@ -97,7 +98,6 @@ fiatlux = {version = "*", optional = true} # without lhapdf pdfflow = {version = "^1.2.1", optional = true} lhapdf-management = {version = "^0.5", optional = true} -scikit-learn = "^1.4.1" # Optional dependencies [tool.poetry.extras] From 7b1183fc6c5b53edb56f10bee05d3c8baf7be088 Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Mon, 26 Feb 2024 11:34:31 +0100 Subject: [PATCH 13/14] Correct conda recipe --- conda-recipe/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 318febc365..e07586bf79 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -59,7 +59,7 @@ requirements: - fiatlux - frozendict # needed for caching of data loading - curio >=1.0 # reportengine uses it but it's not in its dependencies - - scikit-learn = "^1.4.1" + - scikit-learn >= 1.4.1 test: requires: From a4eabd8d776464cabce4ce3ed0635675a57c65f5 Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Mon, 26 Feb 2024 11:37:30 +0100 Subject: [PATCH 14/14] Correct conda recipe --- conda-recipe/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index e07586bf79..bac4fabb10 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -59,7 +59,7 @@ requirements: - fiatlux - frozendict # needed for caching of data loading - curio >=1.0 # reportengine uses it but it's not in its dependencies - - scikit-learn >= 1.4.1 + - scikit-learn >=1.4.1 test: requires: