From 91b1ee5453e2f5b7a6c15064e0462f36e033cf7e Mon Sep 17 00:00:00 2001
From: andreab1997
Date: Tue, 20 Feb 2024 15:55:15 +0100
Subject: [PATCH 01/14] Init analysis

---
 validphys2/src/validphys/kinematics.py | 46 ++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py
index 295bcd30fb..f4869367c4 100644
--- a/validphys2/src/validphys/kinematics.py
+++ b/validphys2/src/validphys/kinematics.py
@@ -97,6 +97,30 @@ def kinlimits(commondata, cuts, use_cuts, use_kinoverride: bool = True):
 
 all_kinlimits = collect(kinlimits, ('dataset_inputs',))
 
+def xq2_dataset_map(commondata, cuts, internal_multiclosure_dataset_loader,
+                    _internal_max_reps=None,
+                    _internal_min_reps=20):
+    """
+    Collect in a dictionary the information needed to place a dataset on the (x, Q^2) plane:
+    - the dataset name and process type
+    - the (x, Q^2) coordinates of each point
+    - the standard deviation of the multiclosure central deltas
+    - the mean of the multiclosure central deltas
+    - the xi estimator
+    """
+    xq2_map_obj = xq2map_with_cuts(commondata, cuts)
+    coords = xq2_map_obj[2]
+    central_deltas = fits_normed_dataset_central_delta(internal_multiclosure_dataset_loader)
+    std_devs = np.std(central_deltas, axis=0)
+    means = np.mean(central_deltas, axis=0)
+    xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader, False))
+    #import ipdb; ipdb.set_trace()
+    # DY observables have two (x, Q) points per experimental point, so duplicate the stats
+    if coords[0].shape[0] != std_devs.shape[0]:
+        std_devs = np.concatenate((std_devs, std_devs))
+        means = np.concatenate((means, means))
+        xi = np.concatenate((xi, xi))
+    return {'x_coords': coords[0], 'Q_coords': coords[1], 'std_devs': std_devs, 'name': commondata.name, 'process': commondata.process_type, 'means': means, 'xi': xi}
 
 @table
 def all_kinlimits_table(all_kinlimits, use_kinoverride: bool = True):
@@ -186,6 +210,28 @@ def xq2map_with_cuts(commondata, cuts, group_name=None):
         'data_input',
     ),
 )
+xq2_data_map = collect("xq2_dataset_map", ("data",))
+
+@figuregen
+def xq2_data_prcs_maps(xq2_data_map):
+    import matplotlib.pyplot as plt
+    import matplotlib.colors
+    from matplotlib.colors import ListedColormap
+    keys = ["std_devs", "xi"]
+    for elem in xq2_data_map:
+        for k in keys:
+            fig, ax = plotutils.subplots()
+            im = ax.scatter(elem['x_coords'], elem['Q_coords'],
+                            c=np.asarray(elem[k]),
+                            cmap='viridis',
+                            label=elem['name'])
+            fig.colorbar(im, label=k)
+            ax.set_xscale('log')
+            ax.set_yscale('log')
+            ax.set_xlabel('x')
+            ax.set_ylabel('$Q^2$ (GeV$^2$)')
+            ax.set_title(elem["name"] + " " + k)
+            yield fig
 
 def kinematics_table_notable(commondata, cuts, show_extra_labels: bool = False):
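The shape-alignment step inside xq2_dataset_map above can be exercised on its own: when a dataset maps every experimental point to two (x, Q^2) pairs (as for DY rapidity distributions), the per-point statistics are tiled so that they stay aligned with the coordinates. A minimal standalone sketch, with toy arrays standing in for the validphys objects:

import numpy as np

# two (x, Q2) pairs per data point, e.g. a DY observable with 2 points
x_coords = np.array([0.1, 0.2, 0.1, 0.2])
# one statistic per data point
std_devs = np.array([0.5, 0.7])

if x_coords.shape[0] != std_devs.shape[0]:
    # duplicate the per-point statistics so they match the doubled coordinates
    std_devs = np.concatenate((std_devs, std_devs))

assert x_coords.shape[0] == std_devs.shape[0]
print(std_devs)  # [0.5 0.7 0.5 0.7]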
From ab5d6d14a6ad1c1ebe6c440c9f31988422847f49 Mon Sep 17 00:00:00 2001
From: andreab1997
Date: Tue, 20 Feb 2024 16:15:05 +0100
Subject: [PATCH 02/14] Add other functions

---
 .../multiclosure_inconsistent.py | 362 ++++++++++++++++++
 .../multiclosure_inconsistent_output.py | 54 +++
 .../src/validphys/closuretest/multiclosure.py | 35 ++
 validphys2/src/validphys/kinematics.py | 7 +-
 4 files changed, 457 insertions(+), 1 deletion(-)
 create mode 100644 validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
 create mode 100644 validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
new file mode 100644
index 0000000000..4cb12f2d30
--- /dev/null
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -0,0 +1,362 @@
+"""
+closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+
+Module containing all of the statistical estimators which are
+averaged across multiple inconsistent fits. The actions
+in this module are used to produce results which are plotted in
+``multiclosure_inconsistent_output.py``
+
+"""
+
+import numpy as np
+from sklearn.decomposition import PCA
+from sklearn import preprocessing
+
+from validphys import covmats
+from validphys.calcutils import calc_chi2
+from validphys.results import ThPredictionsResult
+
+from reportengine import collect
+
+
+# Collectors over dataspecs, used to load several multiclosure fits at once.
+# Useful for the inconsistent closure test analysis.
+multi_dataset_loader = collect("internal_multiclosure_dataset_loader", ("dataspecs",))
+
+multi_dataset_fits_bias_replicas_variance_samples = collect(
+    "dataset_fits_bias_replicas_variance_samples", ("dataspecs",)
+)
+
+multi_fits_bootstrap_dataset_bias_variance = collect(
+    "fits_bootstrap_dataset_bias_variance", ("dataspecs",)
+)
+
+multi_bias_variance_resampling_dataset = collect("bias_variance_resampling_dataset", ("dataspecs",))
+
+multi_dataset_fits_bias_variance_samples_pca = collect(
+    "dataset_fits_bias_variance_samples_pca", ("dataspecs",)
+)
+
+
+def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.93):
+    """
+    Compute the principal components of the theory predictions replica matrix
+    (Ndat x Nrep feature matrix).
+
+    Parameters
+    ----------
+    dataset: (DataSetSpec, DataGroupSpec)
+        dataset for which the theory predictions and t0 covariance matrix
+        will be loaded. Note that due to the structure of `validphys` this
+        function can be overloaded to accept a DataGroupSpec.
+
+    fits_pdf: list
+        list of PDF objects produced from performing multiple closure test
+        fits. Each fit should have a different filterseed but the same
+        underlying law used to generate the pseudodata.
+
+    variancepdf: validphys.core.PDF
+        PDF object used to estimate the variance of the fits.
+
+    explained_variance_ratio: float, default is 0.93
+        fraction of the replica variance that the kept principal components
+        must explain.
+
+    Returns
+    -------
+    tuple
+        3-tuple:
+        - matrix of the principal components (PCs) of shape (N_pc, N_dat)
+        - reduced feature matrix, i.e., feature matrix projected onto PCs of shape (N_pc, N_rep)
+        - number of principal components kept
+
+    """
+    # fits_dataset_predictions = [
+    #     ThPredictionsResult.from_convolution(pdf, dataset) for pdf in fits_pdf
+    # ]
+
+    # dimensions here are (Nfits, Ndat, Nrep)
+    # reps = np.asarray([th.error_members for th in fits_dataset_predictions])
+
+    # reshape so as to get PCs from all the samples
+    # reps = reps.reshape(reps.shape[1], -1)
+
+    # get replicas from the variance fit, used to estimate the variance
+    reps = ThPredictionsResult.from_convolution(variancepdf, dataset).error_members
+
+    # rescale feature matrix
+    reps_scaled = reps  # preprocessing.scale(reps)
+
+    # choose the number of principal components (PCs) based on the explained variance ratio
+    n_comp = 1
+    for _ in range(reps.shape[0]):
+        pca = PCA(n_comp).fit(reps_scaled.T)
+        if np.sum(pca.explained_variance_ratio_) >= explained_variance_ratio:
+            break
+        n_comp += 1
+
+    # project feature matrix onto PCs
+    pc_reps = pca.components_ @ reps
+
+    return pca.components_, pc_reps, n_comp
+
+
+def principal_components_bias_variance_dataset(
+    internal_multiclosure_dataset_loader, principal_components_dataset
+):
+    """
+    Compute the bias and variance of the multiclosure fits in the space
+    spanned by the principal components of the replica predictions.
+
+    Returns
+    -------
+    tuple
+        (biases, variances, n_comp): biases and variances are arrays of shape
+        (Nfits,); both are None when fewer than two principal components are kept.
+    """
+
+    closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
+
+    reps = np.asarray([th.error_members for th in closures_th])
+
+    pc_basis, pc_reps, n_comp = principal_components_dataset
+
+    if n_comp <= 1:
+        return None, None, n_comp
+
+    # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc)
+    covmat_pdf = np.cov(pc_reps)
+    sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf)
+
+    # compute bias diff and project it onto the space spanned by the PCs
+    delta_bias = reps.mean(axis=2).T - law_th.central_value[:, np.newaxis]
+    # shape here is (Npc, Nfits)
+    delta_bias = pc_basis @ delta_bias
+
+    # compute biases, shape of biases is (Nfits)
+    biases = calc_chi2(sqrt_covmat_pdf, delta_bias)
+
+    variances = []
+    for i in range(reps.shape[0]):
+        diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True))
+        variances.append(np.mean(calc_chi2(sqrt_covmat_pdf, diffs)))
+
+    return biases, np.asarray(variances), n_comp
+
+
+principal_components_bias_variance_datasets = collect(
+    "principal_components_bias_variance_dataset", ("data",)
+)
+
+
+def principal_components_variance_distribution_dataset(
+    internal_multiclosure_dataset_loader, principal_components_dataset
+):
+    """
+    Compute, for each fit, the distribution over replicas of the variances
+    in the space spanned by the principal components.
+
+    Returns
+    -------
+    tuple
+        (variances, n_comp); variances is None when fewer than two principal
+        components are kept.
+    """
+
+    closures_th, _, _, _ = internal_multiclosure_dataset_loader
+
+    reps = np.asarray([th.error_members for th in closures_th])
+
+    pc_basis, pc_reps, n_comp = principal_components_dataset
+
+    if n_comp <= 1:
+        return None, n_comp
+    # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc)
+    covmat_pdf = np.cov(pc_reps)
+    sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf)
+
+    variances = []
+    for i in range(reps.shape[0]):
+        diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True))
+        variances.append([calc_chi2(sqrt_covmat_pdf, diffs)])
+
+    return np.asarray(variances), n_comp
+
+
+principal_components_variance_distribution_datasets = collect(
+    "principal_components_variance_distribution_dataset", ("data",)
+)
+
+
+def compute_num_components(covariance_matrix, threshold=0.99):
+    """
+    Compute the number of principal components to keep based on a desired
+    explained variance threshold.
+
+    Parameters
+    ----------
+    covariance_matrix : np.ndarray
+        2-D covariance matrix.
+    threshold : float
+        desired explained variance threshold (between 0 and 1).
+
+    Returns
+    -------
+    int
+        num_components: number of principal components to keep.
+
+    """
+    eig_val, eig_vec = np.linalg.eig(covariance_matrix)
+    idx = eig_val.argsort()[::-1]
+    eig_val = eig_val[idx]
+    eig_vec = eig_vec[:, idx]
+
+    cumulative_sum = np.cumsum(eig_val)
+    total_sum = np.sum(eig_val)
+    # argmin returns a 0-based index: keeping i + 1 eigenvalues gives the cumulative
+    # explained variance closest to the threshold
+    num_components = int(np.argmin(np.abs(cumulative_sum / total_sum - threshold))) + 1
+
+    return num_components
+
+
+def pca_covmat(X, num_components):
+    """
+    Given data X of shape (n, p), reduce its dimension to
+    (n, num_components) and return the covariance matrix
+    of the reduced data matrix.
+
+    Parameters
+    ----------
+    X : np.ndarray
+        data matrix of shape (n, p).
+    num_components : int
+        dimension of the reduced space.
+
+    Returns
+    -------
+    np.ndarray
+        covariance matrix of shape (num_components, num_components).
+    """
+    pca = PCA(num_components)
+    X_reduced = pca.fit_transform(X)
+    covariance = np.dot(X_reduced.T, X_reduced) / (X_reduced.shape[0] - 1)
+    return covariance
+
+
+def calc_chi2_pca(pdf_cov, diff, num_components):
+    """
+    Computes the chi2 between pdf_cov and diff by first projecting
+    them onto num_components PCs.
+
+    Parameters
+    ----------
+    pdf_cov: np.ndarray
+        covariance matrix to be diagonalised.
+    diff: np.ndarray
+        differences to be tested against the covariance matrix.
+    num_components: int
+        number of eigenvectors kept in the projection.
+
+    Returns
+    -------
+    float or np.ndarray depending on input
+
+    """
+    # Diagonalize the matrix
+    L, W = np.linalg.eig(pdf_cov)
+
+    # Sort the eigenvalues and eigenvectors from largest to smallest
+    idx = L.argsort()[::-1]
+    L = L[idx]
+    W = W[:, idx]
+
+    # Keep only the num_components largest eigenvectors
+    Wtilde = W[:, :num_components]
+
+    # Transform the initial covariance matrix
+    pdf_cov_pca = np.einsum("ij,jk->ik", np.einsum("ij,ik->jk", Wtilde, pdf_cov), Wtilde).real
+
+    # transform the data
+    diff_pca = diff.T @ Wtilde
+
+    try:
+        sqrt_pdf_cov_pca = covmats.sqrt_covmat(pdf_cov_pca)
+    except Exception as e:
+        raise ValueError(
+            f"PCA covariance matrix cannot be Cholesky decomposed, perhaps fewer than {num_components} PCs should be kept"
+        ) from e
+
+    return np.real(calc_chi2(sqrt_pdf_cov_pca, diff_pca.T))
+
+
+def dataset_fits_bias_variance_samples_pca(internal_multiclosure_dataset_loader, threshold=0.99):
+    """
+    Similar to `dataset_fits_bias_replicas_variance_samples`, but with the
+    covariance matrix reduced by PCA.
+
+    Returns
+    -------
+    tuple
+        3-tuple:
+        - biases: 1-D array of shape (Nfits,)
+        - variances: 1-D array of shape (Nfits,)
+        - n_eig: number of eigenvalues kept in PCA, i.e.,
+          ndata in the new, lower dimensional, space.
+        Note that we return Nfits values of the variance so that computing the
+        bias-variance ratio is straightforward.
+    """
+    closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
+
+    # The dimensions here are (fit, data point, replica)
+    reps = np.asarray([th.error_members for th in closures_th])
+
+    # take the mean across replicas - since we might have changed the number of replicas
+    centrals = reps.mean(axis=2)
+
+    # compute the PDF covariance matrix of the central samples
+    if centrals.shape[0] <= 1:
+        raise ValueError("Need more than one fit to compute the 'bias' PDF covariance matrix")
+
+    pdf_cov_bias = np.cov(centrals.T)
+
+    # find the number of (ordered) eigenvalues that explain a fraction `threshold`
+    # of the total variance (total sum of eigenvalues)
+    n_eig = compute_num_components(pdf_cov_bias, threshold)
+
+    # compute bias from the PCs
+    diffs_bias = centrals.T - law_th.central_value[:, np.newaxis]
+    biases = calc_chi2_pca(pdf_cov_bias, diffs_bias, n_eig)
+
+    # compute variances from the PCs
+    variances = []
+
+    # loop over fits to compute variances
+    for i in range(reps.shape[0]):
+        diffs_var = reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
+        pdf_cov_var = np.cov(reps[i, :, :])
+
+        variances.append(np.mean(calc_chi2_pca(pdf_cov_var, diffs_var, n_eig)))
+
+    return biases, np.asarray(variances), n_eig
+
+
+def expected_dataset_fits_bias_variance_samples_pca(dataset_fits_bias_variance_samples_pca):
+    """
+    Returns the expected values of bias and variance as well as the number of
+    principal components.
+    """
+    biases, variances, n_eig = dataset_fits_bias_variance_samples_pca
+    return np.mean(biases), np.mean(variances), n_eig
+
+
+expected_datasets_fits_bias_variance_samples_pca = collect(
+    "expected_dataset_fits_bias_variance_samples_pca", ("data",)
+)
+
+
+def dataset_fits_ratio_bias_variance_samples_pca(dataset_fits_bias_variance_samples_pca):
+    """Compute, fit by fit, the square root of the bias-variance ratio from the PCA samples."""
+    biases, variances, _ = dataset_fits_bias_variance_samples_pca
+    sqrt_ratios = np.sqrt(biases / variances)
+    return sqrt_ratios
+
+
+def dataset_fits_gaussian_parameters(internal_multiclosure_dataset_loader, threshold=0.99):
+    """
+    Returns the parameters of the multi-Gaussian distribution of the replicas
+    and of the central replicas.
+    """
+    closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
+
+    # The dimensions here are (fit, data point, replica)
+    reps = np.asarray([th.error_members for th in closures_th])
+
+    # take the mean across replicas - since we might have changed the number of replicas
+    centrals = reps.mean(axis=2)
+
+    centrals_covmat = np.cov(centrals.T)
+    centrals_covmat = pca_covmat(
+        centrals, num_components=compute_num_components(centrals_covmat, threshold)
+    )
+    mean_centrals = np.mean(centrals, axis=0)
+
+    replicas_covmat = 0
+    for i in range(reps.shape[0]):
+        # the per-fit covariance is only used to choose the number of components;
+        # keep it in its own variable so the accumulator is not overwritten
+        fit_covmat = np.cov(reps[i, :, :])
+        replicas_covmat += pca_covmat(
+            reps[i, :, :].T, num_components=compute_num_components(fit_covmat, threshold)
+        )
+    replicas_covmat /= reps.shape[0]
+    mean_replicas = np.mean(reps.reshape(reps.shape[1], -1), axis=1)
+
+    return mean_centrals, centrals_covmat, mean_replicas, replicas_covmat
\ No newline at end of file
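The projection implemented in calc_chi2_pca can be sanity-checked outside validphys: keeping all eigenvectors must reproduce the full chi2, since an orthogonal change of basis preserves it. A minimal sketch with a toy covariance matrix, using plain numpy in place of covmats.sqrt_covmat:

import numpy as np

rng = np.random.default_rng(0)
A = rng.normal(size=(5, 5))
cov = A @ A.T + 5 * np.eye(5)        # well-conditioned SPD "covariance"
diff = rng.normal(size=5)

chi2_full = diff @ np.linalg.inv(cov) @ diff

_, W = np.linalg.eigh(cov)           # symmetric case, so eigh is enough here
cov_pca = W.T @ cov @ W              # diagonal up to rounding
diff_pca = W.T @ diff
chi2_pca = diff_pca @ np.linalg.inv(cov_pca) @ diff_pca

print(np.isclose(chi2_full, chi2_pca))   # True: no information lost when all PCs are kept

Truncating W to fewer columns then gives the reduced chi2 used above, at the price of discarding the directions with the smallest eigenvalues.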
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
new file mode 100644
index 0000000000..2f057510ab
--- /dev/null
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -0,0 +1,54 @@
+import numpy as np
+import pandas as pd
+from scipy import stats
+
+from reportengine.table import table
+
+from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import principal_components_dataset
+
+@table
+def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset):
+    """
+    Tabulate, for each dataset, the expected bias, expected variance and their
+    ratio in the space of the kept principal components.
+    """
+    records = []
+    for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset):
+        biases, variances, n_comp = pc_bias_var_dataset
+
+        try:
+            bias = np.mean(biases)
+            variance = np.mean(variances)
+            rbv = bias / variance
+            sqrt_rbv = np.sqrt(bias / variance)
+            records.append(
+                dict(
+                    dataset=str(ds),
+                    dof=n_comp,
+                    bias=bias,
+                    variance=variance,
+                    ratio=rbv,
+                    ratio_sqrt=sqrt_rbv,
+                )
+            )
+
+        except TypeError:
+            # biases and variances are None when n_comp <= 1
+            records.append(
+                dict(
+                    dataset=str(ds),
+                    dof=n_comp,
+                    bias=np.nan,
+                    variance=np.nan,
+                    ratio=np.nan,
+                    ratio_sqrt=np.nan,
+                ))
+
+
+    df = pd.DataFrame.from_records(
+        records,
+        index="dataset",
+        columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"),
+    )
+    df.columns = ["dof", "bias", "variance", "ratio", "sqrt(ratio)"]
+    return df
\ No newline at end of file
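As a back-of-the-envelope check of the numbers this table should produce: if, for a consistent closure test, the per-fit biases and mean variances both follow a chi2 distribution with dof equal to the number of kept components, the expected ratio comes out close to one. A toy sketch:

import numpy as np

rng = np.random.default_rng(1)
dof = 10
biases = rng.chisquare(dof, size=30)       # one bias per fit
variances = rng.chisquare(dof, size=30)    # one mean variance per fit

ratio = np.mean(biases) / np.mean(variances)
print(ratio, np.sqrt(ratio))               # both close to 1 for a consistent setup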
diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py
index 5b545569e2..64d26c4d69 100644
--- a/validphys2/src/validphys/closuretest/multiclosure.py
+++ b/validphys2/src/validphys/closuretest/multiclosure.py
@@ -147,6 +147,41 @@ def fits_dataset_bias_variance(
         variances.append(np.mean(calc_chi2(sqrtcov, diffs)))
     return biases, np.asarray(variances), len(law_th)
 
+@check_multifit_replicas
+def fits_normed_dataset_central_delta(
+    internal_multiclosure_dataset_loader,
+    _internal_max_reps=None,
+    _internal_min_reps=20):
+    """
+    For each fit, calculate the difference between the central expectation value and the true value,
+    normalised by the standard deviation of the replicas around the central value (different
+    for each fit but expected to vary only a little). Each central expectation value is
+    expected to be Gaussian-distributed around the true value set by the fakepdf.
+    """
+    closures_th, law_th, exp_cov, sqrtcov = internal_multiclosure_dataset_loader
+    # The dimensions here are (fit, data point, replica)
+    #import ipdb; ipdb.set_trace()
+    reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th])
+    # One could mask here some reps in order to avoid redundancy of information
+    #TODO
+
+    n_data = len(law_th)
+    n_fits = np.shape(reps)[0]
+    deltas = []
+    # there are n_fits PDF covariances, one per fit
+    # (a flag could be added here to eliminate problematic datasets)
+    for i in range(n_fits):
+        bias_diffs = np.mean(reps[i], axis=1) - law_th.central_value
+        # bias_diffs has shape (n_obs)
+        sigmas = np.sqrt(np.var(reps[i], axis=1))
+        # sigmas has shape (n_obs)
+        bias_diffs = bias_diffs / sigmas
+
+        deltas.append(bias_diffs.tolist())
+    # biases.shape = (n_fits, n_obs_cut/uncut)
+    # variances.shape = (n_fits, n_obs_cut/uncut, reps)
+    #import ipdb; ipdb.set_trace()
+    return np.asarray(deltas)
 
 def expected_dataset_bias_variance(fits_dataset_bias_variance):
     """For a given dataset calculate the expected bias and variance across fits
diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py
index f4869367c4..faf5acdbdb 100644
--- a/validphys2/src/validphys/kinematics.py
+++ b/validphys2/src/validphys/kinematics.py
@@ -13,8 +13,14 @@
 from reportengine import collect
 from reportengine.checks import check_positive
 from reportengine.table import table
+from validphys import plotutils
 from validphys import plotoptions
 from validphys.core import CutsPolicy
+from validphys.closuretest import fits_normed_dataset_central_delta
+from validphys.closuretest import dataset_xi
+from validphys.closuretest import dataset_replica_and_central_diff
+
+from reportengine.figure import figuregen
 
 log = logging.getLogger(__name__)
 
@@ -114,7 +120,6 @@ def xq2_dataset_map(commondata, cuts, internal_multiclosure_dataset_loader,
     std_devs = np.std(central_deltas, axis=0)
     means = np.mean(central_deltas, axis=0)
     xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader, False))
-    #import ipdb; ipdb.set_trace()
     # DY observables have two (x, Q) points per experimental point, so duplicate the stats
     if coords[0].shape[0] != std_devs.shape[0]:
         std_devs = np.concatenate((std_devs, std_devs))
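The statistic added to multiclosure.py above is easy to probe with synthetic inputs: when the fluctuation of each fit's central value matches the replica spread (bias equal to variance), the normalised deltas come out approximately standard normal. A toy sketch, with arrays standing in for the validphys loaders:

import numpy as np

rng = np.random.default_rng(2)
n_fits, n_obs, n_reps = 10, 50, 100
truth = np.zeros(n_obs)

# fit-level shifts of the central value, same size as the replica spread
shifts = rng.normal(size=(n_fits, n_obs))
reps = shifts[:, :, None] + rng.normal(size=(n_fits, n_obs, n_reps))

deltas = (reps.mean(axis=2) - truth) / reps.std(axis=2)
print(deltas.shape)   # (10, 50)
print(deltas.std())   # ~1 when bias and variance match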
From f9c5ade4a7525d81401455849152f3e5ba8b6890 Mon Sep 17 00:00:00 2001
From: andreab1997
Date: Tue, 20 Feb 2024 16:33:18 +0100
Subject: [PATCH 03/14] Add other functions again

---
 .../multiclosure_inconsistent_output.py | 292 +++++++++++++++++-
 1 file changed, 290 insertions(+), 2 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 2f057510ab..33310361d1 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -1,11 +1,87 @@
-import numpy as np
+"""
+multiclosure_inconsistent_output
+
+Module containing the actions which produce some output in validphys
+reports, i.e. figures or tables for (inconsistent) multiclosure
+estimators in the space of data
+
+"""
 import pandas as pd
+import numpy as np
 from scipy import stats
 
+from reportengine.figure import figure, figuregen
 from reportengine.table import table
 
+from validphys import plotutils
 from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import principal_components_dataset
 
+
+@figuregen
+def plot_bias_distribution_datasets(principal_components_bias_variance_datasets, each_dataset):
+    """
+    For each dataset, plot the distribution over fits of the biases computed in
+    the space of the kept principal components, compared to a chi2 distribution
+    with dof equal to the number of components.
+    """
+    for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset):
+        biases, variances, n_comp = pc_bias_var_dataset
+
+        try:
+            sqrt_rbv = np.sqrt(np.mean(biases) / np.mean(variances))
+            vals = np.linspace(0, 3 * n_comp, 100)
+            chi2_pdf = stats.chi2.pdf(vals, df=n_comp)
+            chi2_cdf = lambda x: stats.chi2.cdf(x, df=n_comp)
+            pvalue_ks = stats.kstest(biases, chi2_cdf).pvalue
+            fig, ax = plotutils.subplots()
+            ax.hist(biases, density=True, bins='auto', label=f"Bias {str(ds)}, p={pvalue_ks:.3f}")
+            ax.plot(vals, chi2_pdf, label=f"chi2, dof={n_comp}")
+            ax.plot([], [], 'ro', label=f"sqrt(Rbv) = {sqrt_rbv:.2f}")
+
+            ax.legend()
+
+            yield fig
+
+        except TypeError:
+            # biases and variances are None when n_comp <= 1
+            fig, ax = plotutils.subplots()
+            ax.plot([], [], label=f"Dataset: {str(ds)}")
+            ax.legend()
+            yield fig
+
+
+@figuregen
+def plot_variance_distribution_datasets(
+    principal_components_variance_distribution_datasets, each_dataset
+):
+    """
+    For each dataset and each fit, plot the distribution over replicas of the
+    variances computed in the space of the kept principal components, compared
+    to a chi2 distribution with dof equal to the number of components.
+    """
+
+    for pc_var_dataset, ds in zip(
+        principal_components_variance_distribution_datasets, each_dataset
+    ):
+        variances, n_comp = pc_var_dataset
+        try:
+            vals = np.linspace(0, 3 * n_comp, 100)
+            chi2_pdf = stats.chi2.pdf(vals, df=n_comp)
+            chi2_cdf = lambda x: stats.chi2.cdf(x, df=n_comp)
+
+            for i, var_fit in enumerate(variances):
+                pvalue_ks = stats.kstest(var_fit[0], chi2_cdf).pvalue
+
+                fig, ax = plotutils.subplots()
+                ax.hist(
+                    var_fit[0],
+                    density=True,
+                    bins='auto',
+                    label=f"Variance {str(ds)}, p={pvalue_ks:.3f}",
+                )
+                ax.plot(vals, chi2_pdf, label=f"chi2, dof={n_comp}")
+                ax.set_title(f"Fit {i}")
+                ax.legend()
+
+                yield fig
+        except TypeError:
+            # variances are None when n_comp <= 1
+            fig, ax = plotutils.subplots()
+            yield fig
+
 @table
 def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset):
     """
@@ -51,4 +127,216 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea
         columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"),
     )
     df.columns = ["dof", "bias", "variance", "ratio", "sqrt(ratio)"]
-    return df
\ No newline at end of file
+    return df
+
+
+@table
+def datasets_bias_variance_pca_table(
+    expected_datasets_fits_bias_variance_samples_pca, each_dataset
+):
+    """For each dataset calculate the expected bias and expected variance computed by
+    first projecting the covariance matrix into a lower dimensional space through PCA.
+
+    """
+    records = []
+    for ds, (bias, var, ndata) in zip(
+        each_dataset, expected_datasets_fits_bias_variance_samples_pca
+    ):
+        records.append(
+            dict(
+                dataset=str(ds),
+                ndata=ndata,
+                bias=bias,
+                variance=var,
+                ratio=bias / var,
+                ratio_sqrt=np.sqrt(bias / var),
+            )
+        )
+    df = pd.DataFrame.from_records(
+        records,
+        index="dataset",
+        columns=("dataset", "ndata", "bias", "variance", "ratio", "ratio_sqrt"),
+    )
+    df.columns = ["ndata", "bias", "variance", "ratio", "sqrt(ratio)"]
+    return df
+
+
+@figure
+def plot_sqrt_ratio_distribution_pca(dataset_fits_ratio_bias_variance_samples_pca):
+    """Histogram of the per-fit sqrt(bias/variance) ratios computed with the
+    PCA-reduced PDF covariance matrix."""
+    sqrt_ratios = dataset_fits_ratio_bias_variance_samples_pca
+    fig, ax = plotutils.subplots()
+    ax.hist(sqrt_ratios, bins='auto', density=True, alpha=0.5, label="sqrt_ratio")
+    ax.legend()
+    return fig
+
+
+@figure
+def plot_variance_distribution_multi(multi_dataset_fits_bias_replicas_variance_samples, dataspecs):
+    """
+    Histogram of the distribution of the variances (k), defined as the
+    distance between the central replica and replica (k) in units of the
+    experimental covariance matrix.
+    If more than one group of dataspecs is defined (e.g. consistent and
+    inconsistent fits), a comparison of these is plotted.
+    """
+    fig, ax = plotutils.subplots()
+    for (_, variance_dist, _), spec in zip(
+        multi_dataset_fits_bias_replicas_variance_samples, dataspecs
+    ):
+        label = spec['speclabel']
+
+        ax.hist(variance_dist, bins='auto', density=True, alpha=0.5, label=label)
+
+    ax.legend()
+    return fig
+
+
+@figure
+def plot_variance_distribution_pca_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs):
+    """
+    Like `plot_variance_distribution_multi`, but for the variance computed with the
+    covariance matrix predicted by the PDFs (and reduced with PCA).
+    """
+    return plot_variance_distribution_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs)
+
+
+@figure
+def plot_bias_distribution_multi(multi_dataset_fits_bias_replicas_variance_samples, dataspecs):
+    """
+    Histogram of the distribution of the biases (l), defined as the
+    distance between the central replica and the underlying law in units of the
+    experimental covariance matrix.
+    If more than one group of dataspecs is defined (e.g. consistent and
+    inconsistent fits), a comparison of these is plotted.
+    """
+    fig, ax = plotutils.subplots()
+    for (bias_dist, _, _), spec in zip(
+        multi_dataset_fits_bias_replicas_variance_samples, dataspecs
+    ):
+        label = spec['speclabel']
+
+        ax.hist(bias_dist, bins='auto', density=True, alpha=0.5, label=label)
+
+    ax.legend()
+    return fig
+
+
+@figure
+def plot_bias_distribution_pca_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs):
+    """
+    Like `plot_bias_distribution_multi`, but for the bias computed with the
+    covariance matrix predicted by the PDFs (and reduced with PCA).
+    """
+    return plot_bias_distribution_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs)
+
+
+@figure
+def plot_sqrt_ratio_distribution_pca_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs):
+    """
+    Histogram of the distribution of the sqrt ratio of bias and variance computed with
+    the estimated "PDF" covariance matrix (reduced with PCA).
+    If more than one group of dataspecs is defined (e.g. consistent and
+    inconsistent fits), a comparison of these is plotted.
+    """
+    fig, ax = plotutils.subplots()
+    ratios = []
+    labels = []
+    for (bias_dist, variance_dist, _), spec in zip(
+        multi_dataset_fits_bias_variance_samples_pca, dataspecs
+    ):
+        labels.append(spec['speclabel'])
+        ratios.append(np.sqrt(bias_dist / variance_dist))
+
+    ax.hist(ratios, bins='auto', density=True, label=labels)
+    ax.legend()
+    return fig
+
+
+@figure
+def plot_multi_bias_distribution_bootstrap(multi_fits_bootstrap_dataset_bias_variance, dataspecs):
+    """
+    Histogram of the distribution of the biases obtained by bootstrapping with
+    `fits_bootstrap_dataset_bias_variance` over the existing fits.
+
+    If more than one group of dataspecs is defined (e.g. consistent and
+    inconsistent fits), a comparison of these is plotted.
+    """
+    fig, ax = plotutils.subplots()
+    for (bias, _), spec in zip(multi_fits_bootstrap_dataset_bias_variance, dataspecs):
+        label = spec['speclabel']
+
+        ax.hist(bias, bins='auto', density=True, alpha=0.5, label=label)
+
+    ax.legend()
+    return fig
+
+
+@figure
+def plot_multi_ratio_bias_variance_distribution_bootstrap(
+    multi_fits_bootstrap_dataset_bias_variance, dataspecs
+):
+    """
+    Histogram of the bias-variance ratio distribution obtained by bootstrapping bias
+    and variance with `fits_bootstrap_dataset_bias_variance`.
+
+    If more than one group of dataspecs is defined (e.g. consistent and
+    inconsistent fits), a comparison of these is plotted.
+    """
+    fig, ax = plotutils.subplots()
+    for (bias, variance), spec in zip(multi_fits_bootstrap_dataset_bias_variance, dataspecs):
+        ratio = bias / variance
+        label = spec['speclabel']
+
+        ax.hist(ratio, bins='auto', density=True, alpha=0.5, label=label)
+
+    ax.legend()
+    return fig
+
+
+@figuregen
+def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf):
+    """
+    For each dataset, scan the explained variance ratio (EVR) and plot the L2
+    condition number of the PCA-reduced covariance matrix together with the
+    number of kept components (degrees of freedom).
+    """
+
+    evr_range = np.linspace(0.90, 0.995, 10)
+
+    for ds in each_dataset:
+        l2_cond = []
+        dof = []
+
+        for evr in evr_range:
+            _, pc_reps, n_comp = principal_components_dataset(
+                ds, fits_pdf, variancepdf, explained_variance_ratio=evr
+            )
+
+            covmat_pdf = np.cov(pc_reps)
+
+            # compute the condition number
+            l2_cond.append(np.linalg.cond(covmat_pdf))
+            dof.append(n_comp)
+
+        fig, ax1 = plotutils.subplots(figsize=(15, 4))
+        ax1.plot(evr_range, l2_cond, "b-o", label="condition number")
+        ax1.set_title(f"Dataset: {str(ds)}")
+        ax1.set_xlabel("EVR")
+        ax1.set_ylabel("Covariance Matrix Condition Number", color="b")
+        ax1.tick_params('y', color="b")
+
+        ax2 = ax1.twinx()
+
+        # plot the degrees of freedom on the right y-axis
+        ax2.plot(evr_range, dof, 'r-o', label="dof")
+        ax2.set_ylabel('Degrees of freedom', color="r")
+        ax2.tick_params('y', color="r")
+
+        # merge the legends of both axes
+        lines1, labels1 = ax1.get_legend_handles_labels()
+        lines2, labels2 = ax2.get_legend_handles_labels()
+        lines = lines1 + lines2
+        labels = labels1 + labels2
+        ax1.legend(lines, labels, loc='upper left')
+
+        yield fig
\ No newline at end of file
From 813f019697f35a2a95cfe120c69cb1db527a205e Mon Sep 17 00:00:00 2001
From: andreab1997
Date: Wed, 21 Feb 2024 11:49:50 +0100
Subject: [PATCH 04/14] Add parse_variancepdf

---
 validphys2/src/validphys/config.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py
index 0add44c70c..042739b73e 100644
--- a/validphys2/src/validphys/config.py
+++ b/validphys2/src/validphys/config.py
@@ -1036,6 +1036,9 @@ def produce_theoryid(self, theory):
         "Expected that key to be present.")
         return theory['theoryid']
     """
+    def parse_variancepdf(self, name):
+        """PDF set used to estimate the variance of the multiclosure fits."""
+        return self.parse_pdf(name)
 
     def produce_pdf_id(self, pdf) -> str:
         """Return a string containing the PDF's LHAPDF ID"""
         return pdf.name
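A hypothetical way to exercise the new actions end to end is through the validphys API; the fit and PDF set names below are placeholders, and the exact set of resource keys depends on the runcard machinery rather than on anything fixed by this series:

from validphys.api import API

# placeholders for real multiclosure fit names and a variance PDF set
df = API.table_bias_variance_datasets(
    fits=["mc_fit_01", "mc_fit_02"],
    fit="mc_fit_01",
    dataset_inputs={"from_": "fit"},
    use_cuts="fromfit",
    theory={"from_": "fit"},
    theoryid={"from_": "theory"},
    variancepdf="mc_variance_pdf",   # resolved by parse_variancepdf above
)
print(df)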
the t0 covmat.""" + return self.parse_pdf(name) def produce_pdf_id(self, pdf) -> str: """Return a string containing the PDF's LHAPDF ID""" From af4a08f063524a011ecd209fd86051f88a7a4ea3 Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Wed, 21 Feb 2024 15:11:04 +0100 Subject: [PATCH 09/14] Remove ipdbs and make sure multiclosure_inconsistent work --- validphys2/src/validphys/closuretest/__init__.py | 2 ++ validphys2/src/validphys/closuretest/multiclosure.py | 2 -- validphys2/src/validphys/config.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/validphys2/src/validphys/closuretest/__init__.py b/validphys2/src/validphys/closuretest/__init__.py index 02950f15d6..38bc1228c5 100644 --- a/validphys2/src/validphys/closuretest/__init__.py +++ b/validphys2/src/validphys/closuretest/__init__.py @@ -11,3 +11,5 @@ from validphys.closuretest.multiclosure_pdf_output import * from validphys.closuretest.multiclosure_preprocessing import * from validphys.closuretest.multiclosure_pseudodata import * +from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import * +from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent_output import * diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py index 64d26c4d69..ff71e3caf0 100644 --- a/validphys2/src/validphys/closuretest/multiclosure.py +++ b/validphys2/src/validphys/closuretest/multiclosure.py @@ -160,7 +160,6 @@ def fits_normed_dataset_central_delta( """ closures_th, law_th, exp_cov, sqrtcov = internal_multiclosure_dataset_loader # The dimentions here are (fit, data point, replica) - #import ipdb; ipdb.set_trace() reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th]) # One could mask here some reps in order to avoid redundancy of information #TODO @@ -180,7 +179,6 @@ def fits_normed_dataset_central_delta( deltas.append(bias_diffs.tolist()) # biases.shape = (n_fits, n_obs_cut/uncut) # variances.shape = (n_fits, n_obs_cut/uncut, reps) - #import ipdb; ipdb.set_trace() return np.asarray(deltas) def expected_dataset_bias_variance(fits_dataset_bias_variance): diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 6caf36531b..14b2123dc7 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -1041,7 +1041,7 @@ def produce_theoryid(self, theory): def parse_variancepdf(self, name): """PDF set used to generate the t0 covmat.""" return self.parse_pdf(name) - + def produce_pdf_id(self, pdf) -> str: """Return a string containing the PDF's LHAPDF ID""" return pdf.name From 0b1e1e4836d8c309bfb9bb1053473decd9e0c9fa Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Thu, 22 Feb 2024 11:39:43 +0100 Subject: [PATCH 10/14] Add multiclosure comparefits --- pyproject.toml | 1 + .../__init__.py | 3 + .../comparefits_template.md | 121 +++++++++ .../data.md | 5 + .../exponents.md | 15 ++ .../lumi.md | 9 + .../multiclosure_analysis.yaml | 244 ++++++++++++++++++ .../multiclosure_template.md | 14 + .../pca_template.md | 2 + .../pdf.md | 24 ++ .../single_point_template.md | 2 + .../src/validphys/scripts/vp_multiclosure.py | 159 ++++++++++++ 12 files changed, 599 insertions(+) create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/data.md create 
mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md create mode 100644 validphys2/src/validphys/scripts/vp_multiclosure.py diff --git a/pyproject.toml b/pyproject.toml index 73b5a6cda4..d1f148e07b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ wiki-upload = "validphys.scripts.wiki_upload:main" vp-get = "validphys.scripts.vp_get:main" vp-comparefits = "validphys.scripts.vp_comparefits:main" vp-fitrename = "validphys.scripts.vp_fitrename:main" +vp-multiclosure = "validphys.scripts.vp_multiclosure:main" vp-checktheory = "validphys.scripts.vp_checktheory:main" vp-rebuild-data = "validphys.scripts.vp_rebuild_data:main" vp-pdfrename = "validphys.scripts.vp_pdfrename:main" diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py b/validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py new file mode 100644 index 0000000000..9a64b5152a --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py @@ -0,0 +1,3 @@ +import pathlib + +template_path = pathlib.Path(__file__).with_name('multiclosure_analysis.yaml') diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md new file mode 100644 index 0000000000..bafb0f5fa5 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md @@ -0,0 +1,121 @@ +%NNPDF report comparing {@ current fit_id @} and {@ reference fit_id @} + +Summary +------- + +We are comparing: + + - {@ current fit @} (`{@ current fit_id @}`): {@ current description @} + - {@ reference fit @} (`{@ reference fit_id @}`): {@ reference description @} + + +{@ summarise_fits @} + + +t0 losses +--------- +{@ dataspecs::t0_info t0_chi2_info_table @} + +Theory covariance summary +------------------------- +{@summarise_theory_covmat_fits@} + +Dataset properties +------------------ +{@current fit_datasets_properties_table@} + +Distances +--------- +{@with Scales@} +### {@Scaletitle@} +{@with Normalize::Basespecs::PDFscalespecs::Distspecs@} +#### {@Basistitle@}, {@Xscaletitle@} +{@plot_pdfdistances@} +{@plot_pdfvardistances@} +{@endwith@} +{@endwith@} + +PDF arc-lengths +--------------- +{@Basespecs plot_arc_lengths@} + +Sum rules +--------- +{@with pdfs@} +### {@pdf@} + +#### Known sum rules + +{@sum_rules_table@} + +#### Unknown sum rules + +{@unknown_sum_rules_table@} + +{@endwith@} + +PDF plots +--------- +{@with Scales@} +[Plots at {@Scaletitle@}]({@pdf_report report@}) +{@endwith@} + +Luminosities +------------ +{@with Energies@} +[Plots at {@Energytitle@}]({@lumi_report report@}) +{@endwith@} + +Effective exponents +------------------- +[Detailed information]({@exponents_report report@}) + +Training lengths +---------------- +{@fits plot_training_length@} + +Training-validation +------------------- +{@fits 
plot_training_validation@} + +{@with DataGroups@} +$\chi^2$ by {@processed_metadata_group@} +---------------------------------------- +{@plot_fits_groups_data_chi2@} +{@endwith@} + + +$\chi^2$ by dataset +------------------- +### Plot +{@plot_fits_datasets_chi2@} +### Table +{@ProcessGroup fits_chi2_table(show_total=true)@} + + +{@with DataGroups@} +$\phi$ by {@processed_metadata_group@} +-------------------------------------- +{@plot_fits_groups_data_phi@} +{@endwith@} + +Dataset plots +------------- +{@with matched_datasets_from_dataspecs@} +[Plots for {@dataset_name@}]({@dataset_report report@}) +{@endwith@} + +Positivity +---------- +{@with matched_positivity_from_dataspecs@} +{@plot_dataspecs_positivity@} +{@endwith@} + +Dataset differences and cuts +---------------------------- +{@print_dataset_differences@} +{@print_different_cuts@} + +Code versions +------------- +{@fits_version_table@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/data.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/data.md new file mode 100644 index 0000000000..c99e62f3d4 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/data.md @@ -0,0 +1,5 @@ +% Data-theory comparison for {@dataset_name@} +# Absolute +{@plot_fancy_dataspecs@} +# Normalized +{@Datanorm plot_fancy_dataspecs@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md new file mode 100644 index 0000000000..9736e70775 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md @@ -0,0 +1,15 @@ +%NNPDF report comparing {@ current fit @} and {@ reference fit @} + +# Effective preprocessing information + +## Plots +### alpha exponent +{@current::basisfromfit plot_alpha_eff@} +### beta exponent +{@current::basisfromfit plot_beta_eff@} + +## Tables +{@with fits@} +### Next effective exponents for {@fit@} +{@effective_exponents_table@} +{@endwith@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md new file mode 100644 index 0000000000..a077a4a21f --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md @@ -0,0 +1,9 @@ +%NNPDF report comparing {@ current fit @} and {@ reference fit @} + +# Luminosity plots + +{@with Normalize@} +{@with lumi_channels@} +{@plot_lumi1d@} +{@endwith@} +{@endwith@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml new file mode 100644 index 0000000000..dd4a2b92a5 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml @@ -0,0 +1,244 @@ +meta: + title: Multiclosure report + author: AB + keywords: [inconsistencies, multiclosure] + +#COMPAREFITS SETTINGS + +positivity: + from_: fit + +description: + from_: fit + +dataset_inputs: + from_: fit + +t0_info: + - use_t0: True + datacuts: + from_: fit + t0pdfset: + from_: datacuts +compare_settings: + current: + fit: {id: "240210_mnc_dis_ict_lam02"} + pdf: {id: "240210_mnc_dis_ict_lam02", label: "Current Fit"} + theory: + from_: fit + theoryid: + from_: theory + speclabel: "Current Fit" + + reference: + fit: {id: "240210_mnc_dis_ict_lam04"} + pdf: {id: "240210_mnc_dis_ict_lam04", label: "Reference Fit" } + theory: + from_: fit + theoryid: + from_: theory 
+ speclabel: "Reference Fit" + + pdfs: + - from_: current + - from_: reference + + fits: + - from_: current + - from_: reference + + use_cuts: "fromfit" + use_weights_in_covmat: False + + Q: 1.651 + + Scales: + - Q: 1.651 + Scaletitle: "Q = 1.65 GeV" + - Q: 100 + Scaletitle: "Q = 100 GeV" + + PDFnormalize: + - Normtitle: Absolute + + - normalize_to: 2 + Normtitle: Ratio + + Basespecs: + - basis: CCBAR_ASYMM_FLAVOUR + Basistitle: Flavour basis + - basis: CCBAR_ASYMM + Basistitle: Evolution basis + + PDFscalespecs: + - xscale: log + Xscaletitle: Log + - xscale: linear + Xscaletitle: Linear + + Energies: + - sqrts: 13000 + Energytitle: "13 TeV" + + lumi_channels: + - gg + - gq + - qq + - qqbar + - uubar + - ddbar + - udbar + - dubar + + Distspecs: + - ymin: 0 + ymax: 20 + + pos_use_kin: True + + dataset_report: + meta: Null + template: data.md + + pdf_report: + meta: Null + template: pdf.md + + exponents_report: + meta: Null + template: exponents.md + + lumi_report: + meta: Null + template: lumi.md + + dataspecs: + - theoryid: + from_: current + pdf: + from_: current + fit: + from_: current + speclabel: + from_: current + + - theoryid: + from_: reference + pdf: + from_: reference + fit: + from_: reference + speclabel: + from_: reference + + Normalize: + normalize_to: 2 + + Datanorm: + normalize_to: data + + DataGroups: + - metadata_group: nnpdf31_process + - metadata_group: experiment + + ProcessGroup: + metadata_group: nnpdf31_process + +################################################################## +#OTHER ANALYSIS SETTINGS + +lambdavalues: + - label: "LAMBDA02" + fit: 230124_dis_ict_lam02_fs_122996 + dataset_inputs: + from_: fit + theory: + from_: fit + theoryid: + from_: theory + use_t0: True + t0pdfset: 210223-mw-000_fakepdf + use_cuts: "internal" + explained_variance_ratio: 0.99 + variancepdf: 230708_mnc_dis_pt1_ct_1000 + fits: + - 230124_dis_ict_lam02_fs_122996 + - 230124_dis_ict_lam02_fs_152326 + - 230124_dis_ict_lam02_fs_196144 + - 230124_dis_ict_lam02_fs_200988 + - 230124_dis_ict_lam02_fs_257767 + - 230124_dis_ict_lam02_fs_31144 + - 230124_dis_ict_lam02_fs_374984 + - 230124_dis_ict_lam02_fs_383440 + - 230124_dis_ict_lam02_fs_394255 + - 230124_dis_ict_lam02_fs_431574 + - 230124_dis_ict_lam02_fs_498979 + - 230124_dis_ict_lam02_fs_504344 + - 230124_dis_ict_lam02_fs_52722 + - 230124_dis_ict_lam02_fs_54647 + - 230124_dis_ict_lam02_fs_562489 + - 230124_dis_ict_lam02_fs_57171 + - 230124_dis_ict_lam02_fs_661665 + - 230124_dis_ict_lam02_fs_683404 + - 230124_dis_ict_lam02_fs_752096 + - 230124_dis_ict_lam02_fs_760929 + - 230124_dis_ict_lam02_fs_786382 + - 230124_dis_ict_lam02_fs_834248 + - 230124_dis_ict_lam02_fs_868248 + - 230124_dis_ict_lam02_fs_897760 + - 230124_dis_ict_lam02_fs_916690 + - 230124_dis_ict_lam02_fs_966390 + - label: "LAMBDA04" + fit: 250124_dis_ict_lam04_fs_102661 + dataset_inputs: + from_: fit + theory: + from_: fit + theoryid: + from_: theory + use_t0: True + t0pdfset: 210223-mw-000_fakepdf + use_cuts: "internal" + explained_variance_ratio: 0.99 + variancepdf: 230708_mnc_dis_pt1_ct_1000 + fits: + - 250124_dis_ict_lam04_fs_102661 + - 250124_dis_ict_lam04_fs_166247 + - 250124_dis_ict_lam04_fs_194869 + - 250124_dis_ict_lam04_fs_235399 + - 250124_dis_ict_lam04_fs_259799 + - 250124_dis_ict_lam04_fs_280552 + - 250124_dis_ict_lam04_fs_362871 + - 250124_dis_ict_lam04_fs_38346 + - 250124_dis_ict_lam04_fs_392751 + - 250124_dis_ict_lam04_fs_394227 + - 250124_dis_ict_lam04_fs_473619 + - 250124_dis_ict_lam04_fs_475803 + - 250124_dis_ict_lam04_fs_5238 + - 
250124_dis_ict_lam04_fs_544453 + - 250124_dis_ict_lam04_fs_547187 + - 250124_dis_ict_lam04_fs_561201 + - 250124_dis_ict_lam04_fs_630491 + - 250124_dis_ict_lam04_fs_691489 + - 250124_dis_ict_lam04_fs_717426 + - 250124_dis_ict_lam04_fs_833288 + - 250124_dis_ict_lam04_fs_85029 + - 250124_dis_ict_lam04_fs_856511 + - 250124_dis_ict_lam04_fs_895365 + - 250124_dis_ict_lam04_fs_902027 + - 250124_dis_ict_lam04_fs_911964 + - 250124_dis_ict_lam04_fs_99031 + + +template: multiclosure_template.md +comparefits_report: + meta: Null + template: comparefits_template.md +single_point_report: + meta: Null + template: single_point_template.md +pca_report: + meta: Null + template: pca_template.md +actions_: + - report(main=true) diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md new file mode 100644 index 0000000000..7ad062bd09 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md @@ -0,0 +1,14 @@ +{@ with compare_settings @} +[Comparefits report]({@comparefits_report report@}) +{@ endwith @} + +Single data point analysis +-------------------------- +{@with lambdavalues@} +[Lambda value of {@label@}]({@single_point_report report@}) +{@endwith@} +Bias Variance Ratio Table +------------------------- +{@with lambdavalues@} +[Lambda value of {@label@}]({@pca_report report@}) +{@endwith@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md new file mode 100644 index 0000000000..6269098614 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md @@ -0,0 +1,2 @@ +# PCA analysis +{@table_bias_variance_datasets@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md new file mode 100644 index 0000000000..54cdc4bd87 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md @@ -0,0 +1,24 @@ +%NNPDF report comparing {@ current fit @} and {@ reference fit @} + +# PDF plots + +## PDF comparison +{@with PDFnormalize@} +### {@Normtitle@} +{@with Basespecs@} +#### {@Basistitle@} +{@with PDFscalespecs@} +##### {@Xscaletitle@} +{@plot_pdfs@} +{@endwith@} +{@endwith@} +{@endwith@} + +## PDF replicas +{@with Basespecs@} +#### {@Basistitle@} +{@with PDFscalespecs@} +##### {@Xscaletitle@} +{@plot_pdfreplicas@} +{@endwith@} +{@endwith@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md new file mode 100644 index 0000000000..8a8e5e8808 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md @@ -0,0 +1,2 @@ +# Single data point analysis +{@xq2_data_prcs_maps@} diff --git a/validphys2/src/validphys/scripts/vp_multiclosure.py b/validphys2/src/validphys/scripts/vp_multiclosure.py new file mode 100644 index 0000000000..5d1aa7013a --- /dev/null +++ b/validphys2/src/validphys/scripts/vp_multiclosure.py @@ -0,0 +1,159 @@ +import sys +import os +import logging + +#TODO: Look into making these lazy imports +import prompt_toolkit +from prompt_toolkit.completion import WordCompleter + +from reportengine.compat import yaml +from reportengine.colors import t + +from validphys.app import App +from 
validphys.loader import RemoteLoader +from validphys import compareinconsistentclosuretemplates +from validphys.promptutils import confirm, KeywordsWithCache + +log = logging.getLogger(__name__) + +CURRENT_FIT_LABEL_DEFAULT = "Current Fit" +REFERENCE_FIT_LABEL_DEFAULT = "Reference Fit" + + + +class CompareFitApp(App): + def add_positional_arguments(self, parser): + parser.add_argument( + 'current_fit', + default=None, + nargs='?', + help="The fit to produce the report for.", + ) + parser.add_argument( + 'reference_fit', + default=None, + nargs='?', + help="The fit to compare with.") + # Group together mandatory arguments that are not positional + mandatory = parser.add_argument_group("mandatory", "Mandatory command line arguments") + mandatory.add_argument( + '--title', help="The title that will be indexed with the report.") + mandatory.add_argument('--author', help="The author of the report.") + mandatory.add_argument( + '--keywords', nargs='+', help="keywords to index the report with.") + parser.add_argument( + '--current_fit_label', + nargs='?', + default=CURRENT_FIT_LABEL_DEFAULT, + help="The label for the fit that the report is being produced for.", + ) + parser.add_argument( + '--reference_fit_label', + nargs='?', + default=REFERENCE_FIT_LABEL_DEFAULT, + help="The label for the fit that is being compared to.") + parser.add_argument( + '--lambdasettings_path', + help="The path to the yaml file containing the settings for the lambdavalues report.") + + parser.set_defaults() + + def try_complete_args(self): + args = self.args + argnames = ( + 'current_fit', 'reference_fit', 'title', 'author', 'keywords') + optionalnames = ( + 'current_fit_label', 'reference_fit_label') + badargs = [argname for argname in argnames if not args[argname]] + bad = badargs + if bad: + sys.exit(f"The following arguments are required: {bad}") + texts = '\n'.join( + f' {argname.replace("_", " ").capitalize()}: {args[argname]}' + for argname in [*argnames, *optionalnames]) + log.info(f"Starting NNPDF fit comparison:\n{texts}") + + def get_commandline_arguments(self, cmdline=None): + args = super().get_commandline_arguments(cmdline) + # This is needed because the environment wants to know how to resolve + # the relative paths to find the templates. 
Best to have the template + # look as much as possible as a runcard passed from the command line + args['config_yml'] = compareinconsistentclosuretemplates.template_path + return args + + def complete_mapping(self): + args = self.args + autosettings = {} + shortsettings = {} + autosettings['meta'] = { + 'title': args['title'], + 'author': args['author'], + 'keywords': args['keywords'] + } + currentmap = {'id': args['current_fit'], 'label': args['current_fit_label']} + shortsettings['current'] = { + 'fit': currentmap, + 'pdf': currentmap, + 'theory': { + 'from_': 'fit' + }, + 'theoryid': { + 'from_': 'theory' + }, + 'speclabel': args['current_fit_label'] + } + refmap = {'id': args['reference_fit'], 'label': args['reference_fit_label']} + shortsettings['reference'] = { + 'fit': refmap, + 'pdf': refmap, + 'theory': { + 'from_': 'fit' + }, + 'theoryid': { + 'from_': 'theory' + }, + 'speclabel': args['reference_fit_label'] + } + autosettings["compare_settings"] = shortsettings + return autosettings + + def complete_lambdavaluesmapping(self): + args = self.args + # opening conf file + with open(args['lambdasettings_path']) as f: + settings = yaml.safe_load(f) + autosettings = {} + list_lambdas = [] + for lambdas in settings: + lambdasetting = {} + for set in ["variancepdf", "t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]: + lambdasetting[set] = lambdas[set] + list_lambdas.append(lambdasetting) + autosettings["lambdavalues"] = list_lambdas + return autosettings + + def get_config(self): + self.try_complete_args() + #No error handling here because this is our internal file + with open(self.args['config_yml']) as f: + #TODO: Ideally this would load round trip but needs + #to be fixed in reportengine. + c = yaml.safe_load(f) + complete_mapping = self.complete_mapping() + complete_mapping.update(self.complete_lambdavaluesmapping()) + c["meta"] = complete_mapping["meta"] + for lambdas, lambdas_mapping in zip(c["lambdavalues"], complete_mapping["lambdavalues"]): + for set in ["variancepdf", "t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]: + lambdas[set] = lambdas_mapping[set] + for set in ["current", "reference"]: + c["compare_settings"][set] = complete_mapping["compare_settings"][set] + return self.config_class(c, environment=self.environment) + + +def main(): + a = CompareFitApp() + a.main() + + +if __name__ == '__main__': + main() From 133747610dbde87088e3b31d18d84fb4f9007ee7 Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Mon, 26 Feb 2024 10:48:48 +0100 Subject: [PATCH 11/14] Add sklearn to pyproject --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index d1f148e07b..0562e82f22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,6 +97,7 @@ fiatlux = {version = "*", optional = true} # without lhapdf pdfflow = {version = "^1.2.1", optional = true} lhapdf-management = {version = "^0.5", optional = true} +scikit-learn = "^1.4.1" # Optional dependencies [tool.poetry.extras] From dfa13a2934d08677270a9c76403902fa8adcae9e Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Mon, 26 Feb 2024 11:33:48 +0100 Subject: [PATCH 12/14] Scikit added to conda --- conda-recipe/meta.yaml | 1 + pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index a13244afa0..318febc365 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -59,6 +59,7 @@ requirements: - fiatlux - frozendict # needed for caching of data loading - curio >=1.0 # reportengine 
uses it but it's not in its dependencies + - scikit-learn = "^1.4.1" test: requires: diff --git a/pyproject.toml b/pyproject.toml index 0562e82f22..99904644c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,7 @@ pandas = "<2" numpy = "*" validobj = "*" prompt_toolkit = "*" +scikit-learn = "^1.4.1" frozendict = "*" # validphys: needed for caching of data loading # Reportengine (and its dependencies) need to be installed in a bit more manual way reportengine = { git = "https://github.com/NNPDF/reportengine", rev = "3bb2b1d"} @@ -97,7 +98,6 @@ fiatlux = {version = "*", optional = true} # without lhapdf pdfflow = {version = "^1.2.1", optional = true} lhapdf-management = {version = "^0.5", optional = true} -scikit-learn = "^1.4.1" # Optional dependencies [tool.poetry.extras] From 7b1183fc6c5b53edb56f10bee05d3c8baf7be088 Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Mon, 26 Feb 2024 11:34:31 +0100 Subject: [PATCH 13/14] Correct conda recipe --- conda-recipe/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 318febc365..e07586bf79 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -59,7 +59,7 @@ requirements: - fiatlux - frozendict # needed for caching of data loading - curio >=1.0 # reportengine uses it but it's not in its dependencies - - scikit-learn = "^1.4.1" + - scikit-learn >= 1.4.1 test: requires: From a4eabd8d776464cabce4ce3ed0635675a57c65f5 Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Mon, 26 Feb 2024 11:37:30 +0100 Subject: [PATCH 14/14] Correct conda recipe --- conda-recipe/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index e07586bf79..bac4fabb10 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -59,7 +59,7 @@ requirements: - fiatlux - frozendict # needed for caching of data loading - curio >=1.0 # reportengine uses it but it's not in its dependencies - - scikit-learn >= 1.4.1 + - scikit-learn >=1.4.1 test: requires: