"""
closuretest/multiclosure.py
Module containing all of the statistical estimators which are
averaged across multiple fits or a single replica proxy fit. The actions
in this module are used to produce results which are plotted in
``multiclosure_output.py``
"""
import numpy as np
import dataclasses
from reportengine import collect
import validphys
from validphys.calcutils import calc_chi2
from validphys.checks import check_use_t0
from validphys.closuretest.closure_checks import (
check_fits_areclosures,
check_fits_different_filterseed,
check_fits_underlying_law_match,
check_multifit_replicas,
check_t0pdfset_matches_multiclosure_law,
)
from validphys.results import ThPredictionsResult
@dataclasses.dataclass
class MulticlosureLoader:
    """
    Stores the basic information for a multiclosure study.

    Attributes
    ----------
    closure_theories: list
        List of validphys.results.ThPredictionsResult objects for each fit.
    law_theory: validphys.results.ThPredictionsResult
        ThPredictionsResult object for the underlying law.
    covmat_reps_mean: np.array
        Covariance matrix of the theory predictions averaged over fits.
    """

    # theory predictions for each closure fit
    closure_theories: list
    # theory predictions of the PDF used to generate the pseudodata
    law_theory: ThPredictionsResult
    # mean over fits of the per-fit replica covariance matrices
    covmat_reps_mean: np.array
def mean_covmat_multiclosure(closure_theories: list) -> np.array:
    """
    Compute the 'PDF' covariance matrix of the replica predictions for each
    multiclosure fit and average the matrices over fits.

    Parameters
    ----------
    closure_theories: list
        list of ThPredictionsResult, one per closure fit

    Returns
    -------
    covmat_reps_mean: np.array
        mean over fits of the per-fit replica covariance matrices
    """
    # shape (n_fits, n_data, n_replicas)
    reps = np.asarray([th.error_members for th in closure_theories])
    # covariance over replicas for each fit; bias=True normalises by
    # n_replicas rather than n_replicas - 1
    _covmats = np.array([np.cov(rep, rowvar=True, bias=True) for rep in reps])
    # average the per-fit covariance matrices
    return np.mean(_covmats, axis=0)
@check_fits_underlying_law_match
@check_fits_areclosures
@check_fits_different_filterseed
@check_use_t0
@check_t0pdfset_matches_multiclosure_law
def multiclosure_dataset_loader(
    dataset: validphys.core.DataSetSpec,
    fits_pdf: list,
    multiclosure_underlyinglaw: validphys.core.PDF,
    t0set: validphys.core.PDF,
) -> MulticlosureLoader:
    """
    Internal function for loading multiple theory predictions and underlying law
    for a given dataset. This function is used to avoid memory issues when
    caching the load function of a group of datasets.

    Parameters
    ----------
    dataset: (DataSetSpec, DataGroupSpec)
        dataset for which the theory predictions and t0 covariance matrix
        will be loaded. Note that due to the structure of `validphys` this
        function can be overloaded to accept a DataGroupSpec.
    fits_pdf: list
        list of PDF objects produced from performing multiple closure tests
        fits. Each fit should have a different filterseed but the same
        underlying law used to generate the pseudodata.
    multiclosure_underlyinglaw: PDF
        PDF used to generate the pseudodata which the closure tests fitted. This
        is inferred from the fit runcards.
    t0set: validphys.core.PDF
        t0 pdfset, is only used to check that the underlying law matches the t0set.

    Returns
    -------
    MulticlosureLoader
        A dataclass storing the theory predictions for the fits and the underlying law.

    Notes
    -----
    This function replicates behaviour found elsewhere in validphys, the reason
    for this is that due to the default caching behaviour one can run into
    memory issues when loading the theory predictions for the amount of fits
    typically used in these studies.
    """
    # one set of theory predictions per closure fit
    closure_theories = [ThPredictionsResult.from_convolution(pdf, dataset) for pdf in fits_pdf]
    # predictions of the underlying law used to generate the pseudodata
    law_theory = ThPredictionsResult.from_convolution(multiclosure_underlyinglaw, dataset)
    # average over fits of the per-fit replica covariance matrices
    covmat_reps_mean = mean_covmat_multiclosure(closure_theories)
    return MulticlosureLoader(
        closure_theories=closure_theories,
        law_theory=law_theory,
        covmat_reps_mean=covmat_reps_mean,
    )
@check_fits_underlying_law_match
@check_fits_areclosures
@check_fits_different_filterseed
@check_t0pdfset_matches_multiclosure_law
@check_use_t0
def multiclosure_data_loader(
    data: validphys.core.DataGroupSpec,
    fits_pdf: list,
    multiclosure_underlyinglaw: validphys.core.PDF,
    t0set: validphys.core.PDF,
) -> MulticlosureLoader:
    """Like `multiclosure_dataset_loader` except for all data"""
    # DataGroupSpec is accepted by the dataset loader by overloading
    return multiclosure_dataset_loader(data, fits_pdf, multiclosure_underlyinglaw, t0set)
def eigendecomposition(covmat: np.array) -> tuple:
    """
    Compute the eigendecomposition of a covariance matrix and return the
    eigenvalues, eigenvectors and the normalized eigenvalues ordered from
    largest to smallest.

    Parameters
    ----------
    covmat: np.array
        covariance matrix (symmetric)

    Returns
    -------
    tuple
        3D tuple containing the eigenvalues, eigenvectors and the normalized
        eigenvalues.
        Note that the eigenvalues are sorted from largest to smallest.
    """
    # eigh is appropriate for symmetric matrices and returns real eigenvalues
    eigvals, eigvecs = np.linalg.eigh(covmat)
    # eigh returns ascending order: reverse to sort largest -> smallest
    idx = np.argsort(eigvals)[::-1]
    eigvecs = eigvecs[:, idx]
    eigvals = eigvals[idx]
    # fraction of the total variance carried by each eigenvalue
    eigvals_norm = eigvals / eigvals.sum()
    return eigvals, eigvecs, eigvals_norm
@dataclasses.dataclass
class RegularizedMulticlosureLoader(MulticlosureLoader):
    """
    MulticlosureLoader extended with PCA-regularised quantities.

    Attributes
    ----------
    pc_basis: np.array
        Basis of principal components.
    n_comp: int
        Number of principal components kept after regularisation.
    reg_covmat_reps_mean: np.array
        Diagonal, regularised covariance matrix computed from replicas
        of theory predictions.
    sqrt_reg_covmat_reps_mean: np.array
        Sqrt of the regularised covariance matrix.
    std_covmat_reps: np.array
        Square root of diagonal entries of the original covariance matrix.
    """

    # principal-component basis (or the scalar 1 in the degenerate case)
    pc_basis: np.array
    # number of principal components retained
    n_comp: int
    # covariance (or correlation) matrix projected onto the PC basis
    reg_covmat_reps_mean: np.array
    # square root of the regularised matrix (diagonal by construction)
    sqrt_reg_covmat_reps_mean: np.array
    # per-datapoint standard deviations of the unregularised covmat
    std_covmat_reps: np.array
@check_multifit_replicas
def regularized_multiclosure_dataset_loader(
    multiclosure_dataset_loader: MulticlosureLoader,
    explained_variance_ratio=0.95,
    _internal_max_reps=None,
    _internal_min_reps=20,
) -> RegularizedMulticlosureLoader:
    """
    Similar to multiclosure.multiclosure_dataset_loader but computes the regularized
    PDF covariance matrix by only keeping the largest eigenvalues that sum to the
    `explained_variance_ratio`.

    Parameters
    ----------
    multiclosure_dataset_loader: MulticlosureLoader

    explained_variance_ratio: float, default is 0.95
        fraction of the total variance the kept principal components must explain

    _internal_max_reps: int, default is None
        Maximum number of replicas used in the fits
        this is needed to check that the number of replicas is the same for all fits

    _internal_min_reps: int, default is 20
        Minimum number of replicas used in the fits
        this is needed to check that the number of replicas is the same for all fits

    Returns
    -------
    RegularizedMulticlosureLoader
    """
    closures_th = multiclosure_dataset_loader.closure_theories
    law_th = multiclosure_dataset_loader.law_theory
    covmat_reps_mean = multiclosure_dataset_loader.covmat_reps_mean

    # A 0-d covmat means the dataset has a single point: nothing to
    # diagonalise, the "regularised" covmat is the scalar itself.
    if covmat_reps_mean.shape == ():
        return RegularizedMulticlosureLoader(
            closure_theories=closures_th,
            law_theory=law_th,
            covmat_reps_mean=covmat_reps_mean,
            pc_basis=1,
            n_comp=1,
            reg_covmat_reps_mean=covmat_reps_mean,
            sqrt_reg_covmat_reps_mean=np.sqrt(covmat_reps_mean),
            std_covmat_reps=np.sqrt(covmat_reps_mean),
        )

    # diagonalize the mean covariance matrix and only keep the principal
    # components that explain the required variance
    eigvals, eigvecs, eigvals_norm = eigendecomposition(covmat_reps_mean)

    # smallest number of leading components whose normalised eigenvalues
    # sum to at least explained_variance_ratio
    n_comp = 1
    for _ in range(eigvals.shape[0]):
        if np.sum(eigvals_norm[:n_comp]) >= explained_variance_ratio:
            break
        n_comp += 1

    # get the principal components
    pc_basis = eigvecs[:, :n_comp]
    # project the mean covmat into the space spanned by the PCs
    reg_covmat_reps_mean = pc_basis.T @ covmat_reps_mean @ pc_basis

    if n_comp == 1:
        return RegularizedMulticlosureLoader(
            closure_theories=closures_th,
            law_theory=law_th,
            covmat_reps_mean=covmat_reps_mean,
            pc_basis=pc_basis,
            n_comp=1,
            reg_covmat_reps_mean=reg_covmat_reps_mean,
            sqrt_reg_covmat_reps_mean=np.sqrt(reg_covmat_reps_mean),
            std_covmat_reps=np.sqrt(np.diag(covmat_reps_mean)),
        )

    # compute sqrt of pdf covariance matrix (NOTE: the matrix should be diagonal)
    sqrt_reg_covmat_reps_mean = np.diag(np.sqrt(np.diag(reg_covmat_reps_mean)))
    return RegularizedMulticlosureLoader(
        closure_theories=closures_th,
        law_theory=law_th,
        covmat_reps_mean=covmat_reps_mean,
        pc_basis=pc_basis,
        n_comp=n_comp,
        reg_covmat_reps_mean=reg_covmat_reps_mean,
        sqrt_reg_covmat_reps_mean=sqrt_reg_covmat_reps_mean,
        std_covmat_reps=np.sqrt(np.diag(covmat_reps_mean)),
    )
@check_multifit_replicas
def regularized_multiclosure_data_loader(
    multiclosure_data_loader: MulticlosureLoader,
    explained_variance_ratio=0.95,
    _internal_max_reps=None,
    _internal_min_reps=20,
):
    """
    Similar to multiclosure.regularized_multiclosure_dataset_loader except for all data.
    In this case we regularize the correlation matrix rather than the covariance matrix,
    the reason for this is that different experiments can have different units.

    Parameters
    ----------
    multiclosure_data_loader: MulticlosureLoader

    explained_variance_ratio: float, default is 0.95
        fraction of the total variance the kept principal components must explain

    _internal_max_reps: int, default is None
        Maximum number of replicas used in the fits
        this is needed to check that the number of replicas is the same for all fits

    _internal_min_reps: int, default is 20
        Minimum number of replicas used in the fits
        this is needed to check that the number of replicas is the same for all fits

    Returns
    -------
    RegularizedMulticlosureLoader
    """
    closures_th = multiclosure_data_loader.closure_theories
    law_th = multiclosure_data_loader.law_theory
    covmat_reps_mean = multiclosure_data_loader.covmat_reps_mean

    # BUGFIX: handle the 0-d (single data point) case before calling
    # np.diag, which raises ValueError on a 0-d array.
    if covmat_reps_mean.shape == ():
        return RegularizedMulticlosureLoader(
            closure_theories=closures_th,
            law_theory=law_th,
            covmat_reps_mean=covmat_reps_mean,
            pc_basis=1,
            n_comp=1,
            reg_covmat_reps_mean=covmat_reps_mean,
            sqrt_reg_covmat_reps_mean=np.sqrt(covmat_reps_mean),
            std_covmat_reps=np.sqrt(covmat_reps_mean),
        )

    # Keep the sqrt of the diagonals to reconstruct the covmat later
    D = np.sqrt(np.diag(covmat_reps_mean))
    # compute the correlation matrix
    _corrmat_mean = covmat_reps_mean / np.outer(D, D)

    # diagonalize the mean correlation matrix and only keep the principal
    # components that explain the required variance
    eigvals, eigvecs, eigvals_norm = eigendecomposition(_corrmat_mean)

    # smallest number of leading components whose normalised eigenvalues
    # sum to at least explained_variance_ratio
    n_comp = 1
    for _ in range(eigvals.shape[0]):
        if np.sum(eigvals_norm[:n_comp]) >= explained_variance_ratio:
            break
        n_comp += 1

    # get the principal components
    pc_basis = eigvecs[:, :n_comp]
    # compute the (PCA) regularized correlation matrix by projecting the mean
    # correlation matrix onto the principal components
    reg_corrmat_reps_mean = pc_basis.T @ _corrmat_mean @ pc_basis

    if n_comp == 1:
        return RegularizedMulticlosureLoader(
            closure_theories=closures_th,
            law_theory=law_th,
            covmat_reps_mean=covmat_reps_mean,
            pc_basis=pc_basis,
            n_comp=1,
            reg_covmat_reps_mean=reg_corrmat_reps_mean,
            sqrt_reg_covmat_reps_mean=np.sqrt(reg_corrmat_reps_mean),
            std_covmat_reps=D,
        )

    # compute sqrt of pdf correlation matrix (NOTE: the matrix should be diagonal)
    sqrt_reg_corrmat_reps_mean = np.diag(np.sqrt(np.diag(reg_corrmat_reps_mean)))
    return RegularizedMulticlosureLoader(
        closure_theories=closures_th,
        law_theory=law_th,
        covmat_reps_mean=covmat_reps_mean,
        pc_basis=pc_basis,
        n_comp=n_comp,
        reg_covmat_reps_mean=reg_corrmat_reps_mean,
        sqrt_reg_covmat_reps_mean=sqrt_reg_corrmat_reps_mean,
        std_covmat_reps=D,
    )
def compute_normalized_bias(
    regularized_multiclosure_loader: RegularizedMulticlosureLoader, corrmat: bool = False
) -> np.array:
    """
    Compute the normalized bias for a RegularizedMulticlosureLoader object.

    If corrmat is True, the bias is computed assuming that RegularizedMulticlosureLoader
    contains the correlation matrix, this is needed when computing the bias for the entire
    data.

    Parameters
    ----------
    regularized_multiclosure_loader: RegularizedMulticlosureLoader

    corrmat: bool, default is False
        if True, normalise the bias differences by the per-point standard
        deviations first (the regularised matrix is then a correlation matrix)

    Returns
    -------
    np.array
        Array of shape len(fits) containing the normalized bias for each fit.
    """
    closure_theories = regularized_multiclosure_loader.closure_theories
    law_theory = regularized_multiclosure_loader.law_theory
    n_comp = regularized_multiclosure_loader.n_comp
    pc_basis = regularized_multiclosure_loader.pc_basis
    sqrt_reg_covmat_reps_mean = regularized_multiclosure_loader.sqrt_reg_covmat_reps_mean
    std_covmat_reps = regularized_multiclosure_loader.std_covmat_reps

    # shape (n_fits, n_data, n_replicas)
    reps = np.asarray([th.error_members for th in closure_theories])

    # difference between each fit's central prediction and the underlying
    # law, shape (n_data, n_fits)
    delta_bias = reps.mean(axis=2).T - law_theory.central_value[:, np.newaxis]

    if n_comp == 1:
        # single component: projection reduces to an elementwise product
        delta_bias = pc_basis * delta_bias
        if corrmat:
            delta_bias /= std_covmat_reps
        biases = (delta_bias / sqrt_reg_covmat_reps_mean) ** 2
    else:
        if corrmat:
            # normalise each data point by its standard deviation before
            # projecting, consistent with regularising the correlation matrix
            delta_bias = pc_basis.T @ (delta_bias.T / std_covmat_reps).T
        else:
            delta_bias = pc_basis.T @ delta_bias
        biases = calc_chi2(sqrt_reg_covmat_reps_mean, delta_bias)

    # normalise by the number of retained components
    return biases / n_comp
def bias_dataset(regularized_multiclosure_dataset_loader):
    """
    Computes the normalized bias for a RegularizedMulticlosureLoader object
    for a single dataset.

    Parameters
    ----------
    regularized_multiclosure_dataset_loader : RegularizedMulticlosureLoader

    Returns
    -------
    tuple
        bias_fits: normalized bias for each fit
        n_comp: number of principal components kept
    """
    # per-dataset loaders regularise the covariance matrix, not the
    # correlation matrix, hence corrmat=False
    bias_fits = compute_normalized_bias(regularized_multiclosure_dataset_loader, corrmat=False)
    n_comp = regularized_multiclosure_dataset_loader.n_comp
    return bias_fits, n_comp
"""
Collects the bias data for all datasets.
"""
bias_datasets = collect("bias_dataset", ("data",))
def bias_data(regularized_multiclosure_data_loader):
    """
    Similar to `bias_dataset` but for all data.

    Parameters
    ----------
    regularized_multiclosure_data_loader : RegularizedMulticlosureLoader

    Returns
    -------
    tuple
        bias_fits: normalized bias for each fit
        n_comp: number of principal components kept
    """
    # the all-data loader regularises the correlation matrix, hence corrmat=True
    bias_fits = compute_normalized_bias(regularized_multiclosure_data_loader, corrmat=True)
    n_comp = regularized_multiclosure_data_loader.n_comp
    return bias_fits, n_comp
def normalized_delta_bias_data(
    regularized_multiclosure_data_loader: RegularizedMulticlosureLoader,
) -> tuple:
    """
    Compute for all data only the normalized delta after PCA regularization.

    Parameters
    ----------
    regularized_multiclosure_data_loader : RegularizedMulticlosureLoader
        Results of multiclosure fits after pca regularization

    Returns
    -------
    tuple
        deltas: flattened array of normalised bias differences
        n_comp: number of principal components kept
    """
    # NOTE: function computes delta assuming RegularizedMulticlosureLoader
    # contains the regularized / diagonal correlation matrix.
    pca_loader = regularized_multiclosure_data_loader

    closures_th = pca_loader.closure_theories
    law_th = pca_loader.law_theory
    reg_covmat_reps_mean = pca_loader.reg_covmat_reps_mean
    sqrt_reg_covmat_reps_mean = pca_loader.sqrt_reg_covmat_reps_mean
    pc_basis = pca_loader.pc_basis
    std_covmat_reps = pca_loader.std_covmat_reps
    n_comp = pca_loader.n_comp

    # shape (n_fits, n_data, n_replicas)
    reps = np.asarray([th.error_members for th in closures_th])

    # difference between each fit's central prediction and the underlying
    # law, shape (n_data, n_fits)
    delta_bias = reps.mean(axis=2).T - law_th.central_value[:, np.newaxis]

    if n_comp == 1:
        # For full data we regularize the correlation matrix
        delta_bias = pc_basis.T @ (delta_bias / std_covmat_reps)
        std_deviations = sqrt_reg_covmat_reps_mean
    else:
        # For full data we regularize the correlation matrix: normalise each
        # data point by its standard deviation before projecting onto the PCs
        delta_bias = pc_basis.T @ (delta_bias.T / std_covmat_reps).T
        # reg_covmat_reps_mean should be diagonal
        std_deviations = np.sqrt(np.diag(reg_covmat_reps_mean))[:, None]

    return (delta_bias / std_deviations).flatten(), n_comp
"""
TODO: the code below should be revised by GDC
"""
@check_multifit_replicas
def fits_normed_dataset_central_delta(
    multiclosure_dataset_loader, _internal_max_reps=None, _internal_min_reps=20
):
    """
    For each fit calculate the difference between central expectation value and true val.
    Normalize this value by the variance of the differences between replicas and central
    expectation value (different for each fit but expected to vary only a little). Each
    observable central exp value is expected to be gaussianly distributed around the true
    value set by the fakepdf.

    Parameters
    ----------
    multiclosure_dataset_loader: MulticlosureLoader
        closure fits theory predictions,
        underlying law theory predictions,
        covariance matrix,
        sqrt covariance matrix
    _internal_max_reps: int
        maximum number of replicas to use for each fit
    _internal_min_reps: int
        minimum number of replicas to use for each fit

    Returns
    -------
    deltas: np.array
        2-D array with shape (n_fits, n_obs)
    """
    closures_th = multiclosure_dataset_loader.closure_theories
    law_th = multiclosure_dataset_loader.law_theory

    # The dimensions here are (fit, data point, replica)
    reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th])
    # replica variance within each fit, averaged over fits
    avg_standard_deviation = np.sqrt(np.mean(np.var(reps, axis=2), axis=0))
    # central-value minus truth for each fit and data point
    delta = reps.mean(axis=2) - law_th.central_value[np.newaxis, :]
    return delta / avg_standard_deviation[np.newaxis, :]
def xq2_dataset_map(
    xq2map_with_cuts, multiclosure_dataset_loader, _internal_max_reps=None, _internal_min_reps=20
):
    """
    For a single dataset and a set of fits define a dictionary which contains for each
    datapoint of the dataset the following information:
    - x coordinate
    - Q**2 coordinate
    - value of Ratio bias-variance at that point for the given fits
    - value of xi at that point for the given fits

    Parameters
    ----------
    xq2map_with_cuts: validphys.kinematics.XQ2Map
        contains kinematic information of dataset's datapoints
    multiclosure_dataset_loader: MulticlosureLoader
        closure fits theory predictions,
        underlying law theory predictions,
        covariance matrix,
        sqrt covariance matrix
    _internal_max_reps: int
        maximum number of replicas to use for each fit
    _internal_min_reps: int
        minimum number of replicas to use for each fit

    Returns
    -------
    xq2map: dictionary
        dictionary containing:
        - x coordinate
        - Q**2 coordinate
        - Ratio bias-variance
        - xi
    """
    commondata = xq2map_with_cuts.commondata
    coords = xq2map_with_cuts[2]

    deltas = fits_normed_dataset_central_delta(multiclosure_dataset_loader)
    # rms over fits of the normalised bias: sqrt of bias/variance ratio
    R_b = np.sqrt(np.mean(deltas**2, axis=0))
    # fraction of fits whose normalised delta lies within one standard deviation
    xi = np.mean(np.abs(deltas) < 1, axis=0)

    # for the case of double-hadronic observables we have 2 (x,Q) for each experimental point
    if coords[0].shape[0] != R_b.shape[0]:
        R_b = np.concatenate((R_b, R_b))
        xi = np.concatenate((xi, xi))

    return {
        'x_coords': coords[0],
        'Q_coords': coords[1],
        'R_b': R_b,
        'xi': xi,
        'name': commondata.name,
        'process': commondata.process_type,
    }
# Collects the per-dataset xq2 map for every dataset in the data group.
xq2_data_map = collect("xq2_dataset_map", ("data",))