Source code for validphys.closuretest.multiclosure_bootstrap

"""
Module for bootstrapping multiclosure fits.

"""

import numpy as np
import pandas as pd

from reportengine import collect

import copy
from validphys.closuretest.multiclosure import (
    MulticlosureLoader,
    mean_covmat_multiclosure,
    bias_dataset,
    bias_data,
    regularized_multiclosure_dataset_loader,
    regularized_multiclosure_data_loader,
    normalized_delta_bias_data,
)


[docs]class BootstrappedTheoryResult: """ Proxy class which mimics results.ThPredictionsResult so that pre-existing bias/variance actions can be used with bootstrapped replicas """ def __init__(self, data): self.error_members = data self.central_value = data.mean(axis=1) self.rawdata = np.concatenate([self.central_value.reshape(-1, 1), data], axis=-1)
def _bootstrap_multiclosure_fits( multiclosure_loader: MulticlosureLoader, rng: np.random.RandomState, n_fit_max: int, n_fit: int, n_rep_max: int, n_rep: int, use_repeats: bool, ): """ Perform a single bootstrap resample of the multiclosure fits and return a proxy of the base internal object used by relevant estimator actions with the fits and replicas resampled. If use_repeats is False then each fit and replica can only be chosen once and there are no repeated samples of either fit or replicas within each fit. The various n_fit* and n_rep* choices are for finite size effect studies. If you want to perform a simple bootstrap then simply set n_fit and n_fit_max to the number of closure fits (len(fits)) and n_rep and n_rep_max to the number of replicas in each of the closure tests. Returns ------- resampled_multiclosure_loader: MulticlosureLoader with bootstrapped closure theories. """ closure_th = multiclosure_loader.closure_theories fit_boot_index = rng.choice(n_fit_max, size=n_fit, replace=use_repeats) fit_boot_th = [closure_th[i] for i in fit_boot_index] boot_ths = [] # construct proxy fits theory predictions for fit_th in fit_boot_th: rep_boot_index = rng.choice(n_rep_max, size=n_rep, replace=use_repeats) boot_ths.append(BootstrappedTheoryResult(fit_th.error_members[:, rep_boot_index])) resampled_multiclosure_loader = copy.deepcopy(multiclosure_loader) # replace closure_theories with bootstrapped theories resampled_multiclosure_loader.closure_theories = boot_ths # Replace the original replica covmat with the new bootstrapped covmat covmat_reps_mean = mean_covmat_multiclosure(boot_ths) resampled_multiclosure_loader.covmat_reps_mean = covmat_reps_mean return resampled_multiclosure_loader
[docs]def bootstrapped_multiclosure_dataset_loader( multiclosure_dataset_loader: MulticlosureLoader, n_fit_max: int, n_fit: int, n_rep_max: int, n_rep: int, n_boot_multiclosure: int, use_repeats: bool = True, ): """ Returns a tuple of MulticlosureLoader objects each of which is a bootstrap resample of the original dataset. Parameters ---------- multiclosure_dataset_loader: MulticlosureLoader n_fit_max: int maximum number of fits, should be smaller or equal to number of multiclosure fits n_fit: int number of fits to draw for each resample n_rep_max: int maximum number of replicas, should be smaller or equal to number of replicas in each fit n_rep: int number of replicas to draw for each resample n_boot_multiclosure: int number of bootstrap resamples to perform rng_seed_mct_boot: int seed for random number generator use_repeats: bool, default is True whether to allow repeated fits and replicas in each resample Returns ------- resampled_multiclosure: tuple of shape (n_boot_multiclosure,) tuple of MulticlosureLoader objects each of which is a bootstrap resample of the original dataset """ return tuple( [ _bootstrap_multiclosure_fits( multiclosure_dataset_loader, rng=np.random.RandomState(seed=i), n_fit_max=n_fit_max, n_fit=n_fit, n_rep_max=n_rep_max, n_rep=n_rep, use_repeats=use_repeats, ) for i in range(n_boot_multiclosure) ] )
[docs]def bootstrapped_multiclosure_data_loader( multiclosure_data_loader: MulticlosureLoader, n_fit_max: int, n_fit: int, n_rep_max: int, n_rep: int, n_boot_multiclosure: int, use_repeats: bool = True, ): """Like `bootstrapped_multiclosure_dataset_loader` except for all data.""" return bootstrapped_multiclosure_dataset_loader( multiclosure_data_loader, n_fit_max, n_fit, n_rep_max, n_rep, n_boot_multiclosure, use_repeats, )
[docs]def bootstrapped_regularized_multiclosure_dataset_loader( multiclosure_dataset_loader: MulticlosureLoader, n_fit_max: int, n_fit: int, n_rep_max: int, n_rep: int, n_boot_multiclosure: int, use_repeats: bool = True, explained_variance_ratio: float = 0.95, _internal_max_reps=None, _internal_min_reps=20, ) -> tuple: """ Similar to multiclosure.bootstrapped_multiclosure_dataset_loader but returns PCA regularised covariance matrix, where the covariance matrix has been computed from the replicas of the theory predictions. Returns a tuple of RegularizedMulticlosureLoader objects. """ # get bootstrapped multiclosure dataset loader bootstrap_mdl = bootstrapped_multiclosure_dataset_loader( multiclosure_dataset_loader, n_fit_max=n_fit_max, n_fit=n_fit, n_rep_max=n_rep_max, n_rep=n_rep, n_boot_multiclosure=n_boot_multiclosure, use_repeats=use_repeats, ) # PCA regularise all the bootstrapped internal multiclosure dataset loaders bootstrap_mdl_pca = [ regularized_multiclosure_dataset_loader( mdl, explained_variance_ratio, _internal_max_reps, _internal_min_reps ) for mdl in bootstrap_mdl ] return tuple(bootstrap_mdl_pca)
[docs]def bootstrapped_regularized_multiclosure_data_loader( multiclosure_data_loader: MulticlosureLoader, n_fit_max: int, n_fit: int, n_rep_max: int, n_rep: int, n_boot_multiclosure: int, use_repeats: bool = True, explained_variance_ratio: float = 0.95, _internal_max_reps=None, _internal_min_reps=20, ) -> tuple: """ Same as `bootstrapped_regularized_multiclosure_dataset_loader` but for all the data. """ # get bootstrapped internal multiclosure dataset loader bootstrap_mdl = bootstrapped_multiclosure_data_loader( multiclosure_data_loader, n_fit_max=n_fit_max, n_fit=n_fit, n_rep_max=n_rep_max, n_rep=n_rep, n_boot_multiclosure=n_boot_multiclosure, use_repeats=use_repeats, ) # PCA regularise all the bootstrapped internal multiclosure dataset loaders bootstrap_mdl_pca = [ regularized_multiclosure_data_loader( mdl, explained_variance_ratio, _internal_max_reps, _internal_min_reps ) for mdl in bootstrap_mdl ] return tuple(bootstrap_mdl_pca)
[docs]def bootstrapped_bias_dataset(bootstrapped_regularized_multiclosure_dataset_loader, dataset): """ Computes Bias for each bootstrap sample. Returns a DataFrame with the results. """ boot_bias_var_samples = [] for i, boot_mdl_pca in enumerate(bootstrapped_regularized_multiclosure_dataset_loader): bias, n_comp = bias_dataset(boot_mdl_pca) boot_bias_var_samples.append( {"bias": np.mean(bias), "n_comp": n_comp, "dataset": str(dataset), "bootstrap_index": i} ) df = pd.DataFrame.from_records( boot_bias_var_samples, index="bootstrap_index", columns=("bootstrap_index", "dataset", "n_comp", "bias"), ) df.columns = ["dataset", "n_comp", "bias"] return df
""" Collect `bootstrapped_bias_dataset` over all datasets. """ bootstrapped_bias_datasets = collect("bootstrapped_bias_dataset", ("data",))
[docs]def bootstrapped_bias_data(bootstrapped_regularized_multiclosure_data_loader): """ Computes Bias and Variance for each bootstrap sample. Returns a DataFrame with the results. """ boot_bias_var_samples = [] for i, boot_mdl_pca in enumerate(bootstrapped_regularized_multiclosure_data_loader): bias, n_comp = bias_data(boot_mdl_pca) boot_bias_var_samples.append( { "bias": np.mean(bias), "err bias": np.std(bias), "n_comp": n_comp, "data": "Full dataset", "bootstrap_index": i, } ) df = pd.DataFrame.from_records( boot_bias_var_samples, index="bootstrap_index", columns=("bootstrap_index", "dataset", "n_comp", "bias", "err bias"), ) df.columns = ["dataset", "n_comp", "bias", "err bias"] return df
[docs]def bootstrapped_normalized_delta_bias_data(bootstrapped_regularized_multiclosure_data_loader): """ Compute the normalized deltas for each bootstrap sample. Note: delta is the bias in the diagonal basis. Parameters ---------- bootstrapped_regularized_multiclosure_data_loader: list list of `RegularizedMulticlosureLoader` objects. Returns ------- list """ normalised_deltas = [] for boot_mdl_pca in bootstrapped_regularized_multiclosure_data_loader: normalised_deltas.append(normalized_delta_bias_data(boot_mdl_pca)) return normalised_deltas
[docs]def standard_indicator_function(standard_variable, nsigma=1): """ Calculate the indicator function for a standardised variable. Parameters ---------- standard_variable: np.array array of variables that have been standardised: (x - mu)/sigma nsigma: float number of standard deviations to consider Returns ------- np.array array of ones and zeros. If 1 then the variable is within nsigma standard deviations from the mean, otherwise it is 0. """ return np.array(abs(standard_variable) < nsigma, dtype=int)
[docs]def bootstrapped_indicator_function_data(bootstrapped_normalized_delta_bias_data, nsigma=1): """ Compute the indicator function for each bootstrap sample. Parameters ---------- bootstrapped_normalized_delta_bias_data: list list containing the normalized deltas and the number of principal components. nsigma: int, default is 1 Returns ------- 2-D tuple: list list of length N_boot and entrances are arrays of dim Npca x Nfits containing the indicator function for each bootstrap sample. float average number of degrees of freedom """ indicator_list = [] ndof_list = [] for boot, ndof in bootstrapped_normalized_delta_bias_data: indicator_list.append(standard_indicator_function(boot, nsigma)) ndof_list.append(ndof) return indicator_list, np.mean(np.asarray(ndof_list))