Source code for validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent_output

"""
multiclosure_inconsistent_output

Module containing the actions which produce some output in validphys
reports i.e figures or tables for (inconsistent) multiclosure 
estimators in the space of data

"""

import pandas as pd
import numpy as np

from reportengine.figure import figuregen, figure
from reportengine.table import table
from reportengine import collect
from scipy.stats import norm

from validphys import plotutils
from validphys.closuretest.multiclosure import internal_multiclosure_dataset_loader_pca


[docs] @table def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset): """ Compute the bias, variance, ratio and sqrt(ratio) for each dataset and return a DataFrame with the results. Parameters ---------- principal_components_bias_variance_datasets: list List of tuples containing the values of bias, variance and number of degrees of freedom each_dataset: list List of validphys.core.DataSetSpec Returns ------- pd.DataFrame DataFrame containing the bias, variance, ratio and sqrt(ratio) for each dataset """ records = [] for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset): biases, variances, n_comp = pc_bias_var_dataset bias = np.mean(biases) variance = np.mean(variances) rbv = bias / variance # use gaussian uncertainty propagation delta_rbv = np.sqrt( ((1 / variance) * np.std(biases)) ** 2 + (bias / variance**2 * np.std(variances)) ** 2 ) sqrt_rbv = np.sqrt(bias / variance) delta_sqrt_rbv = 0.5 * delta_rbv / np.sqrt(rbv) records.append( { "dataset": str(ds), "dof": n_comp, "bias": bias, "variance": variance, "ratio": rbv, "error ratio": delta_rbv, "sqrt(ratio)": sqrt_rbv, "error sqrt(ratio)": delta_sqrt_rbv, } ) df = pd.DataFrame.from_records( records, index="dataset", columns=( "dataset", "dof", "bias", "variance", "ratio", "error ratio", "sqrt(ratio)", "error sqrt(ratio)", ), ) return df
[docs] @table def table_bias_variance_data(principal_components_bias_variance_data): """ Same as table_bias_variance_datasets but for all the data, meaning that the correlations between the datasets are taken into account. Parameters ---------- principal_components_bias_variance_data: list Same of principal_components_bias_variance_datasets but for all the data each_dataset: list List of validphys.core.DataSetSpec Returns ------- pd.DataFrame DataFrame containing the bias, variance, ratio and sqrt(ratio) for each dataset """ records = [] # First let's do the total biases_tot, variances_tot, n_comp_tot = principal_components_bias_variance_data bias_tot = np.mean(biases_tot) variance_tot = np.mean(variances_tot) rbv_tot = bias_tot / variance_tot # use gaussian uncertainty propagation delta_rbv_tot = np.sqrt( ((1 / variance_tot) * np.std(biases_tot)) ** 2 + (bias_tot / variance_tot**2 * np.std(variances_tot)) ** 2 ) sqrt_rbv_tot = np.sqrt(rbv_tot) delta_sqrt_rbv_tot = 0.5 * delta_rbv_tot / np.sqrt(rbv_tot) records.append( { "dataset": "Total", "dof": n_comp_tot, "bias": bias_tot, "variance": variance_tot, "ratio": rbv_tot, "error ratio": delta_rbv_tot, "sqrt(ratio)": sqrt_rbv_tot, "error sqrt(ratio)": delta_sqrt_rbv_tot, } ) df = pd.DataFrame.from_records( records, index="dataset", columns=( "dataset", "dof", "bias", "variance", "ratio", "error ratio", "sqrt(ratio)", "error sqrt(ratio)", ), ) return df
lambdavalues_table_bias_variance_datasets = collect( "table_bias_variance_datasets", ("lambdavalues",) )
[docs] @table def bootstrapped_table_bias_variance_datasets( bootstrapped_principal_components_bias_variance_datasets, ): """ Compute the bias, variance, ratio and sqrt(ratio) for each dataset and return a DataFrame with the results. Uncertainty on ratio and sqrt ratio is computed by Gaussian error propagation of the bootstrap uncertainty on bias and variance. """ records = [] for boot_ds in bootstrapped_principal_components_bias_variance_datasets: df = boot_ds mean_bias = df["bias"].mean() mean_variance = df["variance"].mean() mean_ratio = mean_bias / mean_variance # gaussian error propagation for the ratio of the means uncertainty # only consider bias as source of uncertainty for the ratio (variance is almost constant) bootstrap_unc_ratio = np.std(df["bias"] / df["variance"]) sqrt_ratio = np.mean(np.sqrt(df["bias"] / df["variance"])) # gaussian error propagation for the sqrt of the ratio bootstrap_unc_sqrt_ratio = np.std(np.sqrt(df["bias"] / df["variance"])) records.append( { "dataset": df["dataset"].iloc[0], "mean_dof": df.n_comp.mean(), "bias": mean_bias, "variance": mean_variance, "ratio": mean_ratio, "error ratio": bootstrap_unc_ratio, "sqrt(ratio)": sqrt_ratio, "error sqrt(ratio)": bootstrap_unc_sqrt_ratio, } ) df = pd.DataFrame.from_records( records, index="dataset", columns=( "dataset", "mean_dof", "bias", "variance", "ratio", "error ratio", "sqrt(ratio)", "error sqrt(ratio)", ), ) return df
lambdavalues_bootstrapped_table_bias_variance_datasets = collect( "bootstrapped_table_bias_variance_datasets", ("lambdavalues",) )
[docs] @table def bootstrapped_table_bias_variance_data(bootstrapped_principal_components_bias_variance_data): """ Compute the bias, variance, ratio and sqrt(ratio) for a Datagroup and return a DataFrame with the results. Uncertainty on ratio and sqrt ratio is computed by Gaussian error propagation of the bootstrap uncertainty on bias and variance. """ records = [] df = bootstrapped_principal_components_bias_variance_data mean_bias = df["bias"].mean() mean_variance = df["variance"].mean() mean_ratio = mean_bias / mean_variance # gaussian error propagation for the ratio of the means uncertainty # only consider bias as source of uncertainty for the ratio (variance is almost constant) bootstrap_unc_ratio = np.std(df["bias"] / df["variance"]) sqrt_ratio = np.mean(np.sqrt(df["bias"] / df["variance"])) # gaussian error propagation for the sqrt of the ratio bootstrap_unc_sqrt_ratio = np.std(np.sqrt(df["bias"] / df["variance"])) records.append( { "dataset": df["dataset"].iloc[0], "mean_dof": df.n_comp.mean(), "bias": mean_bias, "variance": mean_variance, "ratio": mean_ratio, "error ratio": bootstrap_unc_ratio, "sqrt(ratio)": sqrt_ratio, "error sqrt(ratio)": bootstrap_unc_sqrt_ratio, } ) df = pd.DataFrame.from_records( records, index="dataset", columns=( "dataset", "mean_dof", "bias", "variance", "ratio", "error ratio", "sqrt(ratio)", "error sqrt(ratio)", ), ) return df
lambdavalues_bootstrapped_table_bias_variance_data = collect( "bootstrapped_table_bias_variance_data", ("lambdavalues",) )
[docs] @figuregen def plot_lambdavalues_bias_variance_values( lambdavalues_bootstrapped_table_bias_variance_datasets, lambdavalues, each_dataset ): """ Plot sqrt of ratio bias variance as a function of lambda for each dataset. Parameters ---------- lambdavalues_table_bias_variance_datasets: list list of data frames computed as per table_bias_variance_datasets. lambdavalues: list list specified in multiclosure_analysis.yaml each_dataset: list list of datasets Yields ------ figure """ for ds in each_dataset: fig, ax = plotutils.subplots() for i, lambdavalue in enumerate(lambdavalues): df = lambdavalues_bootstrapped_table_bias_variance_datasets[i] df = df[df.index == str(ds)] ax.errorbar( lambdavalue["lambda_value"], df["sqrt(ratio)"].values, yerr=df["error sqrt(ratio)"].values, color="blue", fmt='o', ) ax.hlines(1, xmin=0, xmax=1.0, color="red", linestyle="--") ax.set_ylabel(r"$R_{bv}$") ax.set_xlabel(r"$\lambda$") ax.set_title(f"{ds.commondata.metadata.plotting.dataset_label}") yield fig
[docs] @figure def plot_lambdavalues_bias_variance_values_full_data( lambdavalues_bootstrapped_table_bias_variance_data, lambdavalues ): """ Plot sqrt of ratio bias variance as a function of lambda for each dataset. Parameters ---------- lambdavalues_bootstrapped_table_bias_variance_data: list list of data frames computed as per table_bias_variance_data. lambdavalues: list list specified in multiclosure_analysis.yaml Returns ------- figure """ fig, ax = plotutils.subplots() for i, lambdavalue in enumerate(lambdavalues): df = lambdavalues_bootstrapped_table_bias_variance_data[i] ax.errorbar( lambdavalue["lambda_value"], df["sqrt(ratio)"].values, yerr=df["error sqrt(ratio)"].values, color="blue", fmt='o', ) ax.hlines(1, xmin=0, xmax=1.0, color="red", linestyle="--") ax.set_ylabel(r"$R_{bv}$") ax.set_xlabel(r"$\lambda$") ax.set_title("Full data") return fig
internal_multiclosure_data_collected_loader = collect( "internal_multiclosure_dataset_loader", ("data",) )
[docs] @figuregen def plot_l2_condition_number( each_dataset, internal_multiclosure_data_collected_loader, evr_min=0.90, evr_max=0.995, evr_n=20 ): """ Plot the L2 condition number of the covariance matrix as a function of the explained variance ratio. The plot gives an idea of the stability of the covariance matrix as a function of the exaplained variance ratio and hence the number of principal components used to reduce the dimensionality. The ideal explained variance ratio is chosen based on a threshold L2 condition number, in general this threshold number (and the derived explained variance ratio) should be chosen so that relative error in output (inverse covmat) <= relative error in input (covmat) * condition number Note that in a closure test the relative error in the covariance matrix is very small and only numerical. Parameters ---------- each_dataset : list List of datasets internal_multiclosure_data_loader: list list of internal_multiclosure_dataset_loader objects Yields ------ fig Figure object """ # Explained variance ratio range evr_range = np.linspace(evr_min, evr_max, evr_n) for internal_multiclosure_dataset_loader, ds in zip( internal_multiclosure_data_collected_loader, each_dataset ): l2_cond = [] dof = [] for evr in evr_range: pca_loader = internal_multiclosure_dataset_loader_pca( internal_multiclosure_dataset_loader, explained_variance_ratio=evr ) # if the number of principal components is 1 then the covariance matrix is a scalar # and the condition number is not computed if pca_loader.n_comp == 1: l2_cond.append(np.nan) else: l2_cond.append(np.linalg.cond(pca_loader.covmat_pca)) dof.append(pca_loader.n_comp) fig, ax1 = plotutils.subplots() ax1.plot(evr_range, l2_cond, label="L2 Condition Number") ax1.set_title(f"Dataset: {str(ds)}") ax1.set_xlabel("Explained Variance Ratio") ax1.set_ylabel("Covariance Matrix Condition Number") ax1.tick_params('y') # Draw horizontal line for threshold L2 condition number ax1.axhline(y=100, color='g', linestyle='--', label="Threshold L2 condition number") ax2 = ax1.twinx() # Plot the second dataset on the right y-axis ax2.plot(evr_range, dof, color="r", label="Degrees of freedom") ax2.set_ylabel('Degrees of freedom') ax2.tick_params('y') lines1, labels1 = ax1.get_legend_handles_labels() lines2, labels2 = ax2.get_legend_handles_labels() lines = lines1 + lines2 labels = labels1 + labels2 ax1.legend(lines, labels, loc='upper left') yield fig
[docs] @figure def delta_histogram(principal_components_normalized_delta_data, title, label_hist=None): """ Plot histogram of normalized delta regularized with PCA. Parameters ---------- principal_components_normalized_delta_data: tuple title: str description of multiclosure label_hist: str summary description of multiclosure Returns ------- fig Figure object """ fig, ax = plotutils.subplots() size = np.shape(principal_components_normalized_delta_data[0])[0] ax.hist( principal_components_normalized_delta_data[0], density=True, bins=int(np.sqrt(size)), label=label_hist, ) ax.set_title( r"$\delta$ distribution, " + title + f", dof={principal_components_normalized_delta_data[1]}" ) x = np.linspace(-3, 3, 100) y = norm.pdf(x, loc=0, scale=1) ax.plot(x, y, label="Standard gaussian") ax.legend() return fig
[docs] @table def table_xi_indicator_function_data(bootstrapped_indicator_function_data): """ Computes the bootstrap average and std of the indicator function for the data. Parameters ---------- bootstrapped_indicator_function_data: tuple Returns ------- pd.DataFrame DataFrame containing the average and std of the indicator function for the data. """ indicator_list, mean_dof = bootstrapped_indicator_function_data # average over data and fits within the bootstrap samples mean_boot_vals = np.array([np.mean(boot_val) for boot_val in indicator_list]) # take bootstrap expectation and variance mean_xi = np.mean(mean_boot_vals) std_xi = np.std(mean_boot_vals) records = [dict(data="full data", mean_dof=mean_dof, mean_xi=mean_xi, std_xi=std_xi)] df = pd.DataFrame.from_records( records, index="data", columns=("data", "mean_dof", "mean_xi", "std_xi") ) return df