"""
multiclosure_inconsistent_output
Module containing the actions which produce some output in validphys
reports i.e figures or tables for (inconsistent) multiclosure
estimators in the space of data
"""
import numpy as np
from reportengine.figure import figuregen, figure
from reportengine import collect
from validphys import plotutils
from validphys.closuretest.multiclosure import regularized_multiclosure_dataset_loader
"""
Collects bootstrapped_table_bias_datasets over multiple lambda values dataspecs.
"""
lambdavalues_bootstrapped_table_bias_datasets = collect(
"bootstrapped_table_bias_datasets", ("lambdavalues",)
)
"""
Collects bootstrapped_table_bias_data over multiple lambda values dataspecs.
"""
lambdavalues_bootstrapped_table_bias_data = collect(
"bootstrapped_table_bias_data", ("lambdavalues",)
)
@figuregen
def plot_lambdavalues_bias_values(
lambdavalues_bootstrapped_table_bias_datasets, lambdavalues, each_dataset
):
"""
Plot sqrt of bias and its bootstrap uncertainty as a function of lambda for each dataset.
Parameters
----------
lambdavalues_bootstrapped_table_bias_datasets: list
list of data frames computed as per table_bias_datasets.
lambdavalues: list
list specified in multiclosure_analysis.yaml
each_dataset: list
list of datasets
Yields
------
figure
"""
for ds in each_dataset:
fig, ax = plotutils.subplots()
for i, lambdavalue in enumerate(lambdavalues):
df = lambdavalues_bootstrapped_table_bias_datasets[i]
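            # keep only the row of the bootstrap table corresponding to the current dataset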
df = df[df.index == str(ds)]
ax.errorbar(
1 - lambdavalue["lambda_value"],
df["sqrt bias"].values[::-1],
yerr=df["err sqrt bias"].values,
color="blue",
fmt='o',
)
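        # horizontal reference line at R_b = 1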
ax.hlines(1, xmin=0, xmax=1.0, color="red", linestyle="--")
ax.set_ylabel(r"$R_{b}$")
ax.set_xlabel(r"$1 - \lambda$")
ax.set_title(f"{ds.commondata.metadata.plotting.dataset_label}")
yield fig
@figure
def plot_lambdavalues_bias_values_full_data(
lambdavalues_bootstrapped_table_bias_data, lambdavalues
):
"""
Plot sqrt of bias and its bootstrap uncertainty as a function of lambda for the full dataset.
Parameters
----------
lambdavalues_bootstrapped_table_bias_data: list
list of data frames computed as per table_bias_data.
lambdavalues: list
list specified in multiclosure_analysis.yaml
Returns
-------
figure
"""
fig, ax = plotutils.subplots()
for i, lambdavalue in enumerate(lambdavalues):
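        # one bootstrapped sqrt-bias table per lambda value, plotted against 1 - lambda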
df = lambdavalues_bootstrapped_table_bias_data[i]
ax.errorbar(
1 - lambdavalue["lambda_value"],
df["sqrt bias"].values[::-1],
yerr=df["err sqrt bias"].values,
color="blue",
fmt='o',
)
ax.hlines(1, xmin=0, xmax=1.0, color="red", linestyle="--")
ax.set_ylabel(r"$R_{b}$")
ax.set_xlabel(r"$1- \lambda$")
ax.set_title("Full data")
return fig
internal_multiclosure_data_collected_loader = collect("multiclosure_dataset_loader", ("data",))
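"""Collects multiclosure_dataset_loader for each dataset in data."""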
@figuregen
def plot_l2_condition_number(
each_dataset, internal_multiclosure_data_collected_loader, evr_min=0.90, evr_max=0.995, evr_n=20
):
"""
Plot the L2 condition number of the covariance matrix as a function of the explained variance ratio.
The plot gives an idea of the stability of the covariance matrix as a function of the
exaplained variance ratio and hence the number of principal components used to reduce the dimensionality.
The ideal explained variance ratio is chosen based on a threshold L2 condition number, in general this
threshold number (and the derived explained variance ratio) should be chosen so that
relative error in output (inverse covmat) <= relative error in input (covmat) * condition number
Note that in a closure test the relative error in the covariance matrix is very small and only numerical.
Parameters
----------
each_dataset : list
List of datasets
multiclosure_data_loader: list
list of multiclosure_dataset_loader objects
Yields
------
fig
Figure object
"""
# Explained variance ratio range
evr_range = np.linspace(evr_min, evr_max, evr_n)
for multiclosure_dataset_loader, ds in zip(
internal_multiclosure_data_collected_loader, each_dataset
):
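        # scan the EVR grid, recording the condition number and the number of retained principal components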
l2_cond = []
dof = []
for evr in evr_range:
pca_loader = regularized_multiclosure_dataset_loader(
multiclosure_dataset_loader, explained_variance_ratio=evr
)
# if the number of principal components is 1 then the covariance matrix is a scalar
# and the condition number is not computed
if pca_loader.n_comp == 1:
l2_cond.append(np.nan)
else:
l2_cond.append(np.linalg.cond(pca_loader.reg_covmat_reps_mean))
dof.append(pca_loader.n_comp)
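        # condition number on the left y-axis, degrees of freedom (retained principal components) on the right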
fig, ax1 = plotutils.subplots()
ax1.plot(evr_range, l2_cond, label="L2 Condition Number")
ax1.set_title(f"Dataset: {str(ds)}")
ax1.set_xlabel("Explained Variance Ratio")
ax1.set_ylabel("Covariance Matrix Condition Number")
ax1.tick_params('y')
# Draw horizontal line for threshold L2 condition number
ax1.axhline(y=100, color='g', linestyle='--', label="Threshold L2 condition number")
ax2 = ax1.twinx()
# Plot the second dataset on the right y-axis
ax2.plot(evr_range, dof, color="r", label="Degrees of freedom")
ax2.set_ylabel('Degrees of freedom')
ax2.tick_params('y')
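        # merge the legend entries of both y-axes into a single legend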
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
lines = lines1 + lines2
labels = labels1 + labels2
ax1.legend(lines, labels, loc='upper left')
yield fig