# Source code for validphys.overfit_metric

"""
overfit_metric.py

This module contains the functions used to calculate the overfit metric and
produce the corresponding tables and figures.
"""

import logging

import numpy as np
import packaging
import pandas as pd
import scipy.stats as stats

from reportengine import collect
from reportengine.figure import figure
from reportengine.table import table
from validphys import plotutils
from validphys.checks import check_at_least_two_replicas

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# reportengine ``collect`` of the "predictions" provider over the
# ("dataset_inputs",) specification. Per the docstring of
# ``calculate_chi2s_per_replica`` this resolves to a list of dataframes, one
# per dataset input, holding the predictions of the pdf replicas.
preds = collect("predictions", ("dataset_inputs",))


def _create_new_val_pseudodata(pdf_data_index, fit_data_indices_list):
    """Gather the pseudodata of every fit replica on a single validation mask.

    The ``pseudodata`` of each entry in ``fit_data_indices_list`` is selected
    with ``.loc`` using the ``val_idx`` of ``pdf_data_index``, so that all
    replicas are evaluated on the same set of points.

    Returns
    -------
    np.ndarray
        (nrep,ndata) sized numpy array: one row per entry of
        ``fit_data_indices_list`` and one column per selected validation
        point. Only the first column of each selection is kept.
    """
    validation_index = pdf_data_index.val_idx
    selections = [
        replica.pseudodata.loc[validation_index] for replica in fit_data_indices_list
    ]
    stacked = np.array(selections)
    return stacked[:, :, 0]


@check_at_least_two_replicas
def calculate_chi2s_per_replica(
    pdf,  # for the check
    fit_code_version,
    recreate_pdf_pseudodata_no_table,
    preds,
    dataset_inputs,
    groups_covmat_no_table,
):
    """Calculates, for each PDF replica, the chi2 of the validation with the
    pseudodata generated for all other replicas in the fit

    Parameters
    ----------
    pdf :
        Not used in the body; only consumed by the
        ``check_at_least_two_replicas`` decorator.
    fit_code_version :
        Object exposing ``.columns`` whose first column is looked up and then
        indexed with ``'nnpdf'`` to obtain the version string of the code used
        for the fit (presumably a pandas DataFrame with one column per fit --
        to be confirmed against the provider).
    recreate_pdf_pseudodata_no_table : list[namedtuple]
        List of namedtuples, each of which contains a dataframe
        containing all the data points, the training indices, and
        the validation indices.
    preds : list[pd.core.frame.DataFrame]
        List of pandas dataframes, each containing the predictions of the pdf
        replicas for a dataset_input
    dataset_inputs : list[DataSetInput]
        Dataset inputs, in the same order as ``preds``; only their ``name`` is
        used here.
    groups_covmat_no_table : pd.core.frame.DataFrame
        Covariance matrix; it is sliced on both axes with the validation index
        of each replica.

    Returns
    -------
    np.ndarray
        (Npdfs, Npdfs) sized matrix containing the chi2 of a pdf replica
        calculated to a given pseudodata replica. The diagonal values correspond
        to the cases where the PDF replica has been fitted to the corresponding
        pseudodata replica

    Raises
    ------
    ValueError
        If the fit was produced with an nnpdf version older than 4.0.5.
    """
    # Import the submodule explicitly: a bare ``import packaging`` does not
    # load ``packaging.version``, so attribute access on the package only works
    # when some other module happens to have imported it already.
    from packaging import version as packaging_version

    fit_name = fit_code_version.columns[0]
    nnpdf_version = packaging_version.parse(fit_code_version[fit_name]['nnpdf'])
    if nnpdf_version < packaging_version.parse("4.0.5"):
        raise ValueError(
            "The overfit metric can only be calculated with fits starting from version 4.0.5"
        )

    # Stack the predictions of all datasets into one frame, adding the dataset
    # name as an extra outer index level. Column 0 is dropped first (the
    # original local name suggests it is the central value).
    pp = []
    for i, dss in enumerate(dataset_inputs):
        preds_without_cv = preds[i].drop(0, axis=1)
        pp.append(pd.concat({dss.name: preds_without_cv}, names=["dataset"]))

    PDF_predictions = pd.concat(pp)

    chi2s_per_replica = []
    for enum, pdf_data_index in enumerate(recreate_pdf_pseudodata_no_table):
        val_idx = pdf_data_index.val_idx

        # Drop the outermost level of the validation index and rename the
        # remaining levels so that it can be used to select from the
        # predictions frame built above.
        prediction_filter = val_idx.droplevel(level=0)
        prediction_filter.rename(["dataset", "data"], inplace=True)

        # Predictions of the ``enum``-th replica column on its validation points.
        PDF_predictions_val = PDF_predictions.loc[prediction_filter].values[:, enum]

        # Pseudodata of every replica, evaluated on this replica's validation mask.
        new_val_pseudodata_list = _create_new_val_pseudodata(
            pdf_data_index, recreate_pdf_pseudodata_no_table
        )

        invcovmat_vl = np.linalg.inv(groups_covmat_no_table[val_idx].T[val_idx])

        # One row of residuals per pseudodata replica.
        tmp = PDF_predictions_val - new_val_pseudodata_list

        # chi2 per pseudodata replica, normalised by the number of validation points.
        chi2 = np.einsum("ij,jk,ik->i", tmp, invcovmat_vl, tmp) / tmp.shape[1]
        chi2s_per_replica.append(chi2)

    # Build the result once, after the loop; this also keeps the return value
    # defined when there are no replicas to iterate over.
    return np.array(chi2s_per_replica)


def array_expected_overfitting(
    calculate_chi2s_per_replica, replica_data, number_of_resamples=1000, resampling_fraction=0.95
):
    """Calculates the expected difference in chi2 between:
    1. The chi2 of a PDF replica calculated using the corresponding pseudodata
        replica used during the fit
    2. The chi2 of a PDF replica calculated using an alternative i.i.d random
        pseudodata replicas

    The expected difference along with an error estimate is obtained through a
    bootstrapping consisting of ``number_of_resamples`` resamples per pdf replica
    where each resampling contains a fraction ``resampling_fraction`` of all
    replicas. Replicas are drawn with replacement using the global
    ``np.random`` state.

    Parameters
    ----------
    calculate_chi2s_per_replica : np.ndarray
        validation chi2 per pdf replica
    replica_data : list(vp.fitdata.FitInfo)
        Only the ``validation`` attribute of each entry is read.
    number_of_resamples : int, optional
        number of resamples per pdf replica, by default 1000
    resampling_fraction : float, optional
        fraction of replicas used in the bootstrap resampling, by default 0.95

    Returns
    -------
    np.ndarray
        (number_of_resamples*Npdfs,) sized array containing the mean delta chi2
        values per resampled list. If the input chi2s are all NaN, a copy of
        the input is returned instead.
    """
    # An all-NaN input is treated as a marker that the overfitting metric can
    # no longer be determined (the original note attributes this to the
    # pseudodata generation having changed since the fit was performed), so it
    # is passed through unchanged. ``np.isnan`` also copes with a plain float
    # NaN, unlike the ``x != x`` comparison which yields a Python bool.
    if np.isnan(calculate_chi2s_per_replica).all():
        return np.array(calculate_chi2s_per_replica)

    fitted_val_erf = np.array([info.validation for info in replica_data])

    number_pdfs = calculate_chi2s_per_replica.shape[0]
    # Loop invariant: number of replicas drawn in each bootstrap sample.
    sample_size = int(resampling_fraction * number_pdfs)

    list_expected_overfitting = []
    for _ in range(number_pdfs * number_of_resamples):
        # Bootstrap sample of replica indices, drawn with replacement.
        mask = np.random.randint(0, number_pdfs, size=sample_size)

        # Restrict both the rows and the columns to the sampled replicas.
        res_tmp = calculate_chi2s_per_replica[mask][:, mask]

        expected_val_chi2 = res_tmp.mean(axis=0)
        delta_chi2 = fitted_val_erf[mask] - expected_val_chi2

        list_expected_overfitting.append(delta_chi2.mean())
    return np.array(list_expected_overfitting)


@figure
def plot_overfitting_histogram(fit, array_expected_overfitting):
    """Plots the bootstrap error and central value of the overfittedness in a
    histogram

    The histogram of ``array_expected_overfitting`` is drawn together with a
    solid vertical line at its mean, a dashed vertical line at zero and a
    normal distribution with the same mean and standard deviation. The title
    of the axes is ``fit.label``.

    If the array contains any NaN the figure is returned with empty axes.
    """
    fig, ax = plotutils.subplots()

    # if array_expected_overfitting is nan it should not produce a histogram
    if np.isnan(array_expected_overfitting).any():
        return fig

    mean = array_expected_overfitting.mean()
    std = array_expected_overfitting.std()

    ax.hist(array_expected_overfitting, bins=50, density=True)
    ax.axvline(x=mean, color="black")
    ax.axvline(x=0, color="black", linestyle="--")

    # Overlay the gaussian with matching mean and width across the sampled range.
    lower, upper = array_expected_overfitting.min(), array_expected_overfitting.max()
    xgrid = np.linspace(lower, upper, num=100)
    ax.plot(xgrid, stats.norm.pdf(xgrid, mean, std))

    ax.set_xlabel(r"$\mathcal{R}_O$")
    ax.set_ylabel("density")
    ax.set_title(f"{fit.label}")
    fig.tight_layout()
    return fig


# reportengine ``collect`` of the "fit_overfitting_summary" provider over the
# ("fits", "fitcontext") specification; the collected tables are joined into a
# single table by ``summarise_overfitting``.
fits_overfitting_summary = collect("fit_overfitting_summary", ("fits", "fitcontext"))


@table
def fit_overfitting_summary(fit, array_expected_overfitting):
    """Creates a table containing the overfitting information:
    - mean chi2 difference
    - bootstrap error
    - sigmas away from 0

    Parameters
    ----------
    fit :
        Only ``fit.label`` is used, as the name of the single column.
    array_expected_overfitting : np.ndarray
        Bootstrap samples of the overfitting metric.

    Returns
    -------
    pd.DataFrame
        One column named after ``fit.label`` with rows ``"mean"``,
        ``"bootstrap error"`` (the standard deviation of the samples) and
        ``"sigmas away from 0"`` (their ratio).
    """
    mean = array_expected_overfitting.mean()
    std = array_expected_overfitting.std()
    return pd.DataFrame(
        [mean, std, mean / std],
        columns=[fit.label],
        index=["mean", "bootstrap error", "sigmas away from 0"],
    )


@table
def summarise_overfitting(fits_overfitting_summary):
    """Same as `fit_overfitting_summary`, but collected over all `fits` in the
    runcard and put in a single table.

    The collected per-fit tables are concatenated side by side
    (``axis=1``), giving one column per fit.
    """
    return pd.concat(fits_overfitting_summary, axis=1)