Source code for validphys.deltachi2

"""
deltachi2.py

Plots and data processing that can be used in a delta chi2 analysis
"""

import logging
import warnings

import numpy as np
import scipy as sp

from reportengine.checks import CheckError, make_argcheck
from reportengine.figure import figure, figuregen
from validphys import plotutils
from validphys.checks import check_pdf_normalize_to, check_pdfs_noband, check_scale
from validphys.core import PDF
from validphys.pdfplots import BandPDFPlotter, PDFPlotter

log = logging.getLogger(__name__)


[docs]@make_argcheck
def check_pdf_is_symmhessian(pdf, **kwargs):
    """Check ``pdf`` has error type of ``symmhessian``"""
    etype = pdf.error_type
    if etype != "symmhessian":
        raise CheckError(f"Error: type of PDF {pdf} must be 'symmhessian' and not {etype}")


[docs]@check_pdf_is_symmhessian
def delta_chi2_hessian(pdf, total_chi2_data):
    """
    Return delta_chi2 (computed as in plot_delta_chi2_hessian) relative to
    each eigenvector of the Hessian set.
    """
    delta_chi2 = (
        np.ravel(total_chi2_data.replica_result.error_members()) - total_chi2_data.central_result
    )
    return delta_chi2


[docs]@figure
def plot_kullback_leibler(delta_chi2_hessian):
    """
    Determines the Kullback–Leibler divergence by comparing the expectation value of Delta chi2 to
    the cumulative distribution function of chi-square distribution with one degree of freedom
    (see: https://en.wikipedia.org/wiki/Chi-square_distribution).

    The Kullback-Leibler divergence provides a measure of the difference between two distribution
    functions, here we compare the chi-squared distribution and the cumulative distribution of the
    expectation value of Delta chi2.
    """
    delta_chi2 = delta_chi2_hessian

    bins_nnpdf = np.arange(0, 7.5, 0.1)
    # find the midpoint of each bin, has length len(bins_nnpdf) - 1
    bin_central_nnpdf = (bins_nnpdf[1:] + bins_nnpdf[:-1]) / 2
    x = np.linspace(0, 7.4, 200)

    fig, ax = plotutils.subplots()

    vals_nnpdf, _, _ = ax.hist(
        delta_chi2,
        bins=bins_nnpdf,
        density=True,
        cumulative=True,
        label=r"cumulative $\Delta\chi^2$",
    )
    # compute Kullback-Leibler (null values set to 1e-8)
    vals_nnpdf[vals_nnpdf == 0] = 1e-8
    kl_nnpdf = sp.stats.entropy(sp.stats.chi2.cdf(bin_central_nnpdf, 1), qk=vals_nnpdf)

    ax.plot(x, sp.stats.chi2.cdf(x, 1), label=r"$\chi^2$ CDF")

    ax.set_title(f"KL divergence: {kl_nnpdf:.4f}")
    ax.set_xlabel(r"$<\Delta\chi^2>$")

    ax.legend()

    return fig


[docs]@figure
def plot_delta_chi2_hessian_eigenv(delta_chi2_hessian, pdf):
    """
    Plot of the chi2 difference between chi2 of each eigenvector of a symmHessian set
    and the central value for all experiments in a fit.
    As a function of every eigenvector in a first plot, and as a distribution in a second plot.
    """
    delta_chi2 = delta_chi2_hessian

    x = np.arange(1, len(delta_chi2) + 1)

    fig, ax = plotutils.subplots()

    ax.bar(x, delta_chi2, label=pdf.label)
    ax.set_xlabel("# Hessian PDF")
    ax.set_ylabel(r"$\Delta\chi^2$")
    ax.set_title(r"$\Delta\chi^2$ each eigenvector")

    ax.legend()

    return fig


[docs]@figure
def plot_delta_chi2_hessian_distribution(delta_chi2_hessian, pdf, total_chi2_data):
    """
    Plot of the chi2 difference between chi2 of each eigenvector of a symmHessian set
    and the central value for all experiments in a fit.
    As a function of every eigenvector in a first plot, and as a distribution in a second plot.
    """
    delta_chi2 = delta_chi2_hessian

    fig, ax = plotutils.subplots()

    bins = np.arange(np.floor(min(delta_chi2)), np.ceil(max(delta_chi2)) + 1)

    ax.hist(
        delta_chi2,
        bins=bins,
        label=fr"{pdf.label} - $\chi^2_0$={total_chi2_data.central_result:.0f}",
    )
    ax.set_xlabel(r"$\Delta\chi^2$")
    ax.set_title(r"$\Delta\chi^2$ distribution")

    return fig


[docs]def pos_neg_xplotting_grids(delta_chi2_hessian, xplotting_grid):
    """
    Generates xplotting_grids correspodning to positive and negative delta chi2s.
    """
    positive_eigenvalue_mask = delta_chi2_hessian >= 0

    # The masks do not include replica 0, add it in both grids
    pos_mask = np.append(True, positive_eigenvalue_mask)
    neg_mask = np.append(True, ~positive_eigenvalue_mask)

    pos_grid = xplotting_grid.grid_values.data[pos_mask]
    neg_grid = xplotting_grid.grid_values.data[neg_mask]

    # Wrap everything back into the original stats class
    stats_class = xplotting_grid.grid_values.__class__
    pos_xplotting_grid = xplotting_grid.copy_grid(stats_class(pos_grid))
    neg_xplotting_grid = xplotting_grid.copy_grid(stats_class(neg_grid))

    return [xplotting_grid, pos_xplotting_grid, neg_xplotting_grid]


[docs]@figuregen
@check_pdf_normalize_to
@check_pdfs_noband
@check_scale("xscale", allow_none=True)
def plot_pos_neg_pdfs(
    pdf,
    pos_neg_xplotting_grids,
    xscale: (str, type(None)) = None,
    normalize_to: (int, str, type(None)) = None,
    ymin=None,
    ymax=None,
    pdfs_noband: (list, type(None)) = None,
):
    """
    Plot the the uncertainty of the original hessian pdfs, as well as that of the positive and
    negative subset.
    """

    original_pdf = pdf.name
    # create fake PDF objects so we can reuse BandPDFPlotter
    pos_pdf = PDF(original_pdf)
    pos_pdf.label = f"{original_pdf}_pos"
    neg_pdf = PDF(original_pdf)
    neg_pdf.label = f"{original_pdf}_neg"

    pdfs = [pdf, pos_pdf, neg_pdf]

    yield from BandPDFPlotter(
        pdfs, pos_neg_xplotting_grids, xscale, normalize_to, ymin, ymax, pdfs_noband=pdfs_noband
    )


[docs]class PDFEpsilonPlotter(PDFPlotter):
    """Subclassing PDFPlotter in order to plot epsilon (measure of gaussanity)
    for multiple PDFs, yielding a separate figure for each flavour
    """

[docs]    def setup_flavour(self, flstate):
        flstate.labels = []
        flstate.handles = []

[docs]    def get_ylabel(self, parton_name):
        return r'$\epsilon(x)$'

[docs]    def draw(self, pdf, grid, flstate):
        """Obtains the gridvalues of epsilon (measure of Gaussianity)"""
        ax = flstate.ax
        flindex = flstate.flindex
        labels = flstate.labels
        handles = flstate.handles

        # Create a copy of the `Stats` instance of the grid
        # with only the flavours we are interested in
        gv = grid.grid_values.data
        stats = grid(grid_values=gv[:, flindex, :]).grid_values

        # Ignore spurious normalization warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            error68down, error68up = stats.errorbar68()  # 68% error bands

        errorstd = stats.std_error()
        # color cycle iterable
        color = ax._get_lines.get_next_color()
        xgrid = grid.xgrid
        # the division by 2 is equivalent to considering the complete 1-sigma band (2 * error_std)
        error68 = (error68up - error68down) / 2.0
        epsilon = abs(1 - errorstd / error68)

        (handle,) = ax.plot(xgrid, epsilon, linestyle="-", color=color)

        handles.append(handle)
        labels.append(pdf.label)

        return [5 * epsilon]

[docs]    def legend(self, flstate):
        return flstate.ax.legend(
            flstate.handles,
            flstate.labels,
            handler_map={plotutils.HandlerSpec: plotutils.ComposedHandler()},
        )


[docs]@make_argcheck
def check_pdfs_are_montecarlo(pdfs, **kwargs):
    """Checks that the action is applied only to a pdf consisiting of MC replicas."""
    for pdf in pdfs:
        etype = pdf.error_type
        if etype != "replicas":
            raise CheckError(f"Error: type of PDF {pdf} must be 'replicas' and not '{etype}'")


[docs]@figuregen
@check_pdfs_are_montecarlo
@check_scale("xscale", allow_none=True)
def plot_epsilon(
    pdfs, xplotting_grids, xscale: (str, type(None)) = None, ymin=None, ymax=None, eps=None
):
    """Plot the discrepancy (epsilon) of the 1-sigma and 68% bands at each grid value
    for all pdfs for a given Q. See https://arxiv.org/abs/1505.06736 eq. (11)

    xscale is read from pdf plotting_grid scale, which is 'log' by default.

    eps defines the value at which plot a simple hline
    """
    yield from PDFEpsilonPlotter(
        pdfs, xplotting_grids, xscale, normalize_to=None, ymin=ymin, ymax=ymax
    )