Source code for validphys.reweighting

"""
Utilities for reweighting studies.

Implements utilities for calculating the NNPDF weights and unweighted PDF sets.
It also allows for some basic statistics.
"""

from collections import OrderedDict
import logging
import warnings

import numpy as np
import pandas as pd
from scipy import optimize

from reportengine import collect
from reportengine.checks import check_not_empty
from reportengine.figure import figure
from reportengine.table import table
from validphys import checks, plotutils
from validphys.core import PDF, Filter
from validphys.dataplots import plot_training_validation
from validphys.lhio import new_pdf_from_indexes
from validphys.pdfoutput import pdfset

log = logging.getLogger(__name__)

__all__ = (
    'chi2_data_for_reweighting_experiments',
    'make_unweighted_pdf',
    'nnpdf_weights',
    'nnpdf_weights_numerator',
    'p_alpha_study',
    'plot_p_alpha',
    'reweighting_stats',
    'unweighted_index',
    'make_pdf_from_filtered_outliers',
)


chi2_data_for_reweighting_experiments_inner = collect(
    'abs_chi2_data_experiment', ['reweighting_experiments']
)


# This is to add checks and to request that use_t0 is set explicitly
# pylint: disable=unused-argument
[docs]@checks.check_pdf_is_montecarlo
def chi2_data_for_reweighting_experiments(chi2_data_for_reweighting_experiments_inner, use_t0):
    return chi2_data_for_reweighting_experiments_inner


[docs]def nnpdf_weights_numerator(chi2_data_for_reweighting_experiments):
    """Compute the numerator of the NNPDF weights. This is useful for P(α),
    which uses a different normalization."""
    total_ndata = 0
    chi2s = np.zeros_like(chi2_data_for_reweighting_experiments[0][0].error_members())
    for data in chi2_data_for_reweighting_experiments:
        res, _, ndata = data
        total_ndata += ndata
        chi2s += res.data

    chi2s = np.ravel(chi2s)

    logw = ((total_ndata - 1) / 2) * np.log(chi2s) - 0.5 * chi2s
    logw -= np.max(logw)
    w = np.exp(logw)
    return w


[docs]@table
# will call list[0]
@check_not_empty('reweighting_experiments')
def nnpdf_weights(chi2_data_for_reweighting_experiments):
    """Compute the replica weights according to the NNPDF formula."""
    numerator = nnpdf_weights_numerator(chi2_data_for_reweighting_experiments)
    return pd.DataFrame(numerator / np.sum(numerator), index=np.arange(1, len(numerator) + 1))


def effective_number_of_replicas(w):
    N = len(w)
    w = w * N / np.sum(w)
    return np.exp(np.nansum(w * np.log(N) - w * np.log(w)) / N)


[docs]@table
def reweighting_stats(pdf, nnpdf_weights, p_alpha_study):
    """Compute various statistics related to reweighting.

    Those are:
     - Number of initial replicas.
     - Effective number of replicas.
     - Median of the weightd.
     - The maximum value of P(alpha) in some sensible range.
    """
    er = effective_number_of_replicas(nnpdf_weights)
    initial_replicas = pdf.get_members()
    median = np.median(nnpdf_weights)
    max_alpha = p_alpha_study.idxmax()

    result = OrderedDict(
        [
            (r'$N_{\mathrm{initial}}$', initial_replicas),
            (r'$N_{eff}$', er),
            (r'median($w$)', median),
            (r'$max_{[0.5,4]}P(\alpha)$', max_alpha),
        ]
    )

    return pd.Series(result, index=result.keys(), name='Reweighting stats')


def _get_p_alpha_val(alpha, chi2_data_for_reweighting_experiments):
    new_chi2 = [
        ((type(res)(res.error_members() / alpha**2)), central, ndata)
        for (res, central, ndata) in chi2_data_for_reweighting_experiments
    ]

    new_ws = nnpdf_weights_numerator(new_chi2)
    val = np.sum(new_ws / alpha)
    return val


def _get_p_alpha_vals(alphas, chi2_data_for_reweighting_experiments):
    vals = []
    for alpha in alphas:
        val = _get_p_alpha_val(alpha, chi2_data_for_reweighting_experiments)
        vals.append(val)

    return vals


[docs]def p_alpha_study(chi2_data_for_reweighting_experiments):
    """Compute P(α) in an automatic range"""

    small = 0.5

    f = lambda alpha: -_get_p_alpha_val(alpha, chi2_data_for_reweighting_experiments)
    # Ignore warnings for nonsensical alphas
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        alphamax = optimize.fmin(f, 10, disp=False)[0]

    """
    #First find the maximum
    small = 0.5

    limclose=4
    #No, 100 is not a crazy big value
    limfar = 100

    nclose = 80
    nfar = 40

    alphas = np.zeros(nclose+nfar)
    alphas[:nclose] = np.linspace(small, limclose, nclose, endpoint=False)
    alphas[nclose:] = np.linspace(limclose,limfar,nfar)
    vals = _get_p_alpha_vals(alphas, chi2_data_for_reweighting_experiments)
    big = 1.2*alphas[np.argmax(vals)]

    #Compute more points around the maximum
    alphas = np.linspace(small, big, 100)
    """

    big = 1.4 * alphamax

    alphas = np.linspace(small, big, 1000)
    vals = _get_p_alpha_vals(alphas, chi2_data_for_reweighting_experiments)

    return pd.Series(np.array(vals), index=alphas)


[docs]@figure
def plot_p_alpha(p_alpha_study):
    """Plot the results of ``p_alpha_study``."""
    fig, ax = plotutils.subplots()
    ax.set_title(r"$P(\alpha)$")

    xmax = p_alpha_study.idxmax()
    ymax = p_alpha_study[xmax]
    ax.axvline(xmax, color='red', linestyle='--')
    ax.annotate(r'$\alpha=%.2f$' % xmax, (xmax, (ymax - ax.get_ylim()[0]) / 2))

    ax.set_yticklabels([])
    ax.set_xlabel(r'$\alpha$')

    ax.plot(p_alpha_study)
    return fig


[docs]@table
def unweighted_index(nnpdf_weights, nreplicas: int = 100):
    """The index of the input replicas that corresponds to an unweighted set,
    for the given weights. This can be saved for testing purposes."""
    nnpdf_weights = np.ravel(nnpdf_weights)
    res = 1 + np.random.choice(len(nnpdf_weights), size=nreplicas, p=nnpdf_weights)
    return pd.DataFrame(res, index=np.arange(1, nreplicas + 1))


[docs]@pdfset
@checks.check_can_save_grid
def make_unweighted_pdf(
    pdf, unweighted_index, set_name: str, output_path=None, installgrid: bool = True
):
    """Generate an unweighted PDF set, from the prior ``pdf`` and the
    reweighting_experiments. The PDF is written to a `pdfsets` directory of
    the output folder. Return the relative path of the newly created PDF."""

    new_pdf_from_indexes(
        pdf=pdf,
        indexes=np.ravel(unweighted_index),
        set_name=set_name,
        folder=output_path,
        installgrid=installgrid,
    )


@checks.make_check
def _check_cut(ns, *args, **kwargs):
    cut = ns['nsigma_cut']
    msg = "'nsigma_cut' must be a float greater than zero."
    try:
        if cut > 0:
            return
    # TODO: Check types for parameters automatically
    except TypeError as e:
        raise checks.CheckError(msg) from e
    raise checks.CheckError(msg)


@checks.check_has_fitted_replicas
@_check_cut
def chi2filtered_index(fit, replica_data, nsigma_cut: float):
    """Discard outliers until the χ² to fitted data of all replicas has
    a smaller deviation than `nsigma_cut`
    (in units of the standard deviation of the final ensemble)
    """

    chis = np.array([dt.chi2 for dt in replica_data])
    newchis = chis
    oldmean = np.mean(newchis)
    while True:
        mean = np.mean(newchis)
        std = np.std(newchis)
        limit = mean + std * nsigma_cut
        # Replica indexes start at 1
        indexes = np.where(chis < limit)[0]

        if len(newchis) == len(indexes):
            break

        newchis = chis[indexes]

    log.info(
        "%s: Mean ̉χ² was %.2f, and the threshold is %.2f. Discarded %d "
        "replicas out of %d. Now the mean is %.2f.",
        fit.label,
        oldmean,
        limit,
        len(chis) - len(indexes),
        len(chis),
        np.mean(newchis),
    )

    label = r'$\chi^2$ Filter: %.2f' % nsigma_cut

    return Filter(indexes, label, nsigma_cut=nsigma_cut)


def negative_filtered_index(count_negative_points, filter_Q=75):
    cut = np.percentile(count_negative_points, filter_Q)
    indexes = np.where(count_negative_points <= cut)[0]
    label = 'Positivity Selection Q=%.2f' % filter_Q
    log.info(f"Positivity cut for Q={filter_Q:.2f} is at {cut:.0f} negative points")
    return Filter(indexes, label, filter_Q=filter_Q)


[docs]@pdfset
@checks.check_can_save_grid
def make_pdf_from_filtered_outliers(
    fit, chi2filtered_index, set_name: str, output_path=None, installgrid: bool = True
):
    """Produce a new grid with the result of chi2filtered_index"""

    indexes = chi2filtered_index.indexes + 1  # libnnpdf nonsense
    new_pdf_from_indexes(
        pdf=PDF(fit.name),
        indexes=indexes,
        set_name=set_name,
        folder=output_path,
        installgrid=installgrid,
    )


# TODO: Use the filter framework here when it exists
@figure
def plot_chi2filtered_training_validation(fit, nsigma_cut, replica_data, chi2filtered_index):
    """Like `plot_training_validation`, but apply `chi2filtered_index` mask."""
    return plot_training_validation(fit, replica_data, dict([chi2filtered_index.as_pair()]))


# TODO: Use the filter framework here when it exists
@figure
def plot_posfiltered_training_validation(fit, replica_data, negative_filtered_index):
    """Like `plot_training_validation`, but apply `chi2filtered_index` mask."""
    return plot_training_validation(fit, replica_data, dict([negative_filtered_index.as_pair()]))


p_alpha_for_all_datasets = collect(p_alpha_study, ('reweight_all_datasets',))

chi_2_for_all_datasets = collect(chi2_data_for_reweighting_experiments, ('reweight_all_datasets',))


@table
def p_alpha_all_datasets_table(
    p_alpha_for_all_datasets, reweight_all_datasets, chi_2_for_all_datasets
):
    """Compute and display P(alpha) and chi² for all datasets
    in all experiments."""
    data = []
    chilabel = r'$\chi^2$'
    modelabel = r'mode P($\alpha$)'
    for series, rexp, chis in zip(
        p_alpha_for_all_datasets, reweight_all_datasets, chi_2_for_all_datasets
    ):
        central = chis[0][1] / chis[0][2]
        exp = rexp['reweighting_experiments'][0]
        data.append({'name': exp.datasets[0].name, modelabel: series.argmax(), chilabel: central})
    return pd.DataFrame.from_records(data, index='name', columns=('name', modelabel, chilabel))