"""
Utilities for reweighting studies.
Implements utilities for calculating the NNPDF weights and unweighted PDF sets.
It also allows for some basic statistics.
"""
from collections import OrderedDict
import logging
import warnings
import numpy as np
import pandas as pd
from scipy import optimize
from reportengine import collect
from reportengine.checks import check_not_empty
from reportengine.figure import figure
from reportengine.table import table
from validphys import checks, plotutils
from validphys.core import PDF, Filter
from validphys.dataplots import plot_training_validation
from validphys.lhio import new_pdf_from_indexes
from validphys.pdfoutput import pdfset
log = logging.getLogger(__name__)
__all__ = (
'chi2_data_for_reweighting_experiments',
'make_unweighted_pdf',
'nnpdf_weights',
'nnpdf_weights_numerator',
'p_alpha_study',
'plot_p_alpha',
'reweighting_stats',
'unweighted_index',
'make_pdf_from_filtered_outliers',
)
chi2_data_for_reweighting_experiments_inner = collect(
'abs_chi2_data_experiment', ['reweighting_experiments']
)
# This is to add checks and to request that use_t0 is set explicitly
# pylint: disable=unused-argument
[docs]
@checks.check_pdf_is_montecarlo
def chi2_data_for_reweighting_experiments(chi2_data_for_reweighting_experiments_inner, use_t0):
return chi2_data_for_reweighting_experiments_inner
[docs]
def nnpdf_weights_numerator(chi2_data_for_reweighting_experiments):
"""Compute the numerator of the NNPDF weights. This is useful for P(α),
which uses a different normalization."""
total_ndata = 0
chi2s = np.zeros_like(chi2_data_for_reweighting_experiments[0][0].error_members())
for data in chi2_data_for_reweighting_experiments:
res, _, ndata = data
total_ndata += ndata
chi2s += res.data
chi2s = np.ravel(chi2s)
logw = ((total_ndata - 1) / 2) * np.log(chi2s) - 0.5 * chi2s
logw -= np.max(logw)
w = np.exp(logw)
return w
[docs]
@table
# will call list[0]
@check_not_empty('reweighting_experiments')
def nnpdf_weights(chi2_data_for_reweighting_experiments):
"""Compute the replica weights according to the NNPDF formula."""
numerator = nnpdf_weights_numerator(chi2_data_for_reweighting_experiments)
return pd.DataFrame(numerator / np.sum(numerator), index=np.arange(1, len(numerator) + 1))
def effective_number_of_replicas(w):
N = len(w)
w = w * N / np.sum(w)
return np.exp(np.nansum(w * np.log(N) - w * np.log(w)) / N)
[docs]
@table
def reweighting_stats(pdf, nnpdf_weights, p_alpha_study):
"""Compute various statistics related to reweighting.
Those are:
- Number of initial replicas.
- Effective number of replicas.
- Median of the weightd.
- The maximum value of P(alpha) in some sensible range.
"""
er = effective_number_of_replicas(nnpdf_weights)
initial_replicas = pdf.get_members()
median = np.median(nnpdf_weights)
max_alpha = p_alpha_study.idxmax()
result = OrderedDict(
[
(r'$N_{\mathrm{initial}}$', initial_replicas),
(r'$N_{eff}$', er),
(r'median($w$)', median),
(r'$max_{[0.5,4]}P(\alpha)$', max_alpha),
]
)
return pd.Series(result, index=result.keys(), name='Reweighting stats')
def _get_p_alpha_val(alpha, chi2_data_for_reweighting_experiments):
new_chi2 = [
((type(res)(res.error_members() / alpha**2)), central, ndata)
for (res, central, ndata) in chi2_data_for_reweighting_experiments
]
new_ws = nnpdf_weights_numerator(new_chi2)
val = np.sum(new_ws / alpha)
return val
def _get_p_alpha_vals(alphas, chi2_data_for_reweighting_experiments):
vals = []
for alpha in alphas:
val = _get_p_alpha_val(alpha, chi2_data_for_reweighting_experiments)
vals.append(val)
return vals
[docs]
def p_alpha_study(chi2_data_for_reweighting_experiments):
"""Compute P(α) in an automatic range"""
small = 0.5
f = lambda alpha: -_get_p_alpha_val(alpha, chi2_data_for_reweighting_experiments)
# Ignore warnings for nonsensical alphas
with warnings.catch_warnings():
warnings.simplefilter('ignore')
alphamax = optimize.fmin(f, 10, disp=False)[0]
"""
#First find the maximum
small = 0.5
limclose=4
#No, 100 is not a crazy big value
limfar = 100
nclose = 80
nfar = 40
alphas = np.zeros(nclose+nfar)
alphas[:nclose] = np.linspace(small, limclose, nclose, endpoint=False)
alphas[nclose:] = np.linspace(limclose,limfar,nfar)
vals = _get_p_alpha_vals(alphas, chi2_data_for_reweighting_experiments)
big = 1.2*alphas[np.argmax(vals)]
#Compute more points around the maximum
alphas = np.linspace(small, big, 100)
"""
big = 1.4 * alphamax
alphas = np.linspace(small, big, 1000)
vals = _get_p_alpha_vals(alphas, chi2_data_for_reweighting_experiments)
return pd.Series(np.array(vals), index=alphas)
[docs]
@figure
def plot_p_alpha(p_alpha_study):
"""Plot the results of ``p_alpha_study``."""
fig, ax = plotutils.subplots()
ax.set_title(r"$P(\alpha)$")
xmax = p_alpha_study.idxmax()
ymax = p_alpha_study[xmax]
ax.axvline(xmax, color='red', linestyle='--')
ax.annotate(r'$\alpha=%.2f$' % xmax, (xmax, (ymax - ax.get_ylim()[0]) / 2))
ax.set_yticklabels([])
ax.set_xlabel(r'$\alpha$')
ax.plot(p_alpha_study)
return fig
[docs]
@table
def unweighted_index(nnpdf_weights, nreplicas: int = 100):
"""The index of the input replicas that corresponds to an unweighted set,
for the given weights. This can be saved for testing purposes."""
nnpdf_weights = np.ravel(nnpdf_weights)
res = 1 + np.random.choice(len(nnpdf_weights), size=nreplicas, p=nnpdf_weights)
return pd.DataFrame(res, index=np.arange(1, nreplicas + 1))
[docs]
@pdfset
@checks.check_can_save_grid
def make_unweighted_pdf(
pdf, unweighted_index, set_name: str, output_path=None, installgrid: bool = True
):
"""Generate an unweighted PDF set, from the prior ``pdf`` and the
reweighting_experiments. The PDF is written to a `pdfsets` directory of
the output folder. Return the relative path of the newly created PDF."""
new_pdf_from_indexes(
pdf=pdf,
indexes=np.ravel(unweighted_index),
set_name=set_name,
folder=output_path,
installgrid=installgrid,
)
@checks.make_check
def _check_cut(ns, *args, **kwargs):
cut = ns['nsigma_cut']
msg = "'nsigma_cut' must be a float greater than zero."
try:
if cut > 0:
return
# TODO: Check types for parameters automatically
except TypeError as e:
raise checks.CheckError(msg) from e
raise checks.CheckError(msg)
@checks.check_has_fitted_replicas
@_check_cut
def chi2filtered_index(fit, replica_data, nsigma_cut: float):
"""Discard outliers until the χ² to fitted data of all replicas has
a smaller deviation than `nsigma_cut`
(in units of the standard deviation of the final ensemble)
"""
chis = np.array([dt.chi2 for dt in replica_data])
newchis = chis
oldmean = np.mean(newchis)
while True:
mean = np.mean(newchis)
std = np.std(newchis)
limit = mean + std * nsigma_cut
# Replica indexes start at 1
indexes = np.where(chis < limit)[0]
if len(newchis) == len(indexes):
break
newchis = chis[indexes]
log.info(
"%s: Mean ̉χ² was %.2f, and the threshold is %.2f. Discarded %d "
"replicas out of %d. Now the mean is %.2f.",
fit.label,
oldmean,
limit,
len(chis) - len(indexes),
len(chis),
np.mean(newchis),
)
label = r'$\chi^2$ Filter: %.2f' % nsigma_cut
return Filter(indexes, label, nsigma_cut=nsigma_cut)
def negative_filtered_index(count_negative_points, filter_Q=75):
cut = np.percentile(count_negative_points, filter_Q)
indexes = np.where(count_negative_points <= cut)[0]
label = 'Positivity Selection Q=%.2f' % filter_Q
log.info(f"Positivity cut for Q={filter_Q:.2f} is at {cut:.0f} negative points")
return Filter(indexes, label, filter_Q=filter_Q)
[docs]
@pdfset
@checks.check_can_save_grid
def make_pdf_from_filtered_outliers(
fit, chi2filtered_index, set_name: str, output_path=None, installgrid: bool = True
):
"""Produce a new grid with the result of chi2filtered_index"""
indexes = chi2filtered_index.indexes + 1 # libnnpdf nonsense
new_pdf_from_indexes(
pdf=PDF(fit.name),
indexes=indexes,
set_name=set_name,
folder=output_path,
installgrid=installgrid,
)
# TODO: Use the filter framework here when it exists
@figure
def plot_chi2filtered_training_validation(fit, nsigma_cut, replica_data, chi2filtered_index):
"""Like `plot_training_validation`, but apply `chi2filtered_index` mask."""
return plot_training_validation(fit, replica_data, dict([chi2filtered_index.as_pair()]))
# TODO: Use the filter framework here when it exists
@figure
def plot_posfiltered_training_validation(fit, replica_data, negative_filtered_index):
"""Like `plot_training_validation`, but apply `chi2filtered_index` mask."""
return plot_training_validation(fit, replica_data, dict([negative_filtered_index.as_pair()]))
p_alpha_for_all_datasets = collect(p_alpha_study, ('reweight_all_datasets',))
chi_2_for_all_datasets = collect(chi2_data_for_reweighting_experiments, ('reweight_all_datasets',))
@table
def p_alpha_all_datasets_table(
p_alpha_for_all_datasets, reweight_all_datasets, chi_2_for_all_datasets
):
"""Compute and display P(alpha) and chi² for all datasets
in all experiments."""
data = []
chilabel = r'$\chi^2$'
modelabel = r'mode P($\alpha$)'
for series, rexp, chis in zip(
p_alpha_for_all_datasets, reweight_all_datasets, chi_2_for_all_datasets
):
central = chis[0][1] / chis[0][2]
exp = rexp['reweighting_experiments'][0]
data.append({'name': exp.datasets[0].name, modelabel: series.argmax(), chilabel: central})
return pd.DataFrame.from_records(data, index='name', columns=('name', modelabel, chilabel))