Source code for validphys.closuretest.multiclosure_nsigma

r"""
This module contains the functions used in Sec. 4 of paper: arXiv: 2503.17447

`set_1`, `set_2`, and `set_3` correspond to \(\S_1\), \(\S_2\), and \(\S_3\) in Eq. 4.3, 4.6, and 4.7.
"""

import dataclasses

import numpy as np
import pandas as pd
from scipy.stats import norm

import reportengine
from reportengine import collect
from validphys.closuretest.closure_checks import (
    check_fits_areclosures,
    check_fits_underlying_law_match,
)

Z_ALPHA_RANGE = norm.ppf(1 - np.linspace(0, 0.5, 100))
"""
Quantile range for computing the true positive rate and true negative rate.
"""


[docs]@dataclasses.dataclass class MulticlosureNsigma: """ Dataclass containing nsigma values for all datasets and fits, also used to keep track on whether the multiclosure fit is weighted or not. Attributes ---------- nsigma_table: pd.DataFrame A table containing n_sigma values. is_weighted: bool Whether the fit was weighted. """ nsigma_table: pd.DataFrame is_weighted: bool
[docs]@check_fits_areclosures @check_fits_underlying_law_match def multiclosurefits_nsigma( fits: reportengine.namespaces.NSList, fits_data: list, fits_datasets_chi2_nsigma_deviation: list, is_weighted: bool, ) -> MulticlosureNsigma: """ Returns a table (dataframe) containing n_sigma values. Index: dataset names, Columns: Level 1 seeds (filterseed). Parameters ---------- fits: NSList List of fits. fits_data: list List of data for each fit. fits_datasets_chi2_nsigma_deviation: list List of n_sigma values for each dataset for each fit. is_weighted: bool Used to keep track of whether the fit was weighted. Returns ------- MulticlosureNsigma """ res_dict = {} for fit, input_data, nsigma_data in zip(fits, fits_data, fits_datasets_chi2_nsigma_deviation): res_dict[fit.as_input()["closuretest"]["filterseed"]] = d = {} for ds, nsigma in zip(input_data.datasets, nsigma_data): d[ds.name] = nsigma return MulticlosureNsigma(is_weighted=is_weighted, nsigma_table=pd.DataFrame(res_dict))
dataspecs_multiclosurefits_nsigma = collect("multiclosurefits_nsigma", ("dataspecs",)) """ Collect the multiclosurefits_nsigma over dataspecs. """
[docs]@dataclasses.dataclass class NsigmaAlpha: """ Dataclass storing the set 1 values (can be used both for the set 1 and its complement). Attributes ---------- alpha_dict: dict A dictionary containing the set 1 alpha values. is_weighted: bool Whether the fit was weighted. """ alpha_dict: dict is_weighted: bool
[docs]def def_of_nsigma_alpha( multiclosurefits_nsigma: pd.DataFrame, weighted_dataset: str, complement: bool = False ) -> NsigmaAlpha: """ Defines how the set 1 alpha values are computed. It allows to compute both the set 1 and its complement. Parameters ---------- multiclosurefits_nsigma: pd.DataFrame The nsigma table. weighted_dataset: str The name of the weighted dataset. complement: bool, default=False Whether to compute the complement set 1 alpha values. Returns ------- NsigmaAlpha """ df = multiclosurefits_nsigma.nsigma_table nsigma_values = df[df.index == weighted_dataset].values.flatten() set1_alpha = {} for z_alpha in Z_ALPHA_RANGE: if complement: fit_idxs = np.where(nsigma_values < z_alpha)[0] else: fit_idxs = np.where(nsigma_values > z_alpha)[0] # save it as set to allow for easy intersection with other sets set1_alpha[z_alpha] = set(df.columns[fit_idxs]) return NsigmaAlpha(alpha_dict=set1_alpha, is_weighted=multiclosurefits_nsigma.is_weighted)
[docs]def nsigma_alpha(multiclosurefits_nsigma: pd.DataFrame, weighted_dataset: str) -> NsigmaAlpha: """ Computes the set 1 alpha values. """ return def_of_nsigma_alpha(multiclosurefits_nsigma, weighted_dataset, complement=False)
dataspecs_nsigma_alpha = collect("nsigma_alpha", ("dataspecs",)) """ Collect set 1 alpha over dataspecs. """
[docs]def comp_nsigma_alpha(multiclosurefits_nsigma: pd.DataFrame, weighted_dataset: str) -> NsigmaAlpha: """ Computes the complement set 1 alpha values. """ return def_of_nsigma_alpha(multiclosurefits_nsigma, weighted_dataset, complement=True)
dataspecs_comp_nsigma_alpha = collect("comp_nsigma_alpha", ("dataspecs",)) """ Collect complement set 1 alpha over dataspecs. """
[docs]def set_1(dataspecs_nsigma_alpha: list) -> dict: r""" Returns the set 1 alpha values, these are defined as S_1 = {j | n_{\sigma}^{j} > Z_{\alpha}} where j is the index of the fit and n_{\sigma}^{j} is the n-sigma value computed for fit j. Parameters ---------- dataspecs_nsigma_alpha: list List of NsigmaAlpha dataclasses. Returns ------- dict """ for dataspec_nsigma in dataspecs_nsigma_alpha: if not dataspec_nsigma.is_weighted: return dataspec_nsigma.alpha_dict
[docs]def set_2(dataspecs_nsigma_alpha: list) -> dict: r""" Same as the set 1 alpha values, but for the weighted fits. S_2 = {i | n_{weighted, \sigma}^{i} > Z_{\alpha}} where i is the index of the fit and n_{weighted, \sigma}^{i} is the n-sigma value computed on the weighted dataset for fit i. Parameters ---------- dataspecs_nsigma_alpha: list List of NsigmaAlpha dataclasses. Returns ------- dict """ for dataspec_nsigma in dataspecs_nsigma_alpha: if dataspec_nsigma.is_weighted: return dataspec_nsigma.alpha_dict
[docs]def comp_set_1(dataspecs_comp_nsigma_alpha: list) -> dict: """ Returns the complement set 1 alpha values. """ for dataspec_nsigma in dataspecs_comp_nsigma_alpha: if not dataspec_nsigma.is_weighted: return dataspec_nsigma.alpha_dict
[docs]def def_set_3( dataspecs_multiclosurefits_nsigma: list, weighted_dataset: str, complement: bool = False ) -> dict: """ Defines how the set 3 values are computed. It allows to compute both the set 3 and its complement. Parameters ---------- dataspecs_multiclosurefits_nsigma: list List of MulticlosureNsigma dataclasses. weighted_dataset: str The name of the weighted dataset. complement: bool, default=False Whether to compute the complement set 3 alpha values. Returns ------- dict """ # Order the dataspecs so that the weighted dataset is the first one dataspecs_mct = [] for mct_nsigma in dataspecs_multiclosurefits_nsigma: if mct_nsigma.is_weighted: dataspecs_mct.insert(0, mct_nsigma) else: dataspecs_mct.append(mct_nsigma) df_weight = dataspecs_mct[0].nsigma_table df_weight = df_weight[df_weight.index != weighted_dataset] df_ref = dataspecs_mct[1].nsigma_table df_ref = df_ref[df_ref.index != weighted_dataset] # ensure that weighted and reference dfs have the columns in the same order # (needed to properly compare fits) df_ref = df_ref[df_weight.columns] set3_alpha = {} for z_alpha in Z_ALPHA_RANGE: if complement: columns_bools = np.any((df_weight - df_ref).values < z_alpha, axis=0) else: columns_bools = np.any((df_weight - df_ref).values > z_alpha, axis=0) set3_alpha[z_alpha] = set(df_weight.columns[columns_bools]) return set3_alpha
[docs]def set_3(dataspecs_multiclosurefits_nsigma: list, weighted_dataset: str) -> dict: r""" Computes the set 3 alpha values. The set 3 is defined as: S_3 = {i | n_{weighted, \sigma}^{i} - n_{ref, \sigma}^{i}> + Z_{\alpha}} where the n-sigma is computed on all datasets that are not the weighted dataset. Moreover if for a fit i any dataset has a n-sigma value greater than Z_{\alpha}, then the fit i is included in the set. """ return def_set_3(dataspecs_multiclosurefits_nsigma, weighted_dataset, complement=False)
[docs]def probability_inconsistent(set_1, set_2, set_3, comp_set_1, n_fits, weighted_dataset): """ The set of inconsistent fits can be defined in different ways, two possible cases are: 1. C_2: (S_1 intersect S_2) union (S_3) 2. C_3 = S_1 union (~S_1 intersect S_3) The probability of a dataset being inconsistent is defined as: P(inconsistent) = |I_alpha| / N where N is the total number of fits. """ C_2 = [] C_3 = [] for z_alpha in set_3.keys(): set_3_inters_1 = set_3[z_alpha].intersection(set_1[z_alpha]) s2 = set_2[z_alpha] set_tagged_fits = set_3_inters_1.union(s2) C_2.append(len(set_tagged_fits) / n_fits) set_tagged_fits = set_tagged_fits.union(comp_set_1[z_alpha].intersection(set_3[z_alpha])) C_3.append(len(set_tagged_fits) / n_fits) return C_3, C_2