Source code for validphys.closuretest.multiclosure_nsigma

r"""
This module contains the functions used in Sec. 4 of paper: arXiv: 2503.17447

`set_1`, `set_2`, and `set_3` correspond to \(\S_1\), \(\S_2\), and \(\S_3\) in Eq. 4.3, 4.6, and 4.7.
"""

import dataclasses

import numpy as np
import pandas as pd
from scipy.stats import norm

import reportengine
from reportengine import collect
from validphys.closuretest.closure_checks import (
    check_fits_areclosures,
    check_fits_underlying_law_match,
)

Z_ALPHA_RANGE = norm.ppf(1 - np.linspace(0, 0.5, 100))
"""
Quantile range for computing the true positive rate and true negative rate.
"""


[docs]@dataclasses.dataclass
class MulticlosureNsigma:
    """
    Dataclass containing nsigma values for all datasets and fits,
    also used to keep track on whether the multiclosure fit is weighted or not.

    Attributes
    ----------
    nsigma_table: pd.DataFrame
        A table containing n_sigma values.
    is_weighted: bool
        Whether the fit was weighted.
    """

    nsigma_table: pd.DataFrame
    is_weighted: bool


[docs]@check_fits_areclosures
@check_fits_underlying_law_match
def multiclosurefits_nsigma(
    fits: reportengine.namespaces.NSList,
    fits_data: list,
    fits_datasets_chi2_nsigma_deviation: list,
    is_weighted: bool,
) -> MulticlosureNsigma:
    """
    Returns a table (dataframe) containing n_sigma values.
    Index: dataset names, Columns: Level 1 seeds (filterseed).

    Parameters
    ----------
    fits: NSList
        List of fits.
    fits_data: list
        List of data for each fit.
    fits_datasets_chi2_nsigma_deviation: list
        List of n_sigma values for each dataset for each fit.
    is_weighted: bool
        Used to keep track of whether the fit was weighted.

    Returns
    -------
    MulticlosureNsigma
    """
    res_dict = {}
    for fit, input_data, nsigma_data in zip(fits, fits_data, fits_datasets_chi2_nsigma_deviation):
        res_dict[fit.as_input()["closuretest"]["filterseed"]] = d = {}
        for ds, nsigma in zip(input_data.datasets, nsigma_data):
            d[ds.name] = nsigma
    return MulticlosureNsigma(is_weighted=is_weighted, nsigma_table=pd.DataFrame(res_dict))


dataspecs_multiclosurefits_nsigma = collect("multiclosurefits_nsigma", ("dataspecs",))
"""
Collect the multiclosurefits_nsigma over dataspecs.
"""


[docs]@dataclasses.dataclass
class NsigmaAlpha:
    """
    Dataclass storing the set 1 values (can be used both for the set 1 and its complement).

    Attributes
    ----------
    alpha_dict: dict
        A dictionary containing the set 1 alpha values.
    is_weighted: bool
        Whether the fit was weighted.
    """

    alpha_dict: dict
    is_weighted: bool


[docs]def def_of_nsigma_alpha(
    multiclosurefits_nsigma: pd.DataFrame, weighted_dataset: str, complement: bool = False
) -> NsigmaAlpha:
    """
    Defines how the set 1 alpha values are computed.
    It allows to compute both the set 1 and its complement.

    Parameters
    ----------
    multiclosurefits_nsigma: pd.DataFrame
        The nsigma table.
    weighted_dataset: str
        The name of the weighted dataset.
    complement: bool, default=False
        Whether to compute the complement set 1 alpha values.

    Returns
    -------
    NsigmaAlpha
    """
    df = multiclosurefits_nsigma.nsigma_table
    nsigma_values = df[df.index == weighted_dataset].values.flatten()
    set1_alpha = {}
    for z_alpha in Z_ALPHA_RANGE:
        if complement:
            fit_idxs = np.where(nsigma_values < z_alpha)[0]
        else:
            fit_idxs = np.where(nsigma_values > z_alpha)[0]
        # save it as set to allow for easy intersection with other sets
        set1_alpha[z_alpha] = set(df.columns[fit_idxs])

    return NsigmaAlpha(alpha_dict=set1_alpha, is_weighted=multiclosurefits_nsigma.is_weighted)


[docs]def nsigma_alpha(multiclosurefits_nsigma: pd.DataFrame, weighted_dataset: str) -> NsigmaAlpha:
    """
    Computes the set 1 alpha values.
    """
    return def_of_nsigma_alpha(multiclosurefits_nsigma, weighted_dataset, complement=False)


dataspecs_nsigma_alpha = collect("nsigma_alpha", ("dataspecs",))
"""
Collect set 1 alpha over dataspecs.
"""


[docs]def comp_nsigma_alpha(multiclosurefits_nsigma: pd.DataFrame, weighted_dataset: str) -> NsigmaAlpha:
    """
    Computes the complement set 1 alpha values.
    """
    return def_of_nsigma_alpha(multiclosurefits_nsigma, weighted_dataset, complement=True)


dataspecs_comp_nsigma_alpha = collect("comp_nsigma_alpha", ("dataspecs",))
"""
Collect complement set 1 alpha over dataspecs.
"""


[docs]def set_1(dataspecs_nsigma_alpha: list) -> dict:
    r"""
    Returns the set 1 alpha values, these are defined as

    S_1 = {j | n_{\sigma}^{j} > Z_{\alpha}}

    where j is the index of the fit and n_{\sigma}^{j} is the n-sigma value computed
    for fit j.

    Parameters
    ----------
    dataspecs_nsigma_alpha: list
        List of NsigmaAlpha dataclasses.

    Returns
    -------
    dict
    """
    for dataspec_nsigma in dataspecs_nsigma_alpha:
        if not dataspec_nsigma.is_weighted:
            return dataspec_nsigma.alpha_dict


[docs]def set_2(dataspecs_nsigma_alpha: list) -> dict:
    r"""
    Same as the set 1 alpha values, but for the weighted fits.

    S_2 = {i | n_{weighted, \sigma}^{i} > Z_{\alpha}}

    where i is the index of the fit and n_{weighted, \sigma}^{i} is the n-sigma value computed
    on the weighted dataset for fit i.

    Parameters
    ----------
    dataspecs_nsigma_alpha: list
        List of NsigmaAlpha dataclasses.

    Returns
    -------
    dict
    """
    for dataspec_nsigma in dataspecs_nsigma_alpha:
        if dataspec_nsigma.is_weighted:
            return dataspec_nsigma.alpha_dict


[docs]def comp_set_1(dataspecs_comp_nsigma_alpha: list) -> dict:
    """
    Returns the complement set 1 alpha values.
    """
    for dataspec_nsigma in dataspecs_comp_nsigma_alpha:
        if not dataspec_nsigma.is_weighted:
            return dataspec_nsigma.alpha_dict


[docs]def def_set_3(
    dataspecs_multiclosurefits_nsigma: list, weighted_dataset: str, complement: bool = False
) -> dict:
    """
    Defines how the set 3 values are computed.
    It allows to compute both the set 3 and its complement.

    Parameters
    ----------
    dataspecs_multiclosurefits_nsigma: list
        List of MulticlosureNsigma dataclasses.
    weighted_dataset: str
        The name of the weighted dataset.
    complement: bool, default=False
        Whether to compute the complement set 3 alpha values.

    Returns
    -------
    dict
    """
    # Order the dataspecs so that the weighted dataset is the first one
    dataspecs_mct = []
    for mct_nsigma in dataspecs_multiclosurefits_nsigma:
        if mct_nsigma.is_weighted:
            dataspecs_mct.insert(0, mct_nsigma)
        else:
            dataspecs_mct.append(mct_nsigma)

    df_weight = dataspecs_mct[0].nsigma_table
    df_weight = df_weight[df_weight.index != weighted_dataset]

    df_ref = dataspecs_mct[1].nsigma_table
    df_ref = df_ref[df_ref.index != weighted_dataset]

    # ensure that weighted and reference dfs have the columns in the same order
    # (needed to properly compare fits)
    df_ref = df_ref[df_weight.columns]

    set3_alpha = {}

    for z_alpha in Z_ALPHA_RANGE:
        if complement:
            columns_bools = np.any((df_weight - df_ref).values < z_alpha, axis=0)
        else:
            columns_bools = np.any((df_weight - df_ref).values > z_alpha, axis=0)

        set3_alpha[z_alpha] = set(df_weight.columns[columns_bools])

    return set3_alpha


[docs]def set_3(dataspecs_multiclosurefits_nsigma: list, weighted_dataset: str) -> dict:
    r"""
    Computes the set 3 alpha values. The set 3 is defined as:

    S_3 = {i | n_{weighted, \sigma}^{i} - n_{ref, \sigma}^{i}>  + Z_{\alpha}}

    where the n-sigma is computed on all datasets that are not the weighted dataset.
    Moreover if for a fit i any dataset has a n-sigma value greater than Z_{\alpha}, then
    the fit i is included in the set.
    """
    return def_set_3(dataspecs_multiclosurefits_nsigma, weighted_dataset, complement=False)


[docs]def probability_inconsistent(set_1, set_2, set_3, comp_set_1, n_fits, weighted_dataset):
    """
    The set of inconsistent fits can be defined in different ways, two possible cases are:

    1. C_2: (S_1 intersect S_2) union (S_3)

    2. C_3 = S_1 union (~S_1 intersect S_3)

    The probability of a dataset being inconsistent is defined as:
            P(inconsistent) = |I_alpha| / N
    where N is the total number of fits.
    """

    C_2 = []
    C_3 = []
    for z_alpha in set_3.keys():
        set_3_inters_1 = set_3[z_alpha].intersection(set_1[z_alpha])
        s2 = set_2[z_alpha]

        set_tagged_fits = set_3_inters_1.union(s2)

        C_2.append(len(set_tagged_fits) / n_fits)

        set_tagged_fits = set_tagged_fits.union(comp_set_1[z_alpha].intersection(set_3[z_alpha]))
        C_3.append(len(set_tagged_fits) / n_fits)

    return C_3, C_2