Source code for validphys.pineparser

"""
Loader for the pineappl-based FKTables

The FKTables for pineappl have ``pineappl.lz4`` and can be utilized
directly with the ``pineappl`` cli as well as read with ``pineappl.fk_table``
"""

import logging

import numpy as np
import pandas as pd

from nnpdf_data.commondataparser import EXT, TheoryMeta
from validphys.coredata import FKTableData

log = logging.getLogger(__name__)


[docs]class GridFileNotFound(FileNotFoundError):
    """PineAPPL file for FK table not found."""


[docs]def pineko_yaml(yaml_file, grids_folder):
    """Given a yaml_file, returns the corresponding dictionary and grids.

    The dictionary contains all information and we return an extra field
    with all the grids to be loaded for the given dataset.

    Parameters
    ----------
    yaml_file : pathlib.Path
        path of the yaml file for the given dataset
    grids_folder : pathlib.Path
        path of the grids folder
    check_grid_existence: bool
        if True (default) checks whether the grid exists

    Returns
    -------
    yaml_content: dict
        Metadata prepared for the FKTables
    paths: list(list(path))
        List (of lists) with all the grids that will need to be loaded
    """
    # TODO: the theory metadata can be found inside the commondata metadata
    # however, for the time being, pineappl tables contain this information in the `yamldb` database
    # they should be 100% compatible (and if they are not there is something wrong somewhere)
    # so already at this stage, use TheoryMeta parser to get the metadata for pineappl theories
    # Note also that we need to use this "parser" due to the usage of the name "operands" in the yamldb
    theory_meta = TheoryMeta.parser(yaml_file)
    member_paths = theory_meta.fktables_to_paths(grids_folder)
    return theory_meta, member_paths


def _pinelumi_to_columns(pine_luminosity, hadronic):
    """Makes the pineappl luminosity into the column indices of a dataframe
    These corresponds to the indices of a flattened (14x14) matrix for hadronic observables
    and the non-zero indices of the 14-flavours for DIS

    Parameters
    ----------
        pine_luminosity: list(tuple(int))
            list with a pair of flavours per channel
        hadronic: bool
            flag for hadronic / DIS observables

    Returns
    -------
        list(int): list of labels for the columns
    """
    evol_basis_pids = tuple(
        [22, 100, 21, 200]
        + [200 + n**2 - 1 for n in range(2, 6 + 1)]
        + [100 + n**2 - 1 for n in range(2, 6 + 1)]
    )
    flav_size = len(evol_basis_pids)
    columns = []
    # TODO: Extend this to deal with a generic number of convolutions
    if hadronic:
        for i, j in pine_luminosity:
            idx = evol_basis_pids.index(i)
            jdx = evol_basis_pids.index(j)
            columns.append(flav_size * idx + jdx)
    else:
        # Now for DIS, there is only ONE single PID
        columns = [evol_basis_pids.index(i[0]) for i in pine_luminosity]
    return columns


def _get_convolution_types(convolutions):
    """Get the type of convolutions from for the given FK table.

    Parameters
    ----------
    convolutions: list[pineappl.convolutions.Conv]
        a list of PineAPPL object containing the types of convolutions

    Returns
    -------
        tuple(str): a tuple of string containing whose elements are either
            `UnpolPDF` or `PolPDF`
    """
    # TODO: Extend the following to deal with `time_like` FFs
    convolution_types = []
    for convolution in convolutions:
        if convolution.convolution_types.polarized:
            convolution_types.append("PolPDF")
        else:
            convolution_types.append("UnpolPDF")
    return tuple(convolution_types)


[docs]def get_yaml_information(yaml_file, theorypath):
    """Reads the yaml information from a yaml compound file

    Transitional function: the call to "pineko" might be to some other commondata reader
    that will know how to extract the information from the commondata
    """
    # The pineappl grids are just stored where the fktables would be, "fastkernel"
    grids_folder = theorypath / "fastkernel"
    return pineko_yaml(yaml_file, grids_folder)


[docs]def pineappl_reader(fkspec):
    """
    Receives a fkspec, which contains the path to the fktables that are to be read by pineappl
    as well as metadata that fixes things like conversion factors or apfelcomb flag.
    The fkspec contains also the cfactors which are applied _directly_ to each of the fktables.

    The output of this function is an instance of FKTableData which can be generated from reading
    several FKTable files which get concatenated on the ndata (bin) axis.

    For more information on the reading of pineappl tables:
        https://pineappl.readthedocs.io/en/latest/modules/pineappl/pineappl.html#pineappl.pineappl.PyFkTable

    About the reader:
        Each pineappl table is a 4-dimensional grid with:
            (ndata, active channels, x1, x2)
        for DIS grids x2 will contain one single number.
        The luminosity channels are given in a (flav1, flav2) format and thus need to be converted
        to the 1-D index of a (14x14) luminosity tensor in order to put in the form of a dataframe.

        All grids in pineappl are constructed with the exact same xgrid,
        the active channels can vary and so when grids are concatenated for an observable
        the gaps are filled with 0s.

        The pineappl grids are such that obs = sum_{bins} fk * f (*f) * bin_w
        so in order to use them together with old-style grids (obs = sum_{bins} fk * xf (*xf))
        it is necessary to remove the factor of x and the normalization of the bins.

    About apfelcomb flags in yamldb files:
        old commondata files and old grids have over time been through various iterations while remaining compatibility between each other,
        and fixes and hacks have been incorporated in one or another
        for the new theory to be compatible with old commpondata it is necessary
        to keep track of said hacks (and to apply conversion factors when required)
    NOTE: both conversion factors and apfelcomb flags will be eventually removed.

    Returns
    -------
        validphys.coredata.FKTableData
            an FKTableData object containing all necessary information to compute predictions
    """
    from pineappl.fk_table import FkTable

    pines = []
    for fk_path in fkspec.fkpath:
        try:
            pines.append(FkTable.read(fk_path))
        except BaseException as e:
            # Catch absolutely any error coming from pineappl, give some info and immediately raise
            log.error(f"Fatal error reading {fk_path}")
            raise e

    cfactors = fkspec.load_cfactors()

    # Extract metadata from the first grid
    pine_rep = pines[0]

    # Get the convolution types for this FK table
    convolutions = pine_rep.convolutions
    convolution_types = _get_convolution_types(convolutions)

    # Is it hadronic? For the time being, hadronic is defined with `len(convolutions) == 2`.
    # Hadronic could also involve 3 convolutions in processes such as `pp->H` (with FFs).
    # DIS FK table now only contains ONE single convolutions.
    hadronic = len(convolutions) == 2
    # For the time being, allow only hadronic with identical initial-state protons
    if hadronic and (convolutions[0].pid != 2212 or convolutions[1].pid != 2212):
        raise ValueError("Only two identical protons in the initial-state are allowed.")

    # TODO: While now any arbittrary number of convolutions is allowed, for the time being,
    # raise Errors when the number of convolutions is more than 2.
    if len(convolutions) > 2:
        raise ValueError("Only FK tables with maximum 2 convolutions are allowed.")

    Q0 = np.sqrt(pine_rep.fac0())
    xgrid = np.array([])
    for pine in pines:
        xgrid = np.union1d(xgrid, pine.x_grid())
    xi = np.arange(len(xgrid))
    protected = False

    # Process the shifts and normalizations (if any),
    # shifts is a dictionary with {fktable_name: shift_value}
    # normalization instead {fktable_name: normalization to apply}
    # since this parser doesn't know about operations, we need to convert it to a list
    # then we just iterate over the fktables and apply the shift in the right order
    shifts = fkspec.metadata.shifts
    normalization_per_fktable = fkspec.metadata.normalization
    fknames = [i.name.replace(f".{EXT}", "") for i in fkspec.fkpath]
    if cfactors is not None:
        cfactors = dict(zip(fknames, cfactors))

    # fktables in pineapplgrid are for obs = fk * f while previous fktables were obs = fk * xf
    # prepare the grid all tables will be divided by
    if hadronic:
        xdivision = np.prod(np.meshgrid(xgrid, xgrid), axis=0)
    else:
        xdivision = xgrid[:, np.newaxis]

    partial_fktables = []
    ndata = 0
    for fkname, p in zip(fknames, pines):
        # Start by reading possible cfactors if cfactor is not empty
        cfprod = 1.0
        if cfactors is not None:
            for cfac in cfactors.get(fkname, []):
                cfprod *= cfac.central_value

        # Read the table, remove bin normalization and apply cfactors
        raw_fktable = (cfprod * p.table().T / p.bin_normalizations()).T
        # If it is a DIS FK table then we need to expand the dimension
        if not hadronic:
            raw_fktable = np.expand_dims(raw_fktable, axis=-1)
        n = raw_fktable.shape[0]

        # Apply possible per-fktable fixes
        if shifts is not None:
            ndata += shifts.get(fkname, 0)

        if normalization_per_fktable is not None:
            raw_fktable = raw_fktable * normalization_per_fktable.get(fkname, 1.0)

        # Add empty points to ensure that all fktables share the same x-grid upon convolution
        missing_x_points = np.setdiff1d(xgrid, p.x_grid(), assume_unique=True)
        for x_point in missing_x_points:
            miss_index = list(xgrid).index(x_point)
            raw_fktable = np.insert(raw_fktable, miss_index, 0.0, axis=2)
            if hadronic:
                raw_fktable = np.insert(raw_fktable, miss_index, 0.0, axis=3)
        # Check conversion factors and remove the x* from the fktable
        raw_fktable *= fkspec.metadata.conversion_factor / xdivision

        # Create the multi-index for the dataframe
        # for optimized pineappls different grids can potentially have different indices
        # so they need to be indexed separately and then concatenated only at the end
        lumi_columns = _pinelumi_to_columns(p.channels(), hadronic)
        lf = len(lumi_columns)
        data_idx = np.arange(ndata, ndata + n)
        if hadronic:
            idx = pd.MultiIndex.from_product([data_idx, xi, xi], names=["data", "x1", "x2"])
        else:
            idx = pd.MultiIndex.from_product([data_idx, xi], names=["data", "x"])

        # Now concatenate (data, x1, x2) and move the flavours to the columns
        df_fktable = raw_fktable.swapaxes(0, 1).reshape(lf, -1).T
        partial_fktables.append(pd.DataFrame(df_fktable, columns=lumi_columns, index=idx))

        ndata += n

    # Finallly concatenate all fktables, sort by flavours and fill any holes
    sigma = pd.concat(partial_fktables, sort=True, copy=False).fillna(0.0)

    # Check whether this is a 1-point normalization fktable and, if that's the case, protect!
    if fkspec.metadata.operation == "RATIO" and len(pines) == 1:
        # it _might_ be, check whether it is the divisor fktable
        divisor = fkspec.metadata.FK_tables[-1][0]
        name = fkspec.fkpath[0].name.replace(f".{EXT}", "")

        if np.allclose(sigma.loc[1:], 0.0):
            # Old denominator fktables were filled with 0s beyond the first point
            # and they needed to be post-processed to repeat the same point many time
            # Instead, drop everything beyond the 1st point (used 0:0 to keep the same kind of df)
            sigma = sigma.loc[0:0]
            ndata = 1

        if ndata == 1:
            # There's no doubt
            protected = divisor == name

    return FKTableData(
        sigma=sigma,
        ndata=ndata,
        Q0=Q0,
        convolution_types=convolution_types,
        metadata=fkspec.metadata,
        hadronic=hadronic,
        xgrid=xgrid,
        protected=protected,
    )