Source code for validphys.paramfits.dataops

"""
dataops.py

This module implements the core functionality of the paramfits package,
currently focused on the αs determination.
It computes various statistics and formats the data in a form suitable to
be consumed by plotting functions.
"""
import logging
import warnings
import functools
from collections import defaultdict

import numpy as np
import pandas as pd
import scipy.stats as stats

from reportengine import collect
from reportengine.floatformatting import format_error_value_columns, ValueErrorTuple, format_number
from reportengine.checks import make_argcheck, CheckError, check_positive, check_not_empty
from reportengine.table import table

from validphys.checks import (
    check_fits_different,
    check_dataspecs_fits_different,
    check_speclabels_different,
)

log = logging.getLogger(__name__)

class StandardSampleWrapper:
    """A class that holds a flat array of data, and has 'location' and
    'scale' properties, which are the mean and the standard deviation.
    The purpose of the class is to explicitly disallow calling np.mean and
    np.std on the result while preserving functional backward compatibility
    with the runcards."""
    def __init__(self, data):
        self.data = data

    @property
    def location(self):
        return np.mean(self.data)

    @property
    def scale(self):
        return np.std(self.data)
class RobustSampleWrapper:
    """Similar to ``StandardSampleWrapper``, but location and scale
    are implemented as the median and the 68% interval respectively."""
    def __init__(self, data):
        self.data = data

    @property
    def location(self):
        return np.median(self.data)

    @property
    def scale(self):
        return (np.diff(np.percentile(self.data, [15.87, 84.13]))).item()/2
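# Illustrative usage sketch (added for exposition, not part of the original
# module; the sample values are invented). Both wrappers expose the same
# ``location``/``scale`` interface, so downstream code can switch between
# standard and robust statistics without changes:
#
#   import numpy as np
#   sample = np.array([0.1175, 0.1180, 0.1182, 0.1179, 0.1300])
#   standard = StandardSampleWrapper(sample)
#   robust = RobustSampleWrapper(sample)
#   standard.location, standard.scale   # mean and standard deviation, pulled by the outlier
#   robust.location, robust.scale       # median and half of the 68% interval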
def get_parabola(asvals, chi2vals):
    """Return the three coefficients of a parabola χ²(αs) given a set of
    asvals and a set of χ² values. Only the finite χ² values are taken into
    account."""
    chi2vals = np.ravel(chi2vals)
    filt = np.isfinite(chi2vals)
    return np.polyfit(np.asarray(asvals)[filt], chi2vals[filt], 2)
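# Illustrative usage sketch (not part of the original module; the numbers are
# invented). ``get_parabola`` returns the highest-order coefficient first, as
# ``np.polyfit`` does, so the vertex of the parabola sits at -b/(2a):
#
#   asvals = [0.114, 0.116, 0.118, 0.120, 0.122]
#   chi2vals = [12.4, 10.9, 10.1, 10.6, 12.0]
#   a, b, c = get_parabola(asvals, chi2vals)
#   best_as = -b / (2 * a)   # only meaningful when a > 0 (convex parabola)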
#TODO: Export the total here. Not having it is causing huge pain elsewhere.
@table
@check_fits_different
def fits_matched_pseudoreplicas_chi2_table(fits, fits_computed_pseudoreplicas_chi2):
    """Collect the chi² of the pseudoreplicas in the fits into a single table,
    grouped by nnfit_id.
    The columns come in two levels: fit name and (total chi², n).
    The indexes also come in two levels: nnfit_id and experiment name."""
    return pd.concat(fits_computed_pseudoreplicas_chi2, axis=1, keys=map(str, fits))
@table
@check_dataspecs_fits_different
def dataspecs_matched_pseudoreplicas_chi2_table(
        dataspecs_fit, dataspecs_computed_pseudoreplicas_chi2):
    """Like ``fits_matched_pseudoreplicas_chi2_table`` but for arbitrary dataspecs."""
    return fits_matched_pseudoreplicas_chi2_table(
        dataspecs_fit, dataspecs_computed_pseudoreplicas_chi2)
@make_argcheck
def _check_badcurves(badcurves):
    options = ['discard', 'minimum', 'allminimum']
    if badcurves not in options:
        raise CheckError(f"badcurves must be one of {options}",
                         badcurves, options)

def _discard_sparse_curves(fits_replica_data_correlated, max_ndiscarded):
    """Return a table like `fits_replica_data_correlated` where the replicas
    with too many discarded points have been filtered out."""
    df = fits_replica_data_correlated
    def ap(x):
        x.columns = x.columns.droplevel(0)
        return x['chi2']
    table = df.groupby(axis=1, level=0).apply(ap)
    filt = table.isnull().sum(axis=1, min_count=1) < max_ndiscarded
    table = table[filt]
    return table, filt

@make_argcheck
def _check_discarded_string(max_ndiscarded):
    arg = max_ndiscarded
    if isinstance(arg, str):
        if arg != 'auto':
            raise CheckError("Expecting string to be 'auto'")
@_check_discarded_string
def discarded_mask(
        fits_replica_data_correlated_for_total,
        fits_as,
        max_ndiscarded:(int,str)='auto',
        autodiscard_confidence_level:float=0.99,
        trim_ndistant:int=0):
    """Return a table like `fits_replica_data_correlated` where the replicas
    with too many discarded points have been filtered out.

    autodiscard_confidence_level is the Student-t confidence level. It is
    normalised to 1 and is only used if max_ndiscarded is set to 'auto'.

    The automated discarding is done by estimating the uncertainty on the
    uncertainty by bootstrapping.

    The function returns a mask to be applied in
    fits_replica_data_with_discarded_replicas."""
    df = fits_replica_data_correlated_for_total[0]

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        estimate = parabolic_as_determination(fits_as, df)

    best_as = estimate.location
    dist_best_as = -np.abs(best_as - fits_as)
    to_remove = np.argpartition(dist_best_as, trim_ndistant)[:trim_ndistant]
    as_mask = np.ones(df.shape[1], dtype=bool)
    as_mask[to_remove] = False

    if isinstance(max_ndiscarded, int):
        return _discard_sparse_curves(df, max_ndiscarded)[1], as_mask
    else:
        best_error = np.inf
        ndiscarded = range(len(fits_as), 0, -1)
        for i in range(len(ndiscarded), 0, -1):
            tablefilt_total, auto_filt = _discard_sparse_curves(df, ndiscarded[i-1])
            least_points = tablefilt_total.notnull().sum(axis=1, min_count=1).min()
            #Number of points that pass the cuts
            size = np.sum(auto_filt)
            #We can only fit a parabola with 3 points.
            #Use a fourth to have in principle some error estimate.
            if least_points > 3:
                #Apply as mask before fitting
                tablefilt_total = tablefilt_total.copy()
                tablefilt_total.iloc[:, ~as_mask] = np.NAN
                parabolas = parabolic_as_determination(fits_as, tablefilt_total).data
                bootstrap_est = np.random.choice(
                    parabolas, (100000, size)).std(axis=1).std()
            else:
                bootstrap_est = np.inf
            stdT = stats.t.ppf((1 - (1 - autodiscard_confidence_level)/2), size - 1)
            current_err = bootstrap_est*stdT
            if current_err < best_error:
                best_error = current_err
                best_filt = auto_filt
        return best_filt, as_mask
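# Sketch of the automated-discarding criterion above (illustrative only, not
# part of the module; ``minima`` is an invented stand-in for the parabolic
# minima obtained for one trial cut). For each trial cut, the spread of the
# bootstrapped standard deviations is scaled by a Student-t quantile and the
# cut minimising that scaled error is kept:
#
#   import numpy as np
#   from scipy import stats
#   minima = np.random.normal(0.118, 0.002, size=50)   # fake parabolic minima
#   size = len(minima)
#   bootstrap_est = np.random.choice(minima, (10000, size)).std(axis=1).std()
#   stdT = stats.t.ppf(1 - (1 - 0.99) / 2, size - 1)
#   scaled_err = bootstrap_est * stdT   # the quantity minimised over trial cuts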
def fits_replica_data_with_discarded_replicas(
        discarded_mask, fits_replica_data_correlated):
    """Apply the mask from ``discarded_mask`` to the dataframes."""
    curve_mask, as_mask = discarded_mask
    discarded_replicas = fits_replica_data_correlated[curve_mask].copy()
    #Set these to NaN instead of masking them away in order to not break
    #all the APIs that match this with fits_as.
    discarded_replicas.iloc[:, ~as_mask] = np.NAN
    return discarded_replicas
def _parabolic_as_minimum_and_coefficient(fits_as,
        fits_replica_data_with_discarded_replicas, badcurves='discard'):
    """This implements the fitting in ``parabolic_as_determination``.
    Returns the locations of the minima and the quadratic coefficients."""
    alphas = fits_as
    table = fits_replica_data_with_discarded_replicas.values
    minimums = []
    quadratic = []
    asarr = np.asarray(alphas)
    for row in table:
        filt = np.isfinite(row)
        if not filt.any():
            continue
        a, b, c = np.polyfit(asarr[filt], row[filt], 2)
        quadratic.append(a)
        if badcurves == 'allminimum':
            minimums.append(asarr[filt][np.argmin(row[filt])])
        elif a > 0:
            minimums.append(-b/2/a)
        elif badcurves == 'discard':
            pass
        elif badcurves == 'minimum':
            minimums.append(asarr[filt][np.argmin(row[filt])])
        else:
            raise RuntimeError("Unknown bad curves.")

    quadratic = np.asarray(quadratic)
    minimums = np.asarray(minimums)
    return minimums, quadratic

def _parabolic_as_determination(fits_as,
        fits_replica_data_with_discarded_replicas, badcurves='discard'):
    return _parabolic_as_minimum_and_coefficient(
        fits_as, fits_replica_data_with_discarded_replicas, badcurves)[0]
def quadratic_as_determination(fits_as,
        fits_replica_data_with_discarded_replicas, badcurves='discard'):
    """Return the quadratic coefficients of the parabolic fits to the
    replica curves."""
    return _parabolic_as_minimum_and_coefficient(
        fits_as, fits_replica_data_with_discarded_replicas, badcurves)[1]
@make_argcheck
def _check_as_transform(as_transform):
    values = (None, 'log', 'exp', 'logshift')
    if not as_transform in values:
        raise CheckError(f"The allowed values for "
                         f"as_transform are {values}",
                         str(as_transform), values[1:])

@make_argcheck
def _check_parabolic_as_statistics(parabolic_as_statistics):
    values = 'standard', 'robust'
    if parabolic_as_statistics not in values:
        raise CheckError('The allowed values for '
                         f'`parabolic_as_statistics` are {values}',
                         parabolic_as_statistics, values)
@_check_badcurves
@_check_as_transform
@_check_parabolic_as_statistics
def parabolic_as_determination(
        fits_as,
        fits_replica_data_with_discarded_replicas,
        badcurves='discard',
        as_transform:(str, type(None))=None,
        parabolic_as_statistics:str='standard'):
    """Return the minima for alpha_s corresponding to the fitted curves.

    ``badcurves`` specifies what to do with concave replicas and can be one
    of 'discard', 'allminimum' (which takes the minimum points for *all* the
    replicas without fitting a parabola) or 'minimum' (which takes the
    minimum value for the concave replicas).

    If ``parabolic_as_statistics`` is ``"standard"``, means and standard
    deviations will be used to compute statistics. Otherwise, if it is
    ``"robust"``, medians and 68% intervals will be used.

    ``as_transform`` can be None, 'log', 'logshift' (``log(1+αs)``) or 'exp'
    and is applied to the as_values and then reversed for the minima.
    """
    if as_transform == 'log':
        fits_as = np.log(fits_as)
    elif as_transform == 'logshift':
        fits_as = np.log(fits_as + 1)
    elif as_transform == 'exp':
        fits_as = np.exp(fits_as)
    minimums = _parabolic_as_determination(
        fits_as, fits_replica_data_with_discarded_replicas, badcurves)
    # Invert the transform
    if as_transform == 'log':
        minimums = np.exp(minimums)
    elif as_transform == 'exp':
        minimums = np.log(minimums)
    elif as_transform == 'logshift':
        # Inverse of log(1+αs)
        minimums = np.exp(minimums) - 1

    if parabolic_as_statistics == 'standard':
        res = StandardSampleWrapper(minimums)
    elif parabolic_as_statistics == 'robust':
        res = RobustSampleWrapper(minimums)
    else:
        raise RuntimeError("Unknown `parabolic_as_statistics`.")
    return res
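# Illustrative usage sketch (not part of the original module; the inputs are
# invented). Rows of the dataframe are correlated replicas and columns are
# the αs values in ``fits_as``:
#
#   import numpy as np
#   import pandas as pd
#   fits_as = [0.114, 0.116, 0.118, 0.120, 0.122]
#   curves = pd.DataFrame(
#       [55 + 1e5 * (np.array(fits_as) - 0.118) ** 2 + np.random.normal(0, 0.1, 5)
#        for _ in range(100)],
#       columns=fits_as)
#   det = parabolic_as_determination(fits_as, curves, parabolic_as_statistics='robust')
#   det.location, det.scale   # median and 68% half-interval of the minima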
def as_central_parabola(fits_as, fits_total_chi2):
    """Return the coefficients of the parabolic fit of the central (total)
    χ² as a function of αs."""
    return get_parabola(fits_as, fits_total_chi2)
as_datasets_central_parabolas = collect(
    'as_central_parabola', ['fits_central_chi2_by_dataset_item'])

central_by_dataset_suptitle = collect(
    'suptitle', ['fits_central_chi2_by_dataset_item'])

dataspecs_central_by_dataset_suptitle = collect(
    'central_by_dataset_suptitle', ['dataspecs'])

central_by_dataset_ndata = collect(
    'ndata', ['fits_central_chi2_by_dataset_item',])

dataspecs_central_by_dataset_ndata = collect(
    'central_by_dataset_ndata', ['dataspecs'])

by_dataset_suptitle = collect(
    'suptitle', ['fits_matched_pseudoreplicas_chi2_by_dataset_item',])

dataspecs_dataset_suptitle = collect('by_dataset_suptitle', ['dataspecs'])

by_dataset_ndata = collect(
    'ndata', ['fits_matched_pseudoreplicas_chi2_by_dataset_item',])

dataspecs_dataset_ndata = collect('by_dataset_ndata', ['dataspecs'])
@table
def derivative_dispersion_table(
        as_datasets_central_parabolas,
        fits_as,
        central_by_dataset_suptitle,
        as_determination_from_central_chi2_for_total):
    best_as = np.ravel(as_determination_from_central_chi2_for_total)[0]
    d = {}
    for label, p in zip(central_by_dataset_suptitle, as_datasets_central_parabolas):
        d[label] = np.polyval(np.polyder(p), best_as)
    s = pd.Series(d)
    s['SUM'] = np.sum(s)
    s['SUM QUADRATURE'] = np.sqrt(np.sum(s**2))
    res = pd.DataFrame(s, columns=['Derivative'])
    return res
dataspecs_as_central_parabolas = collect('as_datasets_central_parabolas', ['dataspecs'])
def dataspecs_as_central_parabolas_map(
        dataspecs_speclabel,
        dataspecs_as_central_parabolas,
        dataspecs_central_by_dataset_suptitle,
        dataspecs_central_by_dataset_ndata):
    """Return a dict-like datastructure with the central χ² parabolas of the
    form:

        d[dataset_name][dataspec] = parabola_coefficients/ndata

    for all dataset items and dataspecs.
    """
    res = defaultdict(dict)
    for label, parabolas, dsnames, ndatas in zip(
            dataspecs_speclabel,
            dataspecs_as_central_parabolas,
            dataspecs_central_by_dataset_suptitle,
            dataspecs_central_by_dataset_ndata):
        for parabola, dsname, ndata in zip(parabolas, dsnames, ndatas):
            res[dsname][label] = parabola/np.asarray(ndata)
    return res
def _aic(residuals, n, k):
    """Corrected Akaike information criterion (up to constant terms) for a
    least-squares fit: 2k + n·log(RSS) + 2k(k+1)/(n-k-1)."""
    return 2*k + n*np.log(residuals) + 2*k*(k+1)/(n-k-1)
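# Worked illustration of the helper above (invented numbers, for exposition
# only): with n = 21 points, a quadratic fit (k = 4) with residual sum of
# squares 1.8 scores 2*4 + 21*log(1.8) + 2*4*5/(21-4-1) ≈ 22.8; a cubic fit
# (k = 5) must reduce the residuals enough to offset its extra parameter
# before its score drops below that.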
@table
def compare_aic(fits_as, fits_replica_data_with_discarded_replicas, suptitle):
    """Compare the Akaike information criterion (AIC) for a parabolic and a
    cubic fit. Note that this does **not** yield the actual AIC score, but
    only the piece necessary to compare least-squares fits (i.e. assuming iid
    gaussian noise for all points). This is:

        2*k + n*log(sum(residuals squared))

    The mean and standard deviation are taken across curves. Note that this
    always uses the *discard* criterion: that is, it ignores the curves that
    have no minimum."""
    alphas = fits_as
    asarr = np.asarray(alphas)
    aic2s = []
    aic3s = []
    table = fits_replica_data_with_discarded_replicas.values
    for row in table:
        filt = np.isfinite(row)
        asfilt = asarr[filt]
        rowfilt = row[filt]
        n = len(rowfilt)
        p2, res2, *stuff = np.polyfit(asfilt, rowfilt, 2, full=True)
        if p2[0] <= 0:
            pass
            #log.warning(f"Concave parabola computing AIC in {suptitle}")
        else:
            aic2 = _aic(res2, n, k=4)
            aic2s.append(aic2)

        p3, res3, *stuff = np.polyfit(asfilt, rowfilt, 3, full=True)
        extrema = np.roots(np.polyder(p3))
        #Cast away the zero complex part
        candidates = np.real(extrema[np.isreal(extrema)])
        if not len(candidates):
            pass
            #log.warning(f"Bad cubic minimum computing AIC in {suptitle}")
        else:
            aic3 = _aic(res3, n, k=5)
            aic3s.append(aic3)

    v2, e2 = np.mean(aic2s), np.std(aic2s)
    v3, e3 = np.mean(aic3s), np.std(aic3s)
    qp = "Quadratic polynomial"
    cp = "Cubic polynomial"
    df = pd.DataFrame(
        {'mean': {qp: v2, cp: v3},
         'error': {qp: e2, cp: e3},
         'n minima': {qp: len(aic2s), cp: len(aic3s)}},
        columns=['mean', 'error', 'n minima'])
    format_error_value_columns(df, 'mean', 'error', inplace=True)
    return df
def as_determination_from_central_chi2(fits_as, fits_total_chi2):
    """Return the alpha_s from the minimum chi² and the Delta_chi²=1 error
    from a quadratic fit to the total chi²."""
    alphas = fits_as
    chi2s = np.ravel(fits_total_chi2)
    a, b, c = np.polyfit(alphas, chi2s, 2)
    if a <= 0:
        log.error("Found non convex parabola when computing the quadratic fit.")
        return np.nan, np.nan
    return ValueErrorTuple(-b/(2*a), 1/(np.sqrt(a)))
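# Note on the error above (exposition only): writing the fitted parabola as
# χ²(α) = χ²_min + a (α - α_min)², the Δχ² = 1 condition gives
# a (α - α_min)² = 1, i.e. an uncertainty of 1/sqrt(a), which is what
# ``ValueErrorTuple(-b/(2*a), 1/np.sqrt(a))`` encodes.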
def parabolic_as_determination_with_tag(parabolic_as_determination, suptitle):
    """Convenience function to collect the arguments together. It is an
    identity."""
    return parabolic_as_determination, suptitle
def quadratic_as_determination_with_tag(quadratic_as_determination, suptitle):
    """Convenience function to collect the arguments together. It is an
    identity."""
    return quadratic_as_determination, suptitle
def as_determination_from_central_chi2_with_tag(
        as_determination_from_central_chi2, suptitle):
    """Convenience function to collect the arguments together. It is an
    identity."""
    return as_determination_from_central_chi2, suptitle
as_datasets_pseudoreplicas_chi2 = collect(
    parabolic_as_determination_with_tag,
    ['fits_matched_pseudoreplicas_chi2_by_dataset_item',])

as_datasets_central_chi2 = collect(
    as_determination_from_central_chi2_with_tag,
    ['fits_central_chi2_by_dataset_item'])

parabolic_as_determination_for_total = collect(
    parabolic_as_determination, ['matched_pseudoreplicas_for_total'])

as_determination_from_central_chi2_for_total = collect(
    as_determination_from_central_chi2, ['fits_central_chi2_for_total'])

quadratic_datasets_pseudoreplicas_chi2 = collect(
    quadratic_as_determination_with_tag,
    ['fits_matched_pseudoreplicas_chi2_by_dataset_item',])

dataspecs_parabolic_as_determination_for_total = collect(
    'parabolic_as_determination_for_total', ['dataspecs'])
@check_positive('nresamplings')
def bootstrapping_stats_error(parabolic_as_determination,
                              nresamplings:int=100000, suptitle=""):
    """Compute the bootstrapping uncertainty of the distribution of
    determinations of as, by resampling the list of points with replacement
    from the original sampling distribution `nresamplings` times and then
    computing the standard deviation of the means."""
    distribution = parabolic_as_determination.data
    shape = (nresamplings, len(distribution))
    if not len(distribution):
        log.error("Cannot compute stats error. Empty data.")
        return np.nan
    return np.random.choice(distribution, shape).mean(axis=1).std()
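# Illustrative sketch of the resampling above (not part of the module; the
# sample is invented): a distribution of N minima is resampled N at a time
# with replacement, and the spread of the resulting means estimates the
# statistical error on the central value:
#
#   import numpy as np
#   sample = StandardSampleWrapper(np.random.normal(0.118, 0.002, size=200))
#   err = bootstrapping_stats_error(sample, nresamplings=10000)
#   # err should be close to 0.002 / sqrt(200) ≈ 1.4e-4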
@check_positive('nresamplings')
def bootstrapping_stats_error_on_the_error(parabolic_as_determination,
                                           nresamplings:int=100000, suptitle=""):
    """Compute the bootstrapping uncertainty of the standard deviation of the
    parabolic determination."""
    distribution = parabolic_as_determination.data
    shape = (nresamplings, len(distribution))
    if not len(distribution):
        log.error("Cannot compute stats error. Empty data.")
        return np.nan
    return np.random.choice(distribution, shape).std(axis=1).std()
@check_positive('nresamplings')
def half_sample_stats_error(parabolic_as_determination, nresamplings:int=100000):
    """Like the bootstrapping error, but using only half of the data."""
    sample = parabolic_as_determination.data
    distribution = sample[:len(sample)//2]
    if not len(distribution):
        log.error("Cannot compute half stats. Too few data")
        return np.nan
    shape = (nresamplings, len(distribution))
    return np.random.choice(distribution, shape).mean(axis=1).std()
as_datasets_bootstrapping_stats_error = collect(
    bootstrapping_stats_error,
    ['fits_matched_pseudoreplicas_chi2_by_dataset_item',])

as_datasets_bootstrapping_stats_error_on_the_error = collect(
    bootstrapping_stats_error_on_the_error,
    ['fits_matched_pseudoreplicas_chi2_by_dataset_item',])

as_datasets_half_sample_stats_error = collect(
    half_sample_stats_error,
    ['fits_matched_pseudoreplicas_chi2_by_dataset_item',])

#Don't write complicated column names everywhere
ps_mean = "pseudoreplica mean"
ps_error = "pseudoreplica error"
ps_stat_error = "pseudoreplica stat"
ps_half_stat_error = "pseudoreplica halfstat"
stats_ratio = r"$\frac{halfstat}{stat}/\sqrt 2$"
n = 'n'
stats_halfone = "cv selecting one half of the replicas"
err_halfone = "err selecting one half of the replicas"
stats_halfother = "cv selecting other half of the replicas"
err_halfonother = "err selecting other half of the replicas"
stats_err_err = "Stat error on the error"

ps_cols = (ps_mean, ps_error, n, ps_stat_error, ps_half_stat_error,
           stats_halfone, err_halfone, stats_halfother, err_halfonother,
           stats_err_err)

cv_mean = "central mean"
cv_error = "central error"
def pseudoreplicas_stats_error(
        as_datasets_pseudoreplicas_chi2,
        as_datasets_bootstrapping_stats_error,
        as_datasets_bootstrapping_stats_error_on_the_error,
        as_datasets_half_sample_stats_error):
    """Return a dictionary (easily convertible to a DataFrame) with the mean,
    error and the measures of statistical error for each dataset."""
    d = defaultdict(dict)
    for (distribution, tag), statserr, staterrerr, halfstaterr in zip(
            as_datasets_pseudoreplicas_chi2,
            as_datasets_bootstrapping_stats_error,
            as_datasets_bootstrapping_stats_error_on_the_error,
            as_datasets_half_sample_stats_error):
        d[ps_mean][tag] = distribution.location
        d[n][tag] = len(distribution.data)
        d[ps_error][tag] = distribution.scale
        d[ps_stat_error][tag] = statserr
        d[ps_half_stat_error][tag] = halfstaterr
        d[stats_ratio][tag] = halfstaterr/statserr/np.sqrt(2)
        d[stats_err_err][tag] = staterrerr

        ldh = len(distribution.data)//2
        onehalf = distribution.data[:ldh]
        otherhalf = distribution.data[ldh:]
        d[stats_halfone][tag] = np.mean(onehalf)
        d[err_halfone][tag] = np.std(onehalf)
        d[stats_halfother][tag] = np.mean(otherhalf)
        d[err_halfonother][tag] = np.std(otherhalf)
    return dict(d)
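# Shape of the returned mapping (exposition only; the dataset names and
# numbers are invented): the outer keys are the column labels defined above
# and the inner keys are the dataset tags, e.g.
#
#   {
#       "pseudoreplica mean":  {"NMC": 0.1181, "CMSJETS11": 0.1175, ...},
#       "pseudoreplica error": {"NMC": 0.0021, "CMSJETS11": 0.0034, ...},
#       ...
#   }
#
# so ``pd.DataFrame(result)`` yields one row per dataset and one column per
# statistic.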
dataspecs_pseudoreplica_stats_error = collect(
    pseudoreplicas_stats_error, ['dataspecs'])

#TODO: This is deprecated FAPP
@make_argcheck
def check_dataset_items(dataset_items, dataspecs_dataset_suptitle):
    """Check that the dataset_items are legit."""
    if dataset_items is None:
        return
    try:
        s = set(dataset_items)
    except Exception as e:
        raise CheckError(f'dataset_items must be a list of strings: {e}') from e
    flat = [item for l in dataspecs_dataset_suptitle for item in l]
    d = s - set(flat)
    if d:
        raise CheckError(f"The following dataset_items are unrecognized: {d}")
def compare_determinations_table_impl(
        pseudoreplicas_stats_error, as_datasets_central_chi2):
    """Produce a table by experiment comparing the alpha_s determination
    from pseudoreplicas and from central values."""
    #Use this to get the right sorting
    d = defaultdict(dict)
    tags = []
    for (cv, error), tag in as_datasets_central_chi2:
        d[cv_mean][tag] = cv
        d[cv_error][tag] = error
        tags.append(tag)

    d.update(pseudoreplicas_stats_error)

    df = pd.DataFrame(d, columns=[*ps_cols, cv_mean, cv_error])
    df = df.loc[tags]
    return df
@table
@check_speclabels_different
def dataspecs_stats_error_table(
        dataspecs_pseudoreplica_stats_error,
        dataspecs_dataset_suptitle,
        dataspecs_speclabel,
        dataset_items:(type(None), list) = None,
        ):
    """Return a table with the stats errors of the pseudoreplica
    determination of each dataspec."""
    dfs = []
    for d in dataspecs_pseudoreplica_stats_error:
        df = pd.DataFrame(d, columns=ps_cols)
        format_error_value_columns(df, ps_mean, ps_error)
        format_error_value_columns(df, stats_halfone, err_halfone)
        format_error_value_columns(df, stats_halfother, err_halfonother)
        dfs.append(df)
    table = pd.concat(dfs, axis=1, keys=dataspecs_speclabel)
    if dataset_items is not None:
        table = table.loc[dataset_items]
    return table
@table
def dataspecs_stats_error_table_transposed(dataspecs_stats_error_table):
    """Transposed version of dataspecs_stats_error_table for display
    purposes."""
    return dataspecs_stats_error_table.T
@table
def compare_determinations_table(compare_determinations_table_impl):
    """Return ``compare_determinations_table_impl`` formatted nicely."""
    df = compare_determinations_table_impl
    format_error_value_columns(df, ps_mean, ps_error, inplace=True)
    format_error_value_columns(df, cv_mean, cv_error, inplace=True)
    stats_cols = {ps_stat_error, ps_half_stat_error, stats_ratio}
    #Don't fail if/when we remove a table from here
    stats_cols &= set(df.columns)
    stats_cols = list(stats_cols)
    digits2 = functools.partial(format_number, digits=2)
    df[stats_cols] = df[stats_cols].applymap(digits2)
    return df
dataspecs_as_datasets_pseudoreplicas_chi2 = collect(
    'as_datasets_pseudoreplicas_chi2', ['dataspecs'])
quad_as_datasets_pseudoreplicas_chi2 = collect(
    'quadratic_datasets_pseudoreplicas_chi2', ['dataspecs'])

#TODO: Deprecate fixup dataset_items earlier
@check_speclabels_different
@check_dataset_items
@table
def dataspecs_ndata_table(
        dataspecs_dataset_suptitle,
        dataspecs_dataset_ndata,
        dataspecs_speclabel,
        dataset_items:(list, type(None))=None):
    """Return a table with the same index as
    dataspecs_as_value_error_table_impl with the number of points per
    dataset."""
    d = {}
    for dslabel, datanames, ndatas in zip(dataspecs_speclabel,
                                          dataspecs_dataset_suptitle,
                                          dataspecs_dataset_ndata):
        d[dslabel] = dict(zip(datanames, ndatas))
    df = pd.DataFrame(d)
    if dataset_items is not None:
        df = df.loc[dataset_items]
    return df
@check_speclabels_different
@check_dataset_items
def dataspecs_quad_table_impl(
        quad_as_datasets_pseudoreplicas_chi2,
        dataspecs_speclabel,
        dataspecs_dataset_suptitle,
        dataset_items:(list, type(None)) = None,
        display_n:bool = False,
        ):
    """Return a table with the mean and error of the quadratic coefficient of
    the parabolic determinations across dataspecs. If display_n is True, a
    column showing the number of points will be added to the table."""
    tables = []
    taglist = {}
    if display_n:
        cols = ['mean', 'error', 'n']
    else:
        cols = ['mean', 'error']
    for dets in quad_as_datasets_pseudoreplicas_chi2:
        d = defaultdict(dict)
        for distribution, tag in dets:
            d['mean'][tag] = np.mean(distribution)
            d['error'][tag] = np.std(distribution)
            if display_n:
                d['n'][tag] = len(distribution)
            taglist[tag] = None
        tables.append(pd.DataFrame(d, columns=cols))

    df = pd.concat(tables, axis=1, keys=dataspecs_speclabel)
    if dataset_items is None:
        ordered_keys = list(taglist)
    else:
        ordered_keys = dataset_items
    df = df.loc[ordered_keys]
    return df
@check_speclabels_different
@check_dataset_items
def dataspecs_as_value_error_table_impl(
        dataspecs_as_datasets_pseudoreplicas_chi2,
        dataspecs_speclabel,
        dataspecs_dataset_suptitle,
        dataset_items:(list, type(None)) = None,
        display_n:bool = False,
        ):
    """Return a table with the mean and error of the as determinations across
    dataspecs. If display_n is True, a column showing the number of points
    will be added to the table."""
    tables = []
    #Use the fact that in py3.6 a dict with None values is like an ordered set
    #TODO: A better way to build the dataframe?
    taglist = {}
    if display_n:
        cols = ['mean', 'error', 'n']
    else:
        cols = ['mean', 'error']
    for dets in dataspecs_as_datasets_pseudoreplicas_chi2:
        d = defaultdict(dict)
        for distribution, tag in dets:
            d['mean'][tag] = distribution.location
            d['error'][tag] = distribution.scale
            if display_n:
                d['n'][tag] = len(distribution.data)
            taglist[tag] = None
        tables.append(pd.DataFrame(d, columns=cols))

    df = pd.concat(tables, axis=1, keys=dataspecs_speclabel)
    if dataset_items is None:
        ordered_keys = list(taglist)
    else:
        ordered_keys = dataset_items
    df = df.loc[ordered_keys]
    return df
@table
def dataspecs_as_value_error_table(dataspecs_as_value_error_table_impl):
    """Return ``dataspecs_as_value_error_table_impl`` formatted nicely."""
    def f(x):
        return format_error_value_columns(x, x.columns[0], x.columns[1])
    return dataspecs_as_value_error_table_impl.groupby(level=0, axis=1).apply(f)
@table
def dataspecs_as_value_error_table_transposed(dataspecs_as_value_error_table):
    """Transposed version of ``dataspecs_as_value_error_table``.
    Useful for printing."""
    return dataspecs_as_value_error_table.T
@table
def dataspecs_quad_value_error_table(dataspecs_quad_table_impl):
    """Return ``dataspecs_quad_table_impl`` formatted nicely."""
    def f(x):
        return format_error_value_columns(x, x.columns[0], x.columns[1])
    return dataspecs_quad_table_impl.groupby(level=0, axis=1).apply(f)
dataspecs_fits_as = collect('fits_as', ['dataspecs'])

by_dataset_as_chi2 = collect(
    fits_replica_data_with_discarded_replicas,
    ['fits_matched_pseudoreplicas_chi2_by_dataset_item',])

dataspecs_fits_replica_data_with_discarded_replicas = collect(
    'by_dataset_as_chi2', ['dataspecs'])
@check_not_empty('dataspecs_dataset_suptitle')
def dataspecs_chi2_by_dataset_dict(dataspecs_dataset_suptitle,
                                   dataspecs_fits_replica_data_with_discarded_replicas,
                                   dataspecs_fits_as,
                                   ):
    """Return a table-like dict with the suptitle: [<list of tables>]
    where each table is ``fits_replica_data_with_discarded_replicas``
    resolved for the given dataset in each of the dataspecs.
    """
    allkeys = set(dataspecs_dataset_suptitle[0])
    for newkeys in dataspecs_dataset_suptitle[1:]:
        newkeys = set(newkeys)
        symdiff = newkeys ^ allkeys
        if symdiff:
            log.warning(f"Some datasets are not "
                        f"common across all dataspecs {symdiff}")
            allkeys |= symdiff

    res = defaultdict(list)
    for keys, values, asvals in zip(dataspecs_dataset_suptitle,
                                    dataspecs_fits_replica_data_with_discarded_replicas,
                                    dataspecs_fits_as):
        for k, v in zip(keys, values):
            v.columns = asvals
            res[k].append(v)
        for k in allkeys - set(keys):
            res[k].append(None)
    return res
as_dataset_pseudodata = collect(
    fits_replica_data_with_discarded_replicas,
    ['fits_matched_pseudoreplicas_chi2_by_dataset_item',])
@table
def as_parabolic_coefficient_table(
        fits_as, by_dataset_suptitle, as_dataset_pseudodata):
    """Return a table of the parabolic fit of each dataset item, for each
    correlated replica. The index is the correlated_replica index and there
    are four columns for each dataset: 'a', 'b' and 'c' corresponding to the
    parabolic coefficients and 'min', which is ``-b/2/a`` if 'a' is positive,
    and NaN otherwise."""
    alphas = np.asarray(fits_as)
    tb_polys = []
    #Easier to repeat the polyfit code here than to change the format of the
    #various outputs.
    for tb in as_dataset_pseudodata:
        polys = []
        for row in np.asarray(tb):
            filt = np.isfinite(row)
            if not filt.any():
                polys.append(np.array([np.nan]*3))
                continue
            poly = np.polyfit(alphas[filt], row[filt], 2)
            polys.append(poly)
        frame = pd.DataFrame(polys, index=tb.index, columns=['a', 'b', 'c'])
        frame['min'] = -frame['b']/2/frame['a']
        frame.loc[frame['a'] < 0, 'min'] = np.nan
        tb_polys.append(frame)
    final_table = pd.concat(tb_polys, axis=1, keys=by_dataset_suptitle)
    return final_table
# Define aliases for functions with spelling mistakes in their names which
# have now been corrected.
# Do this so that old runcards still work.
fits_matched_pseudorreplicas_chi2_table = fits_matched_pseudoreplicas_chi2_table
dataspecs_matched_pseudorreplicas_chi2_table = dataspecs_matched_pseudoreplicas_chi2_table
as_datasets_pseudorreplicas_chi2 = as_datasets_pseudoreplicas_chi2
quadratic_datasets_pseudorreplicas_chi2 = quadratic_datasets_pseudoreplicas_chi2
pseudorreplicas_stats_error = pseudoreplicas_stats_error
datasepecs_pseudorreplica_stats_error = dataspecs_pseudoreplica_stats_error
dataspecs_as_datasets_pseudorreplicas_chi2 = dataspecs_as_datasets_pseudoreplicas_chi2
quad_as_datasets_pseudorreplicas_chi2 = quad_as_datasets_pseudoreplicas_chi2
datasepecs_quad_table_impl = dataspecs_quad_table_impl
datasepecs_as_value_error_table_impl = dataspecs_as_value_error_table_impl