"""
#tableloader.py
Load from file some of the tables that validphys produces.
Contrary to `validphys.loader` this module consists of functions that take
absolute paths, and return mostly dataframes.
"""
import functools
import logging
import numpy as np
import pandas as pd
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# NOTE: Treating the first column as the index by default (``index_col=0``)
# is not particularly sane, but it keeps backward compatibility with the
# older ``DataFrame.from_csv`` method that was employed previously.
# Because this is a ``functools.partial``, callers can still override
# ``index_col`` (or pass any other ``pd.read_csv`` keyword) per call.
sane_load = functools.partial(pd.read_csv, sep='\t', index_col=0)
[docs]
class TableLoaderError(Exception):
    """Error raised by the functions of the tableloader module."""
[docs]
def parse_data_cv(filename):
    """Read a tab-separated table using its first three columns as the index.

    Useful for reading DataFrames with just one (data) column.

    Parameters
    ----------
    filename : path or file-like
        Anything accepted by ``sane_load`` (i.e. by ``pd.read_csv``).

    Returns
    -------
    pandas.DataFrame
        The table, indexed by a three-level index.
    """
    return sane_load(filename, index_col=[0, 1, 2])
[docs]
def parse_exp_mat(filename):
    """Parse a dump of a matrix like experiments_covmat.

    The file is read with three header rows and three index columns, so both
    the rows and the columns of the result carry a three-level MultiIndex.

    Parameters
    ----------
    filename : path or file-like
        Anything accepted by ``sane_load`` (i.e. by ``pd.read_csv``).

    Returns
    -------
    pandas.DataFrame
        The parsed matrix.
    """
    matrix = sane_load(filename, index_col=[0, 1, 2], header=[0, 1, 2])
    # NOTE(review): ``fixup_header`` is not defined or imported in the visible
    # part of this module -- confirm it is in scope. Presumably it casts
    # column level 2 to ``int`` in place (the return value is not used).
    fixup_header(matrix, 2, int)
    return matrix
# Aliases: the covmat and inverse-covmat loaders reuse ``parse_exp_mat``
# unchanged.
load_experiments_covmat = parse_exp_mat
load_experiments_invcovmat = parse_exp_mat
[docs]
def load_perreplica_chi2_table(filename):
    """Load the output of ``perreplica_chi2_table``.

    The file is read with two header rows and the first column as the index.

    Parameters
    ----------
    filename : path or file-like
        Anything accepted by ``sane_load`` (i.e. by ``pd.read_csv``).

    Returns
    -------
    pandas.DataFrame
        The table, with a two-level column MultiIndex.
    """
    table = sane_load(filename, header=[0, 1], index_col=0)
    # NOTE(review): ``fixup_header`` is not defined or imported in the visible
    # part of this module -- confirm it is in scope. Presumably it casts
    # column level 1 to ``int`` in place (the return value is not used).
    fixup_header(table, 1, int)
    return table
[docs]
def load_fits_chi2_table(filename):
    """Load the result of ``fits_chi2_table`` or similar.

    The file is read with two header rows and two index columns.

    Parameters
    ----------
    filename : path or file-like
        Anything accepted by ``sane_load`` (i.e. by ``pd.read_csv``).

    Returns
    -------
    pandas.DataFrame
        The table, with two-level row and column MultiIndexes.
    """
    read_options = {"header": [0, 1], "index_col": [0, 1]}
    return sane_load(filename, **read_options)
[docs]
def load_adapted_fits_chi2_table(filename):
    """Load the fits_chi2_table and adapt it in the way that suits the
    ``paramfits`` module. That is, return a table with the total chi² and
    another with the number of points.

    Parameters
    ----------
    filename : path or file-like
        Passed unchanged to ``load_fits_chi2_table``.

    Returns
    -------
    ndatas : pandas.Series
        The number of points, taken from the first of the "ndata" columns
        (all of them are checked to agree row by row).
    df : pandas.DataFrame
        One column per level-0 column label of the input table, holding the
        product of the first two columns found under that label. The columns
        are a two-level MultiIndex ``(label, 'chi2')``.

    Raises
    ------
    TableLoaderError
        If the "ndata" columns do not all hold the same value in every row.
    """
    df = load_fits_chi2_table(filename)
    # The second-level label of the first column identifies the "ndata"
    # columns of every level-0 group.
    ndatalabel = df.columns[0][1]
    # Slicing a MultiIndex with IndexSlice requires sorted columns.
    dns = df.sort_index(axis=1).loc[:, pd.IndexSlice[:, ndatalabel]]
    if not (dns.nunique(axis=1) == 1).all():
        raise TableLoaderError("Expecting all entries to have the same ndata")
    ndatas = dns.iloc[:, 0]

    # ``DataFrame.groupby(axis=1)`` is deprecated since pandas 2.1 and removed
    # in pandas 3, so the per-label product is computed with an explicit
    # selection. Labels are visited in sorted order, matching the column
    # order that the grouped version produced.
    top_labels = df.columns.get_level_values(0)
    products = {}
    for label in top_labels.unique().sort_values():
        group = df.loc[:, top_labels == label]
        # First column times second column of the group; presumably
        # ndata * (chi2 / ndata) -- confirm against the table producer.
        products[label] = group.iloc[:, 0] * group.iloc[:, 1]
    df = pd.DataFrame(products, index=df.index)
    df.columns = pd.MultiIndex.from_product([list(df.columns), ['chi2']])
    return ndatas, df
[docs]
def set_actual_column_level0(df, new_levels):
    """Replace, in place, the level-0 labels of the columns of ``df``.

    Note: This is a separate function mostly because it breaks
    in every patch update of pandas.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are a MultiIndex; it is modified in place.
    new_levels : list-like
        New values for level 0 of the column MultiIndex.

    Returns
    -------
    None
    """
    df.columns = df.columns.set_levels(new_levels, level=0)
# TODO: Find a better place for this function
[docs]
def combine_pseudoreplica_tables(
    dfs, combined_names, *, blacklist_datasets=None, min_points_required=2
):
    """Return a table in the same format as perreplica_chi2_table with the
    minimum value of the chi² for each batch of fits.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Tables to combine. Level 0 of their column MultiIndex is relabelled
        in place to ``combined_names``.
    combined_names : list-like
        New labels for level 0 of the columns of every table.
    blacklist_datasets : iterable, optional
        Labels to remove from level 1 of the row index of every table before
        combining. A warning is logged for each table in which none of the
        labels is found.
    min_points_required : int
        Minimum number of non-null totals a row must have within a group for
        its missing values to be treated as "infinitely bad" (``inf``)
        rather than left as NaN.

    Returns
    -------
    pandas.DataFrame
        The selected values, with columns ``(name, 'chi2')``.
    """
    prepared = []
    for df in dfs:
        set_actual_column_level0(df, combined_names)
        if blacklist_datasets:
            keep = ~df.index.get_level_values(1).isin(list(blacklist_datasets))
            if keep.all():
                log.warning(f"Did not blacklist any dataset from the list {blacklist_datasets}")
            else:
                df = df.loc[keep]
        # Collect the (possibly filtered) table: rebinding ``df`` alone does
        # not change ``dfs``, so the filtered tables must be gathered
        # explicitly for the blacklist to have any effect.
        prepared.append(df)

    # Columns gain an outer level holding the position of each input table.
    together = pd.concat(prepared, axis=1, keys=range(len(prepared)))
    # Rows whose level-1 label is 'Total'.
    total = together.loc[(slice(None), 'Total'), :]
    # Sum within each level-3 group; stays NaN when there is no valid value.
    total_chis = total.groupby(level=3).sum(min_count=1)

    def fixup_min_points(df):
        m = (~df.isnull()).sum(axis=1, min_count=1) >= min_points_required
        df[df[m].isnull()] = np.inf
        return df

    # The idea is: Set to inf the nans of the valid curves, so that we select
    # the minimum (which is not infinite). Leave the bad nans as nans, so we
    # write nan always for those.
    # NOTE(review): ``groupby(axis=1)`` is deprecated since pandas 2.1; the
    # calls below are left untouched because the selection logic depends on
    # their exact grouping behaviour.
    total_chis = total_chis.groupby(axis=1, level=1, group_keys=False).apply(fixup_min_points)

    # Note, asarray is needed because it ignores NANs otherwise.
    argmin = lambda x: pd.Series(np.argmin(np.asarray(x), axis=1), index=x.index)

    # For each name (column level 1) and each row, the position of the input
    # table with the smallest total.
    best_replicas = total_chis.groupby(axis=1, level=1).apply(argmin)
    gb = together.groupby(axis=1, level=1)

    def inner_select(df, indexes):
        # ``df.name`` is the level-3 group key; keep the winning column.
        return df.iloc[:, indexes[df.name]]

    def select_best_replicas(df):
        # ``df.name`` is the column-group key (the combined name).
        indexes = best_replicas[df.name]
        return df.groupby(level=3).apply(inner_select, indexes=indexes)

    res = gb.apply(select_best_replicas)
    res.index = res.index.droplevel(0)
    res.sort_index(inplace=True)
    # TODO: Why in earth did I decide to keep this?!
    res.columns = pd.MultiIndex.from_product((res.columns, ['chi2']))
    return res
# Backward-compatible alias under the old, misspelled name
# ("pseudorreplica"), whose spelling has since been corrected.
# It is kept so that old runcards still work.
combine_pseudorreplica_tables = combine_pseudoreplica_tables