"""
multiclosure_output
Module containing the actions which produce some output in validphys
reports i.e figures or tables for multiclosure estimators in the space of
data.
"""
import numpy as np
import pandas as pd
import scipy.special as special
from scipy.stats import norm
from reportengine.figure import figure, figuregen
from reportengine.table import table
from validphys import plotutils


@table
def table_bias_datasets(bias_datasets, each_dataset):
"""
Compute the bias and sqrt bias and associated errors for each dataset
and return a DataFrame with the results.
Parameters
----------
bias_datasets: list
List of tuples containing the values of bias for each dataset.
each_dataset: list
List of validphys.core.DataSetSpec
Returns
-------
pd.DataFrame
DataFrame containing the bias, variance, ratio and sqrt(ratio) for each dataset
"""
records = []
for (biases, n_comp), ds in zip(bias_datasets, each_dataset):
bias = np.mean(biases)
std_bias = np.std(biases)
sqrt_bias = np.sqrt(bias)
# use gaussian uncertainty propagation
err_sqrt_bias = 0.5 * std_bias / sqrt_bias
records.append(
{
"dataset": str(ds),
"dof": n_comp,
"bias": bias,
"err bias": std_bias,
"sqrt bias": sqrt_bias,
"err sqrt bias": err_sqrt_bias,
}
)
df = pd.DataFrame.from_records(
records,
index="dataset",
columns=("dataset", "dof", "bias", "err bias", "sqrt bias", "err sqrt bias"),
)
return df
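

# A minimal sketch (not a validphys action) of the Gaussian error propagation used
# above: for b = mean(biases) with uncertainty s = std(biases), the uncertainty on
# sqrt(b) is |d sqrt(b)/db| * s = s / (2 * sqrt(b)).  The helper name is purely
# illustrative and is not part of the public API.
def _sqrt_bias_with_error(biases):
    """Return (sqrt of the mean bias, propagated uncertainty) for an array of bias values."""
    bias = np.mean(biases)
    std_bias = np.std(biases)
    sqrt_bias = np.sqrt(bias)
    return sqrt_bias, 0.5 * std_bias / sqrt_bias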


@table
def table_bias_data(bias_data):
"""
Same as table_bias_datasets but for all the data, meaning that
the correlations between the datasets are taken into account.
Parameters
----------
bias_data: list
Same of bias_dataset but for all the data
Returns
-------
pd.DataFrame
DataFrame containing the bias, variance, ratio and sqrt(ratio) for each dataset
"""
records = []
biases_tot, n_comp_tot = bias_data
bias_tot = np.mean(biases_tot)
std_bias_tot = np.std(biases_tot)
sqrt_bias_tot = np.sqrt(bias_tot)
# use gaussian uncertainty propagation
err_sqrt_bias_tot = 0.5 * std_bias_tot / sqrt_bias_tot
records.append(
{
"dataset": "Total",
"dof": n_comp_tot,
"bias": bias_tot,
"err bias": std_bias_tot,
"sqrt bias": sqrt_bias_tot,
"err sqrt bias": err_sqrt_bias_tot,
}
)
df = pd.DataFrame.from_records(
records,
index="dataset",
columns=("dataset", "dof", "bias", "err bias", "sqrt bias", "err sqrt bias"),
)
return df
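

# Since table_bias_datasets and table_bias_data produce tables with identical
# columns, a report may want to present them as a single table.  A minimal sketch
# of how the two could be stacked (illustrative helper, not an existing action):
def _stack_bias_tables(per_dataset_table, total_table):
    """Concatenate the per-dataset bias table with the total one."""
    return pd.concat([per_dataset_table, total_table])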


@table
def bootstrapped_table_bias_datasets(bootstrapped_bias_datasets):
"""
Compute the bias, variance, ratio and sqrt(ratio) for each dataset
and return a DataFrame with the results.
Uncertainty on ratio and sqrt ratio is computed by Gaussian error propagation
of the bootstrap uncertainty on bias and variance.
"""
records = []
for boot_ds in bootstrapped_bias_datasets:
df = boot_ds
mean_bias = df["bias"].mean()
        # bootstrap uncertainty on the bias: standard deviation over the bootstrap samples
        bootstrap_unc = np.std(df["bias"])
        sqrt_bias = np.mean(np.sqrt(df["bias"]))
        # bootstrap uncertainty on sqrt(bias), again as the standard deviation over samples
        bootstrap_unc_sqrt = np.std(np.sqrt(df["bias"]))
records.append(
{
"dataset": df["dataset"].iloc[0],
"mean_dof": df.n_comp.mean(),
"bias": mean_bias,
"err bias": bootstrap_unc,
"sqrt bias": sqrt_bias,
"err sqrt bias": bootstrap_unc_sqrt,
}
)
df = pd.DataFrame.from_records(
records,
index="dataset",
columns=("dataset", "mean_dof", "bias", "err bias", "sqrt bias", "err sqrt bias"),
)
return df
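

# Minimal sketch of the bootstrap estimate used above, assuming a hypothetical
# DataFrame with one row per bootstrap sample and a "bias" column: the central
# values are means over the samples and the uncertainties are the standard
# deviations over the samples, for both bias and sqrt(bias).
def _bootstrap_bias_summary(boot_df):
    """Return (bias, err_bias, sqrt_bias, err_sqrt_bias) from bootstrap samples."""
    bias_samples = boot_df["bias"].to_numpy()
    return (
        bias_samples.mean(),
        bias_samples.std(),
        np.sqrt(bias_samples).mean(),
        np.sqrt(bias_samples).std(),
    )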


@table
def bootstrapped_table_bias_data(bootstrapped_bias_data):
"""
Compute the bias, sqrt bias and their bootstrap errors for a DataGroup
and return a DataFrame with the results.
"""
records = []
df = bootstrapped_bias_data
bias = df["bias"].mean()
sqrt_bias = np.mean(np.sqrt(df["bias"].values))
boot_err_bias = np.std(df["bias"])
boot_err_sqrt_bias = np.std(np.sqrt(df["bias"]))
records.append(
{
"dataset": "Total",
"mean_dof": df.n_comp.mean(),
"bias": bias,
"err bias": boot_err_bias,
"sqrt bias": sqrt_bias,
"err sqrt bias": boot_err_sqrt_bias,
}
)
df = pd.DataFrame.from_records(
records,
index="dataset",
columns=("dataset", "mean_dof", "bias", "err bias", "sqrt bias", "err sqrt bias"),
)
return df


@figure
def xi_delta_histogram(normalized_delta_bias_data, title, lambda_value, label_hist=None):
"""
Plot histogram of normalized delta regularized with PCA.
Parameters
----------
normalized_delta_bias_data: tuple
label_hist: str
summary description of multiclosure
Returns
-------
fig
Figure object
"""
fig, ax = plotutils.subplots()
size = np.shape(normalized_delta_bias_data[0])[0]
ax.hist(normalized_delta_bias_data[0], density=True, bins=int(np.sqrt(size)), label=label_hist)
ax.set_title(title + r", $\lambda:$" + f"{lambda_value}")
ax.set_xlabel(r"$\delta$")
ax.set_ylabel("Density")
x = np.linspace(-3, 3, 100)
y = norm.pdf(x, loc=0, scale=1)
    ax.plot(x, y, label="Standard Gaussian")
ax.legend()
return fig
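

# The overlaid standard Gaussian encodes the expectation that, for a faithful
# closure test, the normalized deltas are distributed as N(0, 1).  A quick numerical
# cross-check of the same information shown by the histogram (illustrative helper,
# not an existing action) is the sample mean and standard deviation of the deltas:
def _delta_moments(normalized_delta_bias_data):
    """Return the sample mean and standard deviation of the normalized deltas."""
    deltas = np.asarray(normalized_delta_bias_data[0])
    return deltas.mean(), deltas.std()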


@table
def table_xi_indicator_function_data(bootstrapped_indicator_function_data):
"""
Computes the bootstrap average and std of the indicator function for the data.
Parameters
----------
bootstrapped_indicator_function_data: tuple
Returns
-------
pd.DataFrame
DataFrame containing the average and std of the indicator function for the data.
"""
indicator_list, mean_dof = bootstrapped_indicator_function_data
# average over data and fits within the bootstrap samples
mean_boot_vals = np.array([np.mean(boot_val) for boot_val in indicator_list])
# take bootstrap expectation and variance
mean_xi = np.mean(mean_boot_vals)
std_xi = np.std(mean_boot_vals)
records = [dict(data="full data", mean_dof=mean_dof, mean_xi=mean_xi, std_xi=std_xi)]
df = pd.DataFrame.from_records(
records, index="data", columns=("data", "mean_dof", "mean_xi", "std_xi")
)
return df
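

# For reference: if the indicator function tests whether the underlying law falls
# inside a 1-sigma band (an assumption here, since this action only receives the
# bootstrapped values), the expectation for perfectly faithful Gaussian
# uncertainties is erf(1/sqrt(2)) ~ 0.68.
_EXPECTED_XI_1SIGMA = special.erf(1 / np.sqrt(2))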


@figuregen
def plot_xq2_data_prcs_maps(xq2_data_map, each_dataset):
"""
Heat map of the ratio bias variance and xi quantile estimator for each datapoint
in each dataset.
Parameters
----------
xq2_data_map: dictionary
dictionary containing:
- x coordinate
- Q**2 coordinate
- Ratio bias-variance
- xi
each_dataset: list
Yields
------
figure
"""
keys = ["R_b", "xi"]
for j, elem in enumerate(xq2_data_map):
for k in keys:
if k == "R_b":
title = r"$R_{b}$"
if k == "xi":
title = r"$\xi$"
fig, ax = plotutils.subplots()
im = ax.scatter(
elem['x_coords'], elem['Q_coords'], c=(np.asarray(elem[k])), cmap='viridis'
)
fig.colorbar(im, label=title)
            ax.set_xscale('log')
            ax.set_yscale('log')
            ax.set_xlabel(r'$x$')
            ax.set_ylabel(r'$Q^2$')
ax.set_title(each_dataset[j].commondata.metadata.plotting.dataset_label)
yield fig