Source code for validphys.closuretest.multiclosure_output

"""
multiclosure_output

Module containing the actions which produce some output in validphys
reports, i.e. figures or tables for multiclosure estimators in the space of
data.

"""

import numpy as np
import pandas as pd
import scipy.special as special
import scipy.stats

from reportengine.figure import figure, figuregen
from reportengine.table import table
from validphys import plotutils


@figure
def plot_dataset_fits_bias_variance(fits_dataset_bias_variance, dataset):
    """For a set of closure fits, calculate the bias and variance across fits
    and then plot them as scatter points, so that the distribution of each
    quantity across fits can be seen. The spread of the variance across fits is
    assumed to be small compared to the spread of the biases; deviation from
    this assumption could suggest that there are finite size effects due to too
    few replicas.

    """
    biases, variances, _ = fits_dataset_bias_variance
    fig, ax = plotutils.subplots()
    ax.plot(biases, "*", label=f"bias, std. dev. = {np.std(biases):.2f}")
    ax.axhline(np.mean(biases), label=f"bias, mean = {np.mean(biases):.2f}", linestyle="-")
    ax.plot(variances, ".", label=f"variance, std. dev. = {np.std(variances):.2f}")
    ax.axhline(
        np.mean(variances), label=f"variance, mean = {np.mean(variances):.2f}", linestyle=":"
    )
    ax.set_title(f"Bias and variance for {dataset} for each fit (unnormalised)")
    ax.set_xlabel("fit index")
    ax.legend()
    return fig


@figure
def plot_data_fits_bias_variance(fits_data_bias_variance, data):
    """Like `plot_dataset_fits_bias_variance` but for all data. Can use
    alongside ``group_dataset_inputs_by_experiment`` to plot for each
    experiment.

    """
    return plot_dataset_fits_bias_variance(fits_data_bias_variance, data)


@figure
def plot_total_fits_bias_variance(fits_total_bias_variance):
    """Like `plot_dataset_fits_bias_variance` but for the total bias/variance
    for all data, with the total calculated by summing up contributions from
    each experiment.

    """
    return plot_dataset_fits_bias_variance(fits_total_bias_variance, "all data")


@table
def table_datasets_bias_variance_fits(fits_datasets_bias_variance, each_dataset):
    """Table with the bias/variance ratio and its associated uncertainty,
    computed with simple gaussian error propagation, for each dataset.

    """
    records = []
    # compute expectation and uncertainty on bias variance ratio
    for ds, (bias, var, ndata) in zip(each_dataset, fits_datasets_bias_variance):
        mean_bias = np.mean(bias)
        mean_var = np.mean(var)
        rbv = mean_bias / mean_var
        # compute uncertainty on rbv
        delta_rbv = np.sqrt(
            ((1 / mean_var) * np.std(bias)) ** 2 + (mean_bias / mean_var**2 * np.std(var)) ** 2
        )
        sqrt_rbv = np.sqrt(rbv)
        delta_sqrt_rbv = 0.5 * delta_rbv / np.sqrt(rbv)
        records.append(
            {
                "dataset": str(ds),
                "ndata": ndata,
                "bias": mean_bias,
                "variance": mean_var,
                "ratio": rbv,
                "error ratio": delta_rbv,
                "sqrt(ratio)": sqrt_rbv,
                "error sqrt(ratio)": delta_sqrt_rbv,
            }
        )

    df = pd.DataFrame.from_records(
        records,
        index="dataset",
        columns=(
            "dataset",
            "ndata",
            "bias",
            "variance",
            "ratio",
            "error ratio",
            "sqrt(ratio)",
            "error sqrt(ratio)",
        ),
    )
    return df


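# Explanatory note (not part of the validphys module): the uncertainties
# tabulated above follow from first-order gaussian error propagation of
# R = B / V and sqrt(R), with B and V the means across fits of bias and
# variance and sigma_B, sigma_V their standard deviations across fits:
#
#     delta_R       = sqrt((sigma_B / V) ** 2 + (B * sigma_V / V ** 2) ** 2)
#     delta_sqrt(R) = delta_R / (2 * sqrt(R))
#
# which is exactly what delta_rbv and delta_sqrt_rbv compute.
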
@table
def datasets_bias_variance_ratio(datasets_expected_bias_variance, each_dataset):
    """For each dataset calculate the expected bias and expected variance
    across fits, then calculate the ratio

        ratio = expected bias / expected variance

    and tabulate the results. This gives an idea of how faithful uncertainties
    are for a set of datasets.

    Notes
    -----
    If uncertainties are faithfully estimated then we would expect to see
    ratio = 1. We should note that the ratio is a squared quantity and
    sqrt(ratio) is more appropriate for seeing how much uncertainties are over
    or underestimated. An over-estimate of uncertainty leads to sqrt(ratio) < 1,
    similarly an under-estimate of uncertainty leads to sqrt(ratio) > 1.

    """
    records = []
    for ds, (bias, var, ndata) in zip(each_dataset, datasets_expected_bias_variance):
        records.append(dict(dataset=str(ds), ndata=ndata, ratio=bias / var))

    df = pd.DataFrame.from_records(records, index="dataset", columns=("dataset", "ndata", "ratio"))
    df.columns = ["ndata", "bias/variance"]
    return df


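# Illustrative reading of the Notes above (explanatory comment only): a dataset
# with bias/variance = 0.25, i.e. sqrt(ratio) = 0.5, has a replica spread
# roughly twice as large as the typical distance between central prediction and
# underlying law, so its uncertainties are overestimated by about a factor 2.
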
@table
def experiments_bias_variance_ratio(
    experiments_expected_bias_variance, experiments_data, expected_total_bias_variance
):
    """Like datasets_bias_variance_ratio except for each experiment. Also
    calculate and tabulate

        (total expected bias) / (total expected variance)

    where the total refers to summing over all experiments.

    """
    # don't reinvent the wheel
    df_in = datasets_bias_variance_ratio(experiments_expected_bias_variance, experiments_data)

    bias_tot, var_tot, ntotal = expected_total_bias_variance

    tot_df = pd.DataFrame([[ntotal, bias_tot / var_tot]], index=["Total"], columns=df_in.columns)
    df = pd.concat((df_in, tot_df), axis=0)

    df.index.rename("experiment", inplace=True)  # give index appropriate name
    return df


@table
def experiments_bias_variance_table(
    experiments_expected_bias_variance,
    group_dataset_inputs_by_experiment,
    expected_total_bias_variance,
):
    """Tabulate the values of bias and variance for each experiment as well as
    the sqrt ratio of the two as in
    :py:func:`sqrt_experiments_bias_variance_ratio`.
    Used as a performance indicator.

    """
    records = []
    for exp, (bias, var, ndata) in zip(
        group_dataset_inputs_by_experiment, experiments_expected_bias_variance
    ):
        records.append(
            dict(
                experiment=exp["group_name"],
                ndata=ndata,
                bias=bias / ndata,
                variance=var / ndata,
                sqrt_ratio=np.sqrt(bias / var),
            )
        )

    bias_tot, var_tot, ntotal = expected_total_bias_variance
    records.append(
        dict(
            experiment="Total",
            ndata=ntotal,
            bias=bias_tot / ntotal,
            variance=var_tot / ntotal,
            sqrt_ratio=np.sqrt(bias_tot / var_tot),
        )
    )

    df = pd.DataFrame.from_records(records, index="experiment")
    df.columns = ["ndata", "bias", "variance", "sqrt(bias/variance)"]
    return df


@table
def sqrt_datasets_bias_variance_ratio(datasets_bias_variance_ratio):
    """Given `datasets_bias_variance_ratio` take the sqrt and tabulate the
    results. This gives an idea of how faithful the uncertainties are in
    sensible units. As noted in `datasets_bias_variance_ratio`, bias/variance
    is a squared quantity and so when considering how much uncertainty has been
    over or underestimated it is more natural to consider sqrt(bias/variance).

    """
    df_in = datasets_bias_variance_ratio
    vals = np.array(df_in.values)  # copy just in case
    vals[:, 1] = np.sqrt(vals[:, 1])
    return pd.DataFrame(vals, index=df_in.index, columns=["ndata", "sqrt(bias/variance)"])


@table
def sqrt_experiments_bias_variance_ratio(experiments_bias_variance_ratio):
    """Like sqrt_datasets_bias_variance_ratio except for each experiment."""
    return sqrt_datasets_bias_variance_ratio(experiments_bias_variance_ratio)


@table
def total_bias_variance_ratio(
    experiments_bias_variance_ratio, datasets_bias_variance_ratio, experiments_data
):
    """Combine datasets_bias_variance_ratio and experiments_bias_variance_ratio
    into a single table with a MultiIndex of experiment and dataset.

    """
    exps_df_in = experiments_bias_variance_ratio.iloc[:-1]  # Handle total separately
    lvs = exps_df_in.index
    # The explicit call to list is because pandas gets confused otherwise
    expanded_index = pd.MultiIndex.from_product((list(lvs), ["Total"]))
    exp_df = exps_df_in.set_index(expanded_index)

    dset_index = pd.MultiIndex.from_arrays(
        [
            [str(experiment) for experiment in experiments_data for ds in experiment.datasets],
            datasets_bias_variance_ratio.index.values,
        ]
    )
    ds_df = datasets_bias_variance_ratio.set_index(dset_index)

    dfs = []
    for lv in lvs:
        dfs.append(pd.concat((exp_df.loc[lv], ds_df.loc[lv]), copy=False, axis=0))
    total_df = pd.DataFrame(
        experiments_bias_variance_ratio.iloc[[-1]].values, columns=exp_df.columns, index=["Total"]
    )
    dfs.append(total_df)
    keys = [*lvs, "Total"]
    res = pd.concat(dfs, axis=0, keys=keys)
    return res


@table
def expected_xi_from_bias_variance(sqrt_experiments_bias_variance_ratio):
    """Given the ``sqrt_experiments_bias_variance_ratio`` calculate a predicted
    value of :math:`\\xi_{1 \\sigma}` for each experiment. The predicted value
    is based on the assumption that the difference between replica and central
    prediction and the difference between central prediction and underlying
    prediction are both gaussians centered on zero.

    For example, if sqrt(expected bias/expected variance) is 0.5, then we would
    expect xi_{1 sigma} to be given by performing an integral of the
    distribution of

        diffs = (central - underlying predictions)

    over the domain defined by the variance. In this case the sqrt(variance) is
    twice as large as the sqrt(bias), which is the same as integrating a normal
    distribution with mean = 0, std = 1 over the interval [-2, 2], given by

        integral = erf(2/sqrt(2))

    where erf is the error function.

    In general the equation is

        integral = erf(sqrt(variance / (2*bias)))

    """
    df_in = sqrt_experiments_bias_variance_ratio
    n_sigma_in_variance = 1 / df_in.values[:, -1, np.newaxis]
    # pylint can't find erf here, disable error in this function
    # pylint: disable=no-member
    estimated_integral = special.erf(n_sigma_in_variance / np.sqrt(2))
    return pd.DataFrame(
        np.concatenate((df_in.values[:, 0, np.newaxis], estimated_integral), axis=1),
        index=df_in.index,
        columns=["ndata", r"estimated $\xi_{1\sigma} \,$ from bias/variance"],
    )


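# Quick numerical check of the relation above (explanatory comment, not a
# validphys action): for sqrt(bias/variance) = 0.5, n_sigma_in_variance = 2 and
#
#     special.erf(2 / np.sqrt(2))  # ~= 0.954
#
# i.e. roughly the 2-sigma coverage of a unit gaussian, matching the example
# worked through in the docstring.
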
@table
def fits_measured_xi(experiments_xi_measured, experiments_data):
    r"""Tabulate the measured value of \xi_{1\sigma} for each experiment, as
    calculated by data_xi (collected over experiments). Note that the mean is
    taken across eigenvectors of the covariance matrix.

    """
    records = []
    tot_xi = 0
    tot_n = 0
    for exp, xi in zip(experiments_data, experiments_xi_measured):
        records.append(dict(experiment=str(exp), ndata=len(xi), xi=np.mean(xi)))
        tot_xi += len(xi) * np.mean(xi)
        tot_n += len(xi)
    records.append(dict(experiment="Total", ndata=tot_n, xi=tot_xi / tot_n))

    df = pd.DataFrame.from_records(
        records, index="experiment", columns=("experiment", "ndata", "xi")
    )
    df.columns = ["ndata", r"measured $\xi_{1\sigma}$"]
    return df


@table
def compare_measured_expected_xi(fits_measured_xi, expected_xi_from_bias_variance):
    """Table with measured xi and expected xi from bias/variance for each
    experiment and total. For details on expected xi, see
    expected_xi_from_bias_variance. For more details on measured xi see
    fits_measured_xi.

    """
    # don't want ndata twice
    df = pd.concat((fits_measured_xi, expected_xi_from_bias_variance.iloc[:, 1]), axis=1)
    return df


@figure
def plot_dataset_xi(dataset_xi, dataset):
    r"""For a given dataset, plot the value of \xi_{1 \sigma} for each
    eigenvector of the covariance matrix, along with the expected value of
    \xi_{1 \sigma} if the replicas distribution perfectly matches the central
    distribution (0.68). In the legend include the mean across eigenvectors.

    """
    fig, ax = plotutils.subplots()
    ax.plot(
        dataset_xi,
        "*",
        label=r"$\xi_{1\sigma}$ = " + f"{dataset_xi.mean():.2f}, from multifits",
        clip_on=False,
    )
    ax.axhline(0.68, linestyle=":", color="k", label=r"$\xi_{1\sigma}$ " + "expected value")
    ax.axhline(0.95, linestyle=":", color="r", label=r"$\xi_{2\sigma}$ " + "expected value")
    ax.set_ylim((0, 1))
    ax.set_xlabel("eigenvector index (ascending order)")
    ax.set_title(r"$\xi_{1\sigma}$ for " + str(dataset))
    ax.legend()
    return fig


@figure
def plot_dataset_xi_histogram(dataset_xi, dataset):
    r"""For a given dataset, bin the values of \xi_{1 \sigma} for each
    eigenvector of the covariance matrix, plot as a histogram with a vertical
    line for the expected value: 0.68. In the legend print the mean and
    standard deviation of the distribution.

    """
    fig, ax = plotutils.subplots()
    ax.hist(
        dataset_xi,
        label=(
            r"$\xi_{1\sigma}$ = "
            + f"{dataset_xi.mean():.2f}, "
            + r"std($\xi_{1\sigma}$) = "
            + f"{dataset_xi.std():.2f}"
        ),
    )
    ax.axvline(0.68, linestyle=":", color="k", label=r"$\xi_{1\sigma}$ " + "expected value")
    ax.set_xlim((0, 1))
    ax.set_xlabel(r"$\xi^{i}_{1\sigma}$")
    ax.set_title("Histogram of " + r"$\xi^{i}_{1\sigma}$ for " + str(dataset))
    ax.legend()
    return fig


@figure
def plot_data_xi(data_xi, data):
    """Like plot_dataset_xi except for all data. Can be used alongside
    ``group_dataset_inputs_by_experiment`` to plot for each experiment.

    """
    return plot_dataset_xi(data_xi, data)


@figure
def plot_data_xi_histogram(data_xi, data):
    """Like plot_dataset_xi_histogram but for all data. Can be used alongside
    ``group_dataset_inputs_by_experiment`` to plot for each experiment.

    """
    return plot_dataset_xi_histogram(data_xi, data)


@figure
def plot_data_central_diff_histogram(experiments_replica_central_diff):
    """Histogram of the difference between central prediction and underlying
    law prediction, normalised by the corresponding replica standard deviation,
    concatenating the differences across all data. A scaled gaussian is plotted
    for reference. Total xi is the number of central differences which fall
    within the 1-sigma confidence interval of the scaled gaussian.

    """
    scaled_diffs = np.concatenate(
        [
            (central_diff / sigma).flatten()
            for sigma, central_diff in experiments_replica_central_diff
        ]
    )
    fig, ax = plotutils.subplots()
    ax.hist(scaled_diffs, bins=50, density=True, label="Central prediction distribution")

    xlim = (-5, 5)
    ax.set_xlim(xlim)

    x = np.linspace(*xlim, 100)
    ax.plot(x, scipy.stats.norm.pdf(x), "-k", label="Normal distribution")
    ax.legend()
    ax.set_xlabel("Difference to underlying prediction")
    return fig


@table
def dataset_ratio_error_finite_effects(
    bias_variance_resampling_dataset, n_fit_samples, n_replica_samples
):
    """For a single dataset vary the number of fits and number of replicas used
    to perform a bootstrap sample of expected bias and variance. For each
    combination of n_rep and n_fit tabulate the std deviation across bootstrap
    samples of

        ratio = bias / variance

    The resulting table gives an approximation of how the error varies with the
    number of fits and number of replicas for each dataset.

    """
    bias_samples, var_samples = bias_variance_resampling_dataset
    ratio = bias_samples / var_samples
    ind = pd.Index(n_replica_samples, name="n_rep samples")
    col = pd.Index(n_fit_samples, name="n_fit samples")
    return pd.DataFrame(ratio.std(axis=2), index=ind, columns=col)


@table
def total_ratio_error_finite_effects(
    bias_variance_resampling_total, n_fit_samples, n_replica_samples
):
    """Like dataset_ratio_error_finite_effects except for the total
    bias / variance (across all data).

    """
    return dataset_ratio_error_finite_effects(
        bias_variance_resampling_total, n_fit_samples, n_replica_samples
    )


@table
def total_ratio_means_finite_effects(
    bias_variance_resampling_total, n_fit_samples, n_replica_samples
):
    """Vary the number of fits and number of replicas used to perform a
    bootstrap sample of expected bias and variance. For each combination of
    n_rep and n_fit tabulate the mean across bootstrap samples of

        ratio = total bias / total variance

    which can give context to `total_ratio_error_finite_effects`.

    """
    ind = pd.Index(n_replica_samples, name="n_rep samples")
    col = pd.Index(n_fit_samples, name="n_fit samples")
    bias_total, var_total = bias_variance_resampling_total
    return pd.DataFrame((bias_total / var_total).mean(axis=2), index=ind, columns=col)


@table
def dataset_xi_error_finite_effects(xi_resampling_dataset, n_fit_samples, n_replica_samples):
    """For a single dataset vary the number of fits and number of replicas used
    to perform a bootstrap sample of xi. Take the mean of xi across datapoints
    (note that points here refers to points in the basis which diagonalises the
    covmat) and then tabulate the standard deviation of xi_1sigma across
    bootstrap samples.

    """
    means_xi_table = xi_resampling_dataset.mean(axis=-1)
    ind = pd.Index(n_replica_samples, name="n_rep samples")
    col = pd.Index(n_fit_samples, name="n_fit samples")
    return pd.DataFrame(means_xi_table.std(axis=2), index=ind, columns=col)


@table
def dataset_xi_means_finite_effects(xi_resampling_dataset, n_fit_samples, n_replica_samples):
    """For a single dataset vary the number of fits and number of replicas used
    to perform a bootstrap sample of xi. Take the mean of xi across datapoints
    (note that points here refers to points in the basis which diagonalises the
    covmat) and then tabulate the mean of xi_1sigma across bootstrap samples.

    """
    means_xi_table = xi_resampling_dataset.mean(axis=-1)
    ind = pd.Index(n_replica_samples, name="n_rep samples")
    col = pd.Index(n_fit_samples, name="n_fit samples")
    return pd.DataFrame(means_xi_table.mean(axis=2), index=ind, columns=col)


# NOTE: This action was written when trying to understand the finite size effects
# and is largely redundant.
@table
def dataset_std_xi_error_finite_effects(xi_resampling_dataset, n_fit_samples, n_replica_samples):
    """For a single dataset vary the number of fits and number of replicas used
    to perform a bootstrap sample of xi. Take the standard deviation of xi
    across datapoints (note that points here refers to points in the basis
    which diagonalises the covmat) and then tabulate the standard deviation of
    std(xi_1sigma) across bootstrap samples.

    """
    std_xi_table = xi_resampling_dataset.std(axis=-1)
    ind = pd.Index(n_replica_samples, name="n_rep samples")
    col = pd.Index(n_fit_samples, name="n_fit samples")
    return pd.DataFrame(std_xi_table.std(axis=2), index=ind, columns=col)


@table
def dataset_std_xi_means_finite_effects(xi_resampling_dataset, n_fit_samples, n_replica_samples):
    """For a single dataset vary the number of fits and number of replicas used
    to perform a bootstrap sample of xi. Take the standard deviation of xi
    across datapoints (note that points here refers to points in the basis
    which diagonalises the covmat) and then tabulate the mean of
    std(xi_1sigma) across bootstrap samples.

    """
    std_xi_table = xi_resampling_dataset.std(axis=-1)
    ind = pd.Index(n_replica_samples, name="n_rep samples")
    col = pd.Index(n_fit_samples, name="n_fit samples")
    return pd.DataFrame(std_xi_table.mean(axis=2), index=ind, columns=col)


@table
def total_xi_error_finite_effects(total_xi_resample, n_fit_samples, n_replica_samples):
    """For all data vary the number of fits and number of replicas used to
    perform a bootstrap sample of xi. Take the mean of xi across datapoints
    (note that points here refers to points in the basis which diagonalises the
    covmat) and then tabulate the standard deviation of xi_1sigma across
    bootstrap samples.

    """
    return dataset_xi_error_finite_effects(total_xi_resample, n_fit_samples, n_replica_samples)


@table
def total_xi_means_finite_effects(total_xi_resample, n_fit_samples, n_replica_samples):
    """For all data vary the number of fits and number of replicas used to
    perform a bootstrap sample of xi. Take the mean of xi across datapoints
    (note that points here refers to points in the basis which diagonalises the
    covmat) and then tabulate the mean of xi_1sigma across bootstrap samples.

    """
    return dataset_xi_means_finite_effects(total_xi_resample, n_fit_samples, n_replica_samples)


@table
def total_expected_xi_means_finite_effects(
    total_expected_xi_resample, n_fit_samples, n_replica_samples
):
    """Given the resampled ratio of bias/variance, returns table of mean of
    resampled expected xi across bootstrap samples. See
    `expected_xi_from_bias_variance` for more details on how to calculate
    expected xi.

    """
    ind = pd.Index(n_replica_samples, name="n_rep samples")
    col = pd.Index(n_fit_samples, name="n_fit samples")
    return pd.DataFrame(total_expected_xi_resample.mean(axis=2), index=ind, columns=col)


@table
def total_expected_xi_error_finite_effects(
    total_expected_xi_resample, n_fit_samples, n_replica_samples
):
    """Given the resampled ratio of bias/variance, returns table of the standard
    deviation of resampled expected xi across bootstrap samples. See
    :py:func:`expected_xi_from_bias_variance` for more details on how to
    calculate expected xi.

    """
    ind = pd.Index(n_replica_samples, name="n_rep samples")
    col = pd.Index(n_fit_samples, name="n_fit samples")
    return pd.DataFrame(total_expected_xi_resample.std(axis=2), index=ind, columns=col)


@table
def total_std_xi_error_finite_effects(exps_xi_resample, n_fit_samples, n_replica_samples):
    """For all data vary the number of fits and number of replicas used to
    perform a bootstrap sample of xi. Take the std deviation of xi across
    datapoints (note that points here refers to points in the basis which
    diagonalises the covmat) and then tabulate the standard deviation of
    std(xi_1sigma) across bootstrap samples.

    """
    xi_total = np.concatenate(exps_xi_resample, axis=-1)
    return dataset_std_xi_error_finite_effects(xi_total, n_fit_samples, n_replica_samples)


@table
def total_std_xi_means_finite_effects(exps_xi_resample, n_fit_samples, n_replica_samples):
    """For all data vary the number of fits and number of replicas used to
    perform a bootstrap sample of xi. Take the std deviation of xi across
    datapoints (note that points here refers to points in the basis which
    diagonalises the covmat) and then tabulate the mean of std(xi_1sigma)
    across bootstrap samples.

    """
    xi_total = np.concatenate(exps_xi_resample, axis=-1)
    return dataset_std_xi_means_finite_effects(xi_total, n_fit_samples, n_replica_samples)


@table
def experiments_bootstrap_sqrt_ratio_table(experiments_bootstrap_sqrt_ratio, experiments_data):
    """Given experiments_bootstrap_sqrt_ratio, which is a bootstrap resampling
    of sqrt(bias/variance) for each experiment and for the total across all
    data, tabulate the mean and standard deviation across bootstrap samples.

    """
    indices = list(map(str, experiments_data)) + ["Total"]
    records = []
    for i, exp in enumerate(indices):
        ratio_boot = experiments_bootstrap_sqrt_ratio[i]
        records.append(
            dict(
                experiment=exp,
                mean_ratio=np.mean(ratio_boot, axis=0),
                std_ratio=np.std(ratio_boot, axis=0),
            )
        )
    df = pd.DataFrame.from_records(
        records, index="experiment", columns=("experiment", "mean_ratio", "std_ratio")
    )
    df.columns = ["Bootstrap mean sqrt(bias/variance)", "Bootstrap std. dev. sqrt(bias/variance)"]
    return df


@table
def groups_bootstrap_sqrt_ratio_table(groups_bootstrap_sqrt_ratio, groups_data):
    """Like :py:func:`experiments_bootstrap_sqrt_ratio_table` but for
    metadata groups.

    """
    df = experiments_bootstrap_sqrt_ratio_table(groups_bootstrap_sqrt_ratio, groups_data)
    idx = df.index.rename("group")
    return df.set_index(idx)


@table
def experiments_bootstrap_expected_xi_table(experiments_bootstrap_expected_xi, experiments_data):
    """Tabulate the mean and standard deviation across bootstrap samples of the
    expected xi calculated from the ratio of bias/variance. Returns a table
    with two columns, for the bootstrap mean and standard deviation, and a row
    for each experiment plus the total across all experiments.

    """
    df = experiments_bootstrap_sqrt_ratio_table(
        experiments_bootstrap_expected_xi, experiments_data
    )
    # change the column headers
    df.columns = [
        r"Bootstrap mean expected $\xi_{1\sigma}$ from ratio",
        r"Bootstrap std. dev. expected $\xi_{1\sigma}$ from ratio",
    ]
    return df


@table
def groups_bootstrap_expected_xi_table(groups_bootstrap_expected_xi, groups_data):
    """Like :py:func:`experiments_bootstrap_expected_xi_table` but for
    metadata groups.

    """
    df = experiments_bootstrap_expected_xi_table(groups_bootstrap_expected_xi, groups_data)
    idx = df.index.rename("group")
    return df.set_index(idx)


@table
def experiments_bootstrap_xi_table(experiments_bootstrap_xi, experiments_data, total_bootstrap_xi):
    """Tabulate the mean and standard deviation of xi_1sigma across bootstrap
    samples. Note that the mean has already been taken across data points (or
    eigenvectors in the basis which diagonalises the covariance matrix) for
    each individual bootstrap sample.

    Tabulate the results for each experiment and for the total xi across all
    data.

    """
    # take mean across data points for each experiment
    xi_1sigma = [np.mean(exp_xi, axis=1) for exp_xi in experiments_bootstrap_xi]
    # take mean across all data
    xi_1sigma.append(np.mean(total_bootstrap_xi, axis=1))

    df = experiments_bootstrap_sqrt_ratio_table(xi_1sigma, experiments_data)
    df.columns = [r"Bootstrap mean $\xi_{1\sigma}$", r"Bootstrap std. dev. $\xi_{1\sigma}$"]
    return df


@table
def groups_bootstrap_xi_table(groups_bootstrap_xi, groups_data, total_bootstrap_xi):
    """Like :py:func:`experiments_bootstrap_xi_table` but for metadata groups."""
    df = experiments_bootstrap_xi_table(groups_bootstrap_xi, groups_data, total_bootstrap_xi)
    idx = df.index.rename("group")
    return df.set_index(idx)


@table
def experiments_bootstrap_xi_comparison(
    experiments_bootstrap_xi_table, experiments_bootstrap_expected_xi_table
):
    """Table comparing the mean and standard deviation across bootstrap samples
    of the measured xi_1sigma and the expected xi_1sigma calculated from
    bias/variance.

    """
    return pd.concat(
        (experiments_bootstrap_xi_table, experiments_bootstrap_expected_xi_table), axis=1
    )


@table
def groups_bootstrap_xi_comparison(groups_bootstrap_xi_table, groups_bootstrap_expected_xi_table):
    """Like :py:func:`experiments_bootstrap_xi_comparison` but for
    metadata groups.

    """
    return experiments_bootstrap_xi_comparison(
        groups_bootstrap_xi_table, groups_bootstrap_expected_xi_table
    )


@figuregen
def plot_experiments_sqrt_ratio_bootstrap_distribution(
    experiments_bootstrap_sqrt_ratio, experiments_data
):
    """Plots a histogram for each experiment and the total, showing the
    distribution of bootstrap samples. Takes the mean and std deviation of the
    bootstrap sample and plots the corresponding scaled normal distribution for
    comparison. The limits are set to be +/- 3 std deviations of the mean.

    """
    # experiments_bootstrap_sqrt_ratio includes total. str(exp) is only used to
    # generate title, so appending string is fine.
    for sqrt_ratio_sample, exp in zip(
        experiments_bootstrap_sqrt_ratio, experiments_data + ["Total"]
    ):
        fig, ax = plotutils.subplots()
        ax.hist(sqrt_ratio_sample, bins=20, density=True)
        mean = np.mean(sqrt_ratio_sample)
        std = np.std(sqrt_ratio_sample)
        xlim = (mean - 3 * std, mean + 3 * std)
        ax.set_xlim(xlim)
        x = np.linspace(*xlim, 100)
        ax.plot(
            x,
            scipy.stats.norm.pdf(x, mean, std),
            "-r",
            label=f"Corresponding normal distribution: mean = {mean:.2g}, std = {std:.2g}",
        )
        ax.legend()
        ax.set_title(f"Bootstrap distribution of sqrt(bias/variance) for {exp}")
        ax.set_xlabel("Sqrt(bias/variance)")
        yield fig


@figuregen
def plot_experiments_xi_bootstrap_distribution(
    experiments_bootstrap_xi, total_bootstrap_xi, experiments_data
):
    """Similar to :py:func:`plot_experiments_sqrt_ratio_bootstrap_distribution`
    except plots the bootstrap distribution of xi_1sigma, along with a
    corresponding scaled gaussian, for each experiment (and for all data).

    """
    # take mean across data points for each experiment
    xi_1sigma = [np.mean(exp_xi, axis=1) for exp_xi in experiments_bootstrap_xi]
    # take mean across all data
    xi_1sigma.append(np.mean(total_bootstrap_xi, axis=1))

    # use plotting function from above
    xi_plots = plot_experiments_sqrt_ratio_bootstrap_distribution(xi_1sigma, experiments_data)
    # Update the title and x label on each plot to reflect that we're plotting
    # \xi_1sigma, don't forget Total plot.
    for fig, exp in zip(xi_plots, experiments_data + ["Total"]):
        ax = fig.gca()
        ax.set_title(r"Bootstrap distribution of $\xi_{1\sigma}$ for " + str(exp))
        ax.set_xlabel(r"$\xi_{1\sigma}$")
        yield fig


@figuregen
def plot_bias_variance_distributions(
    experiments_fits_bias_replicas_variance_samples, group_dataset_inputs_by_experiment
):
    """For each experiment, plot the distribution across fits of bias and the
    distribution across fits and replicas of

        fit_rep_var = (E[g] - g)_i inv(cov)_ij (E[g] - g)_j

    where g is the replica prediction for fit l, replica k and E[g] is the mean
    across replicas of g for fit l.

    """
    # labels shared by the per-experiment and the total histograms
    labels = ["fits bias distribution", "replicas variance distribution"]
    for (exp_biases, exp_vars, _), group_spec in zip(
        experiments_fits_bias_replicas_variance_samples, group_dataset_inputs_by_experiment
    ):
        fig, ax = plotutils.subplots()
        ax.hist([exp_biases, exp_vars], density=True, label=labels)
        ax.legend()
        ax.set_title(f"Bias and variance distributions for {group_spec['group_name']}.")
        yield fig
    total_bias, total_var, _ = np.sum(experiments_fits_bias_replicas_variance_samples, axis=0)
    fig, ax = plotutils.subplots()
    ax.hist([total_bias, total_var], density=True, label=labels)
    ax.legend()
    ax.set_title("Total bias and variance distributions.")
    yield fig


@figuregen
def xq2_data_prcs_maps(xq2_data_map, each_dataset):
    """
    Heat map of the bias/variance ratio (and xi, the quantile estimator) for
    each datapoint in a dataset. The x and y axes are the x and Q2 coordinates
    of the datapoints. The colour of each point is determined by the value of
    the bias/variance ratio (or of xi).

    """
    keys = ["std_devs", "xi"]
    for j, elem in enumerate(xq2_data_map):
        for k in keys:
            if k == "std_devs":
                title = r"$R_{bv}$"
            if k == "xi":
                title = r"$\xi$"
            fig, ax = plotutils.subplots()
            im = ax.scatter(
                elem['x_coords'], elem['Q_coords'], c=(np.asarray(elem[k])), cmap='viridis'
            )
            fig.colorbar(im, label=title)

            ax.set_xscale('log')  # Set x-axis to log scale
            ax.set_yscale('log')  # Set y-axis to log scale
            ax.set_xlabel('x')
            ax.set_ylabel('Q2')
            ax.set_title(each_dataset[j].commondata.metadata.plotting.dataset_label)

            yield fig