Source code for validphys.hyper_algorithm

"""
    This module contains functions dedicated to processing the json dictionaries
"""
import itertools
import logging

import pandas as pd

log = logging.getLogger(__name__)

# Arbitrary parameters that we need to think about
fail_threshold = 75
fail_reward = -100
loss_threshold = 2.5  # Any loss above this threshold is as good as a failure

KEY_GOOD = "good"
KEY_LOSS = "loss"


def compute_reward(mdict, biggest_ntotal):
    """
    Given a combination dictionary, computes the reward function.

    If the fail rate for this combination is above the fail threshold,
    the reward is -100.
    Otherwise the formula for the reward takes into account:
        - The rate of ok fits that have a loss below the loss_threshold
        - The rate of fits that failed
        - The standard deviation
        - How far away the median is from the best loss
        - How far away the median and the average are from each other
    """
    # Check the fail rate to see whether this combination is useless
    fail_rate = mdict["fail_rate"]
    if fail_rate > fail_threshold:
        return fail_reward
    # Compute, from the points that are not explicit failures,
    # the ones that are truly good
    true_good = mdict["true_good"]
    n_good = mdict["n_good"]
    n_total = mdict["n_total"]
    rate_good = true_good / n_good * 100.0
    # Punish the dispersion of the combination
    median = mdict["median"]
    avg = mdict["avg"]
    std = mdict["std"]
    dispersion = abs(avg - median) + std / 2.0 + (median - mdict["best_loss"])
    # Combinations with more points get a larger weight
    combination_weight = 1.0 + n_total / biggest_ntotal
    # The most important thing is the true-good rate
    reward = rate_good / 100.0
    # Punish a bit using the failure rate, but not that much
    reward -= fail_rate / 400.0
    # and punish further using the dispersion
    reward -= dispersion
    # Finally scale it with the total number of points
    return reward * combination_weight
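# A minimal usage sketch for compute_reward, with hypothetical statistics of
# the kind produced by process_slice (numbers made up for illustration):
#
#     >>> stats = {"fail_rate": 20.0, "true_good": 6, "n_good": 8,
#     ...          "n_total": 10, "median": 2.3, "avg": 2.4, "std": 0.1,
#     ...          "best_loss": 2.1}
#     >>> compute_reward(stats, biggest_ntotal=10)
#
# Here rate_good = 75%, dispersion = 0.1 + 0.05 + 0.2 = 0.35 and the weight
# is 1 + 10/10 = 2.0, so the reward is (0.75 - 0.05 - 0.35) * 2.0 ~ 0.7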
def bin_generator(df_values, max_n=10):
    """
    Receives a dataframe with a list of unique values.
    If there are more than `max_n` of them and they are numeric,
    create `max_n` bins.
    If they are already discrete values or there are fewer than
    `max_n` options, output the input unchanged.

    # Arguments:
        - `df_values`: dataframe with unique values
        - `max_n`: maximum number of allowed different values

    # Returns:
        - `new_vals`: the original values, or a list of `max_n` bin intervals
    """
    values = df_values.values
    lval = len(values)
    if lval <= max_n:
        return values
    if not all(isinstance(i, (int, float)) for i in values):
        return values
    bins = pd.cut(values, max_n, include_lowest=True)
    return bins.categories
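# Usage sketch (hypothetical values): a short list of discrete values passes
# through unchanged, while a long list of floats is binned with pd.cut:
#
#     >>> bin_generator(pd.Series(["adam", "rmsprop"]))
#     array(['adam', 'rmsprop'], dtype=object)
#     >>> bin_generator(pd.Series([float(i) for i in range(25)]), max_n=5)
#     IntervalIndex([(-0.024, 4.8], (4.8, 9.6], ..., (19.2, 24.0]], ...)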
def parse_keys(dataframe, keys):
    """
    Receives a dataframe and a set of keys.
    Looks into the dataframe to read the possible values of the keys.

    Returns a dictionary { 'key' : [possible values] }.
    If the values are not discrete then we need to bin them;
    this is done for anything with too many numerical values.

    # Arguments:
        - `dataframe`: a pandas dataframe
        - `keys`: keys to combine

    # Returns:
        - `key_info`: a dictionary with the possible values for each key
    """
    key_info = {}
    for key_name in keys:
        # Remove duplicates and NaNs
        all_possible_values = dataframe[key_name].dropna().drop_duplicates()
        # If there's anything left, add it to the dictionary
        if not all_possible_values.empty:
            # But bin it first in case we find a continuous variable
            key_info[key_name] = bin_generator(all_possible_values)
    return key_info
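# Usage sketch with a toy dataframe of trials (column names hypothetical):
#
#     >>> df = pd.DataFrame({"optimizer": ["adam", "rmsprop", "adam"],
#     ...                    "depth": [2, 3, 2]})
#     >>> parse_keys(df, ["optimizer", "depth"])
#     {'optimizer': array(['adam', 'rmsprop'], dtype=object), 'depth': array([2, 3])}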
def get_combinations(key_info, ncomb):
    """
    Given a dictionary mapping keys to iterables of possible values
    (`key_info`), return a list of the product of all possible mappings
    of a subset of `ncomb` keys to single values out of the corresponding
    possible values, for all such subsets.

    For instance,

        key_info = {
            'key1' : [val1-1, val1-2, ...],
            'key2' : [val2-1, val2-2, ...],
        }
        ncomb = 2

    will return a list of dictionaries:

        [
            {'key1' : val1-1, 'key2' : val2-1, ...},
            {'key1' : val1-1, 'key2' : val2-2, ...},
            {'key1' : val1-2, 'key2' : val2-1, ...},
            {'key1' : val1-2, 'key2' : val2-2, ...},
        ]

    # Arguments:
        - `key_info`: dictionary with the possible values for each key
        - `ncomb`: number of keys to combine

    # Returns:
        - `all_combinations`: a list of dictionaries of parameters
    """
    # If we don't have enough keys to produce n combinations, return empty
    if len(key_info) < ncomb:
        return []
    # First generate the combinations of keys
    key_combinations = itertools.combinations(key_info, ncomb)
    all_combinations = []
    # Now, for each combination of keys, we have to give values
    for keys in key_combinations:
        # Generate a list of tuples with the values of the keys,
        # i.e., something like [ (val1-1, val1-2, ...), (val2-1, val2-2, ...), ... ]
        list_of_items = [key_info[key] for key in keys]
        # Now combine all these items, which is what we actually want
        items_combinations = itertools.product(*list_of_items)
        # Put things back in the form of a dictionary of parameters
        for values in items_combinations:
            # Each `values` comes in the same order as `keys`
            new_dictionary = dict(zip(keys, values))
            all_combinations.append(new_dictionary)
    return all_combinations
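# Usage sketch mirroring the docstring example with concrete (hypothetical)
# values:
#
#     >>> key_info = {"optimizer": ["adam", "rmsprop"], "depth": [2, 3]}
#     >>> get_combinations(key_info, 2)
#     [{'optimizer': 'adam', 'depth': 2}, {'optimizer': 'adam', 'depth': 3},
#      {'optimizer': 'rmsprop', 'depth': 2}, {'optimizer': 'rmsprop', 'depth': 3}]
#     >>> get_combinations(key_info, 1)
#     [{'optimizer': 'adam'}, {'optimizer': 'rmsprop'}, {'depth': 2}, {'depth': 3}]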
def get_slice(dataframe, query_dict):
    """
    Returns a slice of the dataframe where some keys match some values.
    `query_dict` must be a dictionary {key1 : value1, key2 : value2, ...}

    # Arguments:
        - `dataframe`: a pandas dataframe
        - `query_dict`: a dictionary for a combination as given by `get_combinations`
    """
    df_slice = dataframe
    for key, value in query_dict.items():
        key_column = df_slice[key]
        # Check whether all values of this slice are NaN
        if not key_column.empty and key_column.dropna().empty:
            return None
        # We need to act differently in the case of continuous values that
        # were discretized before. The way to check whether something was
        # continuous is to check whether the value is now a pandas Interval
        if isinstance(value, pd.Interval):
            mask = [i in value for i in key_column]
            df_slice = df_slice[mask]
        else:
            df_slice = df_slice[key_column == value]
    return df_slice
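# Usage sketch (hypothetical columns): plain keys are matched by equality,
# while binned keys are matched through pd.Interval containment:
#
#     >>> df = pd.DataFrame({"optimizer": ["adam", "rmsprop"], "lr": [0.01, 0.1]})
#     >>> get_slice(df, {"optimizer": "adam"})             # rows with optimizer == "adam"
#     >>> get_slice(df, {"lr": pd.Interval(0.005, 0.05)})  # rows with lr in (0.005, 0.05]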
def process_slice(df_slice):
    """
    Processes a slice into a dictionary with useful stats.
    If the slice is None it means the combination does not apply.

    # Arguments:
        - `df_slice`: a slice of a pandas dataframe

    # Returns:
        - `proc_dict`: a dictionary of stats
    """
    # First check whether there's anything inside the slice
    # (note the None check must come before taking the length)
    if df_slice is None or len(df_slice) == 0:
        return {"skip": True}
    proc_dict = {"skip": False}
    n_total = len(df_slice)
    # Get the good values
    good = df_slice[df_slice[KEY_GOOD]]
    # Get raw stats
    n_good = len(good)
    n_failed = n_total - n_good
    fail_rate = n_failed / n_total * 100.0
    # Now get the distribution of the (good) losses
    good_losses = good[KEY_LOSS]
    best_loss = good_losses.min()
    std_dev = good_losses.std()
    median = good_losses.median()
    avg = good_losses.mean()
    # Check how many points are under the loss_threshold
    true_good = sum(int(i < loss_threshold) for i in good_losses)
    # Fill the dictionary
    proc_dict["n_failed"] = n_failed
    proc_dict["n_good"] = n_good
    proc_dict["n_total"] = n_total
    proc_dict["fail_rate"] = fail_rate
    proc_dict["best_loss"] = best_loss
    proc_dict["true_good"] = true_good
    proc_dict["std"] = std_dev
    proc_dict["avg"] = avg
    proc_dict["median"] = median
    return proc_dict
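# Usage sketch: the slice needs the boolean KEY_GOOD column and the numeric
# KEY_LOSS column (rows below are made up):
#
#     >>> df = pd.DataFrame({"good": [True, True, False],
#     ...                    "loss": [2.0, 2.7, float("nan")]})
#     >>> stats = process_slice(df)
#     >>> stats["n_good"], stats["true_good"], round(stats["fail_rate"], 1)
#     (2, 1, 33.3)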
def study_combination(dataframe, query_dict):
    """
    Given a dataframe and a dictionary of {key1 : value1, key2 : value2},
    returns a dictionary with a number of stats for that combination.

    # Arguments:
        - `dataframe`: a pandas dataframe
        - `query_dict`: a dictionary for a combination as given by `get_combinations`

    # Returns:
        - `proc_dict`: a dictionary of the "statistics" for this combination
    """
    # Get the slice corresponding to this combination
    df_slice = get_slice(dataframe, query_dict)
    proc_dict = process_slice(df_slice)
    proc_dict["slice"] = df_slice
    proc_dict["combination"] = query_dict
    return proc_dict
def dataframe_removal(dataframe, hit_list):
    """
    Removes all combinations defined in `hit_list` from the dataframe.
    The hit list is a list of dictionaries containing the 'slice' key,
    where 'slice' must be a slice of `dataframe`.

    # Arguments:
        - `dataframe`: a pandas dataframe
        - `hit_list`: the list of elements to remove

    # Returns:
        - `new_dataframe`: the same dataframe with all elements from `hit_list` removed
    """
    if not hit_list:
        return dataframe
    # Gather the indices from all the different combinations,
    # make sure there are no duplicates,
    # and then drop the corresponding rows from the dataframe
    indices_to_remove = hit_list[0]["slice"].index
    for victim in hit_list[1:]:
        indices_to_remove = indices_to_remove.append(victim["slice"].index)
    indices_to_drop = indices_to_remove.drop_duplicates()
    new_dataframe = dataframe.drop(indices_to_drop)
    return new_dataframe
def autofilter_dataframe(dataframe, keys, n_to_combine=1, n_to_kill=1, threshold=-1):
    """
    Receives a dataframe and a list of keys.
    Creates combinations of `n_to_combine` keys and computes the reward.
    Finally removes from the dataframe the `n_to_kill` worst combinations.

    Anything under `threshold` will be removed and will not count towards
    `n_to_kill` (by default `threshold` = -1 so only combinations which are
    really bad will be removed).

    # Arguments:
        - `dataframe`: a pandas dataframe
        - `keys`: keys to combine
        - `n_to_combine`: how many keys to combine
        - `n_to_kill`: how many combinations to kill
        - `threshold`: anything under this reward will be removed

    # Returns:
        - `dataframe_sliced`: a slice of the dataframe with the weakest combinations removed
    """
    # Step 0: read from the dataframe the content of the keys
    key_info = parse_keys(dataframe, keys)
    # Step 1: get the combinations
    combinations = get_combinations(key_info, n_to_combine)
    # Step 2: run through all possible combinations and compute stats
    result_list = []
    biggest_ntotal = 1
    for combination in combinations:
        processed_dict = study_combination(dataframe, combination)
        if processed_dict["skip"]:
            continue
        if processed_dict["n_total"] > biggest_ntotal:
            biggest_ntotal = processed_dict["n_total"]
        result_list.append(processed_dict)
    # Step 3: compute the reward
    n_to_remove = n_to_kill
    for processed_dict in result_list:
        reward = compute_reward(processed_dict, biggest_ntotal)
        log.debug("Combination %s, reward %f", processed_dict["combination"], reward)
        if reward <= threshold:
            n_to_remove += 1
        processed_dict["reward"] = reward
    # Step 4: order the results by reward, ascending
    result_list.sort(key=lambda i: i["reward"])
    # Step 5: the first `n_to_remove` entries (lowest reward) are the hit list
    hit_list = result_list[:n_to_remove]
    for i in hit_list:
        log.info("Removing %s with reward %f", i["combination"], i["reward"])
    # Step 6: remove the bad guys from the dataframe
    new_dataframe = dataframe_removal(dataframe, hit_list)
    return new_dataframe
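# End-to-end sketch on a toy dataframe of trials (all values hypothetical):
# the "rmsprop" rows earn a worse reward because their losses sit above
# loss_threshold, so one pass with n_to_kill=1 removes them:
#
#     >>> df = pd.DataFrame({
#     ...     "good": [True, True, True, True],
#     ...     "loss": [2.0, 2.1, 3.0, 3.5],
#     ...     "optimizer": ["adam", "adam", "rmsprop", "rmsprop"],
#     ... })
#     >>> autofilter_dataframe(df, ["optimizer"], n_to_kill=1)
#        good  loss optimizer
#     0  True   2.0      adam
#     1  True   2.1      adam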