# Source code for lumin.inference.summary_stat
import numpy as np
import pandas as pd
from typing import List, Optional
from fastprogress import progress_bar
# Names exported when this module is star-imported
__all__ = ['bin_binary_class_pred']
def bin_binary_class_pred(df:pd.DataFrame, max_unc:float, consider_samples:Optional[List[str]]=None, step_sz:float=1e-3, pred_name:str='pred',
                          sample_name:str='gen_sample', compact_samples:bool=False, class_name:str='gen_target',
                          add_pure_signal_bin:bool=False, max_unc_pure_signal:float=0.1, verbose:bool=True) -> np.ndarray:
    r'''
    Define bin-edges for binning particle process samples as a function of event class prediction (signal | background) such that the statistical
    uncertainties on per bin yields are below max_unc for each considered sample.

    The prediction axis is scanned downwards from the upper bound in steps of `step_sz`. A new edge is placed at the first scan point for which every
    considered group (sample, or class if `compact_samples`) has at least `int((1/max_unc)**2)` entries between the scan point (exclusive) and the
    previously placed edge (inclusive). Edges at 1 and 0 are always included.

    Arguments:
        df: DataFrame containing the data
        max_unc: maximum fractional statistical uncertainty to allow when defining bins
        consider_samples: if set, only listed samples are considered when defining bins. If None, all values found in `sample_name` are used.
            Ignored when `compact_samples` is true
        step_sz: resolution of scan along event prediction
        pred_name: column to use as event class prediction
        sample_name: column to use as particle process for each event. Only read when `compact_samples` is false
        compact_samples: if true, will not consider samples when computing bin edges, only the class
        class_name: name of column to use as class indicator
        add_pure_signal_bin: if true will attempt to add a bin which only contains signal (class 1) if the fractional bin-fill uncertainty would be
            less than max_unc_pure_signal
        max_unc_pure_signal: maximum fractional statistical uncertainty to allow when defining pure-signal bins
        verbose: whether to show progress bar

    Returns:
        sorted array of bin edges
    '''

    # TODO: allow option for stepping through each event, rather than fixed resolution scan
    # Minimum per-group population for a bin; int() truncates, so a non-integral (1/max_unc)**2 is rounded down
    n_min = int((1/max_unc)**2)
    edges,ub,lb = [1.],1.,0.

    if add_pure_signal_bin:
        # First scan-grid point strictly above the highest class-0 prediction
        max_zero = df.loc[df[class_name] == 0, pred_name].max()
        max_zero = (np.floor(max_zero/step_sz)+1)*step_sz
        if len(df[(df[class_name] == 1) & (df[pred_name] > max_zero)]) >= int((1/max_unc_pure_signal)**2):
            edges.append(max_zero)
            ub = max_zero  # Remaining scan starts below the pure-signal bin

    # Group membership does not depend on the scan point, so select each group's predictions once up front
    if compact_samples:
        masks = [df[class_name] == c for c in df[class_name].unique()]
    else:
        # Sample column is only needed on this path; resolving it here lets compact_samples work without it
        if consider_samples is None: consider_samples = set(df[sample_name])
        masks = [df[sample_name] == s for s in consider_samples]
    group_preds = [df.loc[m, pred_name] for m in masks]

    scan = np.linspace(ub, lb+step_sz, int((ub-lb)/step_sz))
    if verbose: scan = progress_bar(scan, display=verbose, leave=False)  # Skip the wrapper entirely when nothing is displayed
    for i in scan:
        # Count entries per group in the candidate bin (i, last edge]
        pops = [int(((p > i) & (p <= edges[-1])).sum()) for p in group_preds]
        if np.min(pops) >= n_min: edges.append(i)
    edges.append(0)
    return np.sort(edges)