Source code for lumin.inference.summary_stat

import numpy as np
import pandas as pd
from typing import List, Optional
from fastprogress import progress_bar

__all__ = ['bin_binary_class_pred']

def bin_binary_class_pred(df:pd.DataFrame, max_unc:float, consider_samples:Optional[List[str]]=None, step_sz:float=1e-3, pred_name:str='pred',
                          sample_name:str='gen_sample', compact_samples:bool=False, class_name:str='gen_target', add_pure_signal_bin:bool=False,
                          max_unc_pure_signal:float=0.1, verbose:bool=True) -> List[float]:
    r'''
    Define bin-edges for binning particle process samples as a function of event class prediction (signal | background)
    such that the statistical uncertainties on per bin yields are below max_unc for each considered sample.

    The prediction axis is scanned downwards from the upper bound in steps of roughly `step_sz`. A new edge is placed at the
    first scan point for which every considered group (class or sample) has at least `int((1/max_unc)**2)` events between that
    point (exclusive) and the previously placed edge (inclusive). A final edge at 0 is always added, so the lowest bin is not
    guaranteed to satisfy the population requirement.

    Arguments:
        df: DataFrame containing the data
        max_unc: maximum fractional statistical uncertainty to allow when defining bins
        consider_samples: if set, only listed samples are considered when defining bins.
            If None, all values found in the `sample_name` column are used. Ignored when `compact_samples` is true
        step_sz: resolution of scan along event prediction
        pred_name: column to use as event class prediction
        sample_name: column to use as particle process for each event. Only accessed when `compact_samples` is false
        compact_samples: if true, will not consider samples when computing bin edges, only the class
        class_name: name of column to use as class indicator
        add_pure_signal_bin: if true will attempt to add a bin which only contains signal (class 1)
            if the fractional bin-fill uncertainty would be less than max_unc_pure_signal
        max_unc_pure_signal: maximum fractional statistical uncertainty to allow when defining pure-signal bins
        verbose: whether to show progress bar

    Returns:
        bin edges sorted in ascending order (a NumPy array, as produced by `np.sort`)
    '''

    # TODO: allow option for stepping through each event, rather than fixed resolution scan
    n_min = int((1/max_unc)**2)  # Minimum population per group for the requested fractional uncertainty
    edges, ub, lb = [1.], 1., 0.
    preds = df[pred_name]

    # Group-membership masks do not change during the scan, so build them once.
    # The sample column is only touched when samples are actually used, so that
    # compact_samples=True works on data which has no sample column.
    if compact_samples:
        classes = df[class_name]
        group_masks = [classes == c for c in classes.unique()]
    else:
        if consider_samples is None: consider_samples = set(df[sample_name])
        samples = df[sample_name]
        group_masks = [samples == s for s in consider_samples]

    if add_pure_signal_bin:
        # Smallest multiple of step_sz lying strictly above the highest background (class 0) prediction
        max_zero = df.loc[df[class_name] == 0, pred_name].max()
        max_zero = (np.floor(max_zero/step_sz)+1)*step_sz
        n_pure = int(((df[class_name] == 1) & (preds > max_zero)).sum())
        if n_pure >= int((1/max_unc_pure_signal)**2):
            edges.append(max_zero)
            ub = max_zero  # Remaining bins are searched for below the pure-signal bin

    scan_points = np.linspace(ub, lb+step_sz, int((ub-lb)/step_sz))
    # Only build the progress bar when it will actually be displayed
    scan = progress_bar(scan_points, display=verbose, leave=False) if verbose else scan_points
    for lower in scan:
        in_bin = (preds > lower) & (preds <= edges[-1])
        pops = [int((mask & in_bin).sum()) for mask in group_masks]
        if np.min(pops) >= n_min: edges.append(lower)
    edges.append(0)
    return np.sort(edges)
Read the Docs v: v0.7.1
On Read the Docs
Project Home

Free document hosting provided by Read the Docs.


Access comprehensive developer and user documentation for LUMIN

View Docs


Get tutorials for beginner and advanced researchers demonstrating many of the features of LUMIN

View Tutorials