Source code for lumin.evaluation.ams

import numpy as np
import pandas as pd
from typing import Tuple
from fastprogress import progress_bar

import torch
from torch import Tensor

__all__ = ['calc_ams', 'calc_ams_torch', 'ams_scan_quick', 'ams_scan_slow']


[docs]def calc_ams(s:float, b:float, br:float=0, unc_b:float=0) -> float:
    r'''
    Compute Approximate Median Significance (https://arxiv.org/abs/1007.1727)

    Arguments:
        s: signal weight
        b: background weight
        br: background offset bias
        unc_b: fractional systemtatic uncertainty on background

    Returns:
        Approximate Median Significance if b > 0 else -1
    '''

    if b == 0: return -1
    if not unc_b:
        radicand = 2*((s+b+br)*np.log(1.0+s/(b+br))-s)
    else:
        sigma_b_2 = np.square(unc_b*b)
        radicand = 2*(((s+b)*np.log((s+b)*(b+sigma_b_2)/((b**2)+((s+b)*sigma_b_2))))-(((b**2)/sigma_b_2)*np.log(1+((sigma_b_2*s)/(b*(b+sigma_b_2))))))
    return np.sqrt(radicand) if radicand > 0 else -1


[docs]def calc_ams_torch(s:Tensor, b:Tensor, br:float=0, unc_b:float=0) -> Tensor:
    r'''
    Compute Approximate Median Significance (https://arxiv.org/abs/1007.1727) using Tensor inputs

    Arguments:
        s: signal weight
        b: background weight
        br: background offset bias
        unc_b: fractional systemtatic uncertainty on background

    Returns:
        Approximate Median Significance if b > 0 else 1e-18 * s
    '''

    '''Compute Approximate Median Significance with torch for signal (background) weight s (b),
    fractional systemtatic uncertainty unc_b, and offset br'''
    if b == 0: return 1e-18*s
    if not unc_b:
        radicand = 2*((s+b+br)*torch.log(1.0+s/(b+br))-s)
    else:
        sigma_b_2 = torch.square(unc_b*b)
        radicand = 2*(((s+b)*torch.log((s+b)*(b+sigma_b_2)/((b**2)+((s+b)*sigma_b_2))))-(((b**2)/sigma_b_2)*torch.log(1+((sigma_b_2*s)/(b*(b+sigma_b_2))))))
    return torch.sqrt(radicand) if radicand > 0 else 1e-18*s


[docs]def ams_scan_quick(df:pd.DataFrame, wgt_factor:float=1, br:float=0, syst_unc_b:float=0,
                   pred_name:str='pred', targ_name:str='gen_target', wgt_name:str='gen_weight') -> Tuple[float,float]:
    r'''
    Scan accross a range of possible prediction thresholds in order to maximise the Approximate Median Significance (https://arxiv.org/abs/1007.1727).
    Note that whilst this method is quicker than :meth:`~lumin.evaluation.ams.ams_scan_slow`, it sufferes from float precison.
    Not recommended for final evaluation.
    
    Arguments:
        df: DataFrame containing prediction data
        wgt_factor: factor to reweight signal and background weights
        br: background offset bias
        syst_unc_b: fractional systemtatic uncertainty on background
        pred_name: column to use as predictions
        targ_name: column to use as truth labels for signal and background
        wgt_name: column to use as weights for signal and background events

    Returns:
        maximum AMS
        prediction threshold corresponding to maximum AMS
    '''

    max_ams, threshold = 0, 0.0
    df = df.sort_values(by=[pred_name])
    s = np.sum(df.loc[(df[targ_name] == 1), wgt_name])
    b = np.sum(df.loc[(df[targ_name] == 0), wgt_name])

    for i, cut in enumerate(df[pred_name]):
        ams = calc_ams(max(0, s*wgt_factor), max(0, b*wgt_factor), br, syst_unc_b)
        if ams > max_ams: max_ams, threshold = ams, cut
        if df[targ_name].values[i]: s -= df[wgt_name].values[i]
        else:                       b -= df[wgt_name].values[i]        
    return max_ams, threshold


[docs]def ams_scan_slow(df:pd.DataFrame, wgt_factor:float=1, br:float=0, syst_unc_b:float=0, 
                  use_stat_unc:bool=False, start_cut:float=0.9, min_events:int=10,
                  pred_name:str='pred', targ_name:str='gen_target', wgt_name:str='gen_weight', show_prog:bool=True) -> Tuple[float,float]:
    r'''
    Scan accross a range of possible prediction thresholds in order to maximise the Approximate Median Significance (https://arxiv.org/abs/1007.1727).
    Note that whilst this method is slower than :meth:`~lumin.evaluation.ams.ams_scan_quick`, it does not suffer as much from float precison.
    Additionally it allows one to account for statistical uncertainty in AMS calculation.
    
    Arguments:
        df: DataFrame containing prediction data
        wgt_factor: factor to reweight signal and background weights
        br: background offset bias
        syst_unc_b: fractional systemtatic uncertainty on background
        use_stat_unc: whether to account for the statistical uncertainty on the background
        start_cut: minimum prediction to consider; useful for speeding up scan
        min_events: minimum number of background unscaled events required to pass threshold
        pred_name: column to use as predictions
        targ_name: column to use as truth labels for signal and background
        wgt_name: column to use as weights for signal and background events
        show_prog: whether to display progress and ETA of scan

    Returns:
        maximum AMS
        prediction threshold corresponding to maximum AMS
    '''
    
    max_ams, threshold = 0, 0.0
    sig, bkg = df[df[targ_name] == 1], df[df[targ_name] == 0]
    syst_unc_b2 = np.square(syst_unc_b)

    for i, cut in enumerate(progress_bar(df.loc[df[pred_name] >= start_cut, pred_name].values, display=show_prog, leave=show_prog)):
        bkg_pass = bkg.loc[(bkg[pred_name] >= cut), wgt_name]
        n_bkg = len(bkg_pass)
        if n_bkg < min_events: continue

        s = np.sum(sig.loc[(sig[pred_name] >= cut), wgt_name])
        b = np.sum(bkg_pass)
        if use_stat_unc: unc_b = np.sqrt(syst_unc_b2+(1/n_bkg))
        else:            unc_b = syst_unc_b

        ams = calc_ams(s*wgt_factor, b*wgt_factor, br, unc_b)
        if ams > max_ams: max_ams, threshold = ams, cut      
    return max_ams, threshold
Source code for lumin.evaluation.ams

Docs

Tutorials