Source code for lumin.nn.metrics.reg_eval

import numpy as np
from typing import Optional, Callable
import pandas as pd
from statsmodels.stats.weightstats import DescrStatsW
from fastcore.all import store_attr

from ...utils.statistics import bootstrap_stats
from .eval_metric import EvalMetric, OldEvalMetric
from ..data.fold_yielder import FoldYielder

# Names exported by `from ... import *`; OldRegPull and OldRegAsProxyPull are defined in this module but are not listed here.
__all__ = ['RegPull', 'RegAsProxyPull']


class OldRegPull(OldEvalMetric):
    r'''
    Legacy metric returning the mean or the standard deviation of the delta or pull of some feature which is being directly regressed to.
    The statistic can optionally be estimated using bootstrap resampling of the validation data.

    Arguments:
        return_mean: if True the mean is returned, otherwise the standard deviation
        use_bootstrap: whether to bootstrap-resample the validation fold when computing the statistic
        use_weights: whether to actually use weights if wgt_name is set
        use_pull: if True compute the pull (differences / targets), otherwise the delta (differences)
        targ_name: name of group in fold file containing regression targets
        wgt_name: name of group in fold file containing datapoint weights

    Examples::
        >>> mean_pull  = RegPull(return_mean=True, use_bootstrap=True,
        ...                      use_pull=True)
        >>>
        >>> std_delta  = RegPull(return_mean=False, use_bootstrap=True,
        ...                      use_pull=False)
        >>>
        >>> mean_pull  = RegPull(return_mean=True, use_bootstrap=False,
        ...                      use_pull=True, wgt_name='weights')

    .. Attention:: This class is deprecated in favour of :class:`~lumin.nn.metrics.reg_eval.RegPull`.
        It is a copy of the old `RegPull` class used in lumin<=0.7.0.
        It will be removed in V0.8
    '''

    # XXX remove in V0.8

    # TODO: Check how this handles multi-target regression, may need to adjust averaging axis & DescrStatsW may not handle multi-dimensional data well.
    # TODO: Remove use_weights and rely on whether wgt_name is None

    def __init__(self, return_mean:bool, use_bootstrap:bool=False, use_weights:bool=True, use_pull:bool=True, targ_name:str='targets',
                 wgt_name:Optional[str]=None):
        super().__init__(targ_name=targ_name, wgt_name=wgt_name)
        self.return_mean = return_mean
        self.use_bootstrap = use_bootstrap
        self.use_weights = use_weights
        self.use_pull = use_pull

    def _compute(self, df:pd.DataFrame) -> float:
        r'''
        Compute the configured statistic from a DataFrame of predictions and targets.
        A `diff` column holding the delta (or pull) is written into `df`.

        Arguments:
            df: DataFrame with `pred` and `gen_target` columns and, optionally, a `gen_weight` column

        Returns:
            Mean or standard deviation of the delta or pull
        '''

        residual = df['pred']-df['gen_target']
        if self.use_pull:
            residual = residual/df['gen_target']
        df['diff'] = residual

        # Weights are only used when requested and present; they are normalised to sum to one
        weights = None
        if self.use_weights and 'gen_weight' in df.columns:
            raw_wgts = df['gen_weight'].values.astype('float64')
            weights = raw_wgts/raw_wgts.sum()

        if self.use_bootstrap:
            bs_args = {'data': df['diff'], 'mean': self.return_mean, 'std': True, 'n':100}
            if weights is not None:
                bs_args['weights'] = weights
            resampled = bootstrap_stats(bs_args)
            return np.mean(resampled['_mean' if self.return_mean else '_std'])

        if self.return_mean:
            return np.average(df['diff'], weights=weights)

        # The unit-sum weights are rescaled to sum to the number of entries before being passed to DescrStatsW
        scaled_wgts = None if weights is None else weights*len(weights)
        return DescrStatsW(df['diff'].values, ddof=1, weights=scaled_wgts).std

    def evaluate(self, fy:FoldYielder, idx:int, y_pred:np.ndarray) -> float:
        r'''
        Compute statistic on fold using provided predictions.

        Arguments:
            fy: :class:`~lumin.nn.data.fold_yielder.FoldYielder` interfacing to data
            idx: fold index corresponding to fold for which y_pred was computed
            y_pred: predictions for fold

        Returns:
            Statistic set in initialisation computed on the chosen fold

        Examples::
            >>> mean = mean_pull.evaluate(train_fy, val_id, val_preds)
        '''

        fold_df = self.get_df(fy, idx, y_pred)
        return self._compute(fold_df)


class RegPull(EvalMetric):
    r'''
    Compute mean or standard deviation of delta or pull of some feature which is being directly regressed to.
    Optionally, use bootstrap resampling on validation data.

    Arguments:
        return_mean: whether to return the mean or the standard deviation
        use_bootstrap: whether to bootstrap-resample the validation data when computing the statistic
        use_pull: whether to return the pull (differences / targets) or delta (differences)
        name: optional name for metric, otherwise will be inferred from `use_pull`
        main_metric: whether this metric should be treated as the primary metric for SaveBest and EarlyStopping
            Will automatically set the first EvalMetric to be main if multiple primary metrics are submitted

    Examples::
        >>> mean_pull  = RegPull(return_mean=True, use_bootstrap=True,
        ...                      use_pull=True)
        >>>
        >>> std_delta  = RegPull(return_mean=False, use_bootstrap=True,
        ...                      use_pull=False)
        >>>
        >>> mean_delta = RegPull(return_mean=True, use_bootstrap=False,
        ...                      use_pull=False, name='mean_delta')
    '''

    # TODO: Check how this handles multi-target regression, may need to adjust averaging axis & DescrStatsW may not handle multi-dimensional data well.

    def __init__(self, return_mean:bool, use_bootstrap:bool=False, use_pull:bool=True, name:Optional[str]=None, main_metric:bool=True):
        if name is None:
            name = 'pull' if use_pull else 'delta'
        super().__init__(name=name, lower_metric_better=True, main_metric=main_metric)
        # name and main_metric are handed to the parent constructor above; the remaining arguments are stored on the instance here
        store_attr(but=['name', 'main_metric'])

    def _compute(self, preds:np.ndarray, targets:np.ndarray, weights:Optional[np.ndarray]=None) -> float:
        r'''
        Compute the configured statistic of the delta (or pull) between predictions and targets.

        Arguments:
            preds: predicted values
            targets: target values
            weights: optional per-datapoint weights; normalised to sum to one before use

        Returns:
            Mean or standard deviation of the delta or pull
        '''

        delta = preds-targets
        # Out-of-place division: an in-place `/=` raises a casting error when delta has an integer dtype
        if self.use_pull: delta = delta/targets

        if weights is not None:
            weights = weights.astype('float64')
            weights = weights/weights.sum()

        if self.use_bootstrap:
            bs = bootstrap_stats({'data':delta, 'mean':True, 'std':True, 'n':100, 'weights':weights})
            return np.mean(bs['_mean']) if self.return_mean else np.mean(bs['_std'])

        if self.return_mean:
            return np.average(delta, weights=weights)

        # The unit-sum weights are rescaled to sum to the number of entries before being passed to DescrStatsW
        return DescrStatsW(delta, ddof=1, weights=weights*len(weights) if weights is not None else None).std

    def evaluate(self) -> float:
        r'''
        Compute mean or width of regression error using the `preds`, `targets`, and `weights` attributes of the metric.

        Returns:
            Mean or width of regression error
        '''

        return self._compute(self.preds, self.targets, self.weights)


class OldRegAsProxyPull(OldRegPull):
    r'''
    Legacy metric returning the mean or the standard deviation of the delta or pull of some feature which is being indirectly regressed to
    via a proxy function. The statistic can optionally be estimated using bootstrap resampling of the validation data.

    Arguments:
        proxy_func: function which acts on regression predictions and adds pred and gen_target columns to the Pandas DataFrame it is passed which contains
            prediction columns pred_{i}
        return_mean: if True the mean is returned, otherwise the standard deviation
        use_bootstrap: whether to bootstrap-resample the validation fold when computing the statistic
        use_weights: whether to actually use weights if wgt_name is set
        use_pull: if True compute the pull (differences / targets), otherwise the delta (differences)
        targ_name: name of group in fold file containing regression targets
        wgt_name: name of group in fold file containing datapoint weights

    Examples::
        >>> def reg_proxy_func(df):
        >>>     df['pred'] = calc_pair_mass(df, (1.77682, 1.77682),
        ...                                 {targ[targ.find('_t')+3:]:
        ...                                 f'pred_{i}' for i, targ
        ...                                 in enumerate(targ_feats)})
        >>>     df['gen_target'] = 125
        >>>
        >>> std_delta = RegAsProxyPull(proxy_func=reg_proxy_func,
        ...                            return_mean=False, use_pull=False)

    .. Attention:: This class is deprecated in favour of :class:`~lumin.nn.metrics.reg_eval.RegAsProxyPull`.
        It is a copy of the old `RegAsProxyPull` class used in lumin<=0.7.0.
        It will be removed in V0.8
    '''

    # XXX remove in V0.8

    def __init__(self, proxy_func:Callable[[pd.DataFrame],None], return_mean:bool, use_bootstrap:bool=False, use_weights:bool=True, 
                 use_pull:bool=True, targ_name:str='targets', wgt_name:Optional[str]=None):
        parent_kwargs = dict(use_bootstrap=use_bootstrap, use_weights=use_weights, return_mean=return_mean, use_pull=use_pull,
                             targ_name=targ_name, wgt_name=wgt_name)
        super().__init__(**parent_kwargs)
        self.proxy_func = proxy_func

    def evaluate(self, fy:FoldYielder, idx:int, y_pred:np.ndarray) -> float:
        r'''
        Compute statistic on fold using provided predictions, after passing the fold DataFrame through the proxy function.

        Arguments:
            fy: :class:`~lumin.nn.data.fold_yielder.FoldYielder` interfacing to data
            idx: fold index corresponding to fold for which y_pred was computed
            y_pred: predictions for fold

        Returns:
            Statistic set in initialisation computed on the chosen fold

        Examples::
            >>> mean = mean_pull.evaluate(train_fy, val_id, val_preds)
        '''

        fold_df = self.get_df(fy, idx, y_pred)
        # proxy_func is called for its effect on fold_df; its return value is not used
        self.proxy_func(fold_df)
        return self._compute(fold_df)


class RegAsProxyPull(RegPull):
    r'''
    Compute mean or standard deviation of delta or pull of some feature which is being indirectly regressed to via a proxy function.
    Optionally, use bootstrap resampling on validation data.

    Arguments:
        proxy_func: function which acts on regression predictions and adds pred and gen_target columns to the Pandas DataFrame it is passed which contains
            prediction columns pred_{i}
        return_mean: whether to return the mean or the standard deviation
        targ_name: optional name of group in fold file containing regression targets
        use_bootstrap: whether to bootstrap-resample the validation data when computing the statistic
        use_pull: whether to return the pull (differences / targets) or delta (differences)
        name: optional name for metric, otherwise will be inferred from `use_pull`
        main_metric: whether this metric should be treated as the primary metric for SaveBest and EarlyStopping
            Will automatically set the first EvalMetric to be main if multiple primary metrics are submitted

    Examples::
        >>> def reg_proxy_func(df):
        >>>     df['pred'] = calc_pair_mass(df, (1.77682, 1.77682),
        ...                                 {targ[targ.find('_t')+3:]:
        ...                                 f'pred_{i}' for i, targ
        ...                                 in enumerate(targ_feats)})
        >>>     df['gen_target'] = 125
        >>>
        >>> std_delta = RegAsProxyPull(proxy_func=reg_proxy_func,
        ...                            return_mean=False, use_pull=False)
    '''

    def __init__(self, proxy_func:Callable[[pd.DataFrame],None], return_mean:bool, targ_name:Optional[str]=None, use_bootstrap:bool=False, 
                 use_pull:bool=True, name:Optional[str]=None, main_metric:bool=True):
        if name is None:
            name = 'pull' if use_pull else 'delta'
        # Forward the caller's own return_mean and the resolved name to the parent constructor
        super().__init__(return_mean=return_mean, use_bootstrap=use_bootstrap, use_pull=use_pull, name=name, main_metric=main_metric)
        # return_mean, use_bootstrap and use_pull are already stored by RegPull.__init__; store the remaining arguments
        # (proxy_func, targ_name, name) on the instance here
        store_attr(but=['return_mean', 'use_bootstrap', 'use_pull', 'main_metric'])

    def evaluate(self) -> float:
        r'''
        Compute the statistic after applying the proxy function.
        The DataFrame returned by `get_df` is passed to `proxy_func`, and the statistic is then computed from its `pred` and `gen_target` columns,
        using the `gen_weight` column as weights if it is present.

        Returns:
            Statistic set in initialisation (mean or standard deviation of the delta or pull)
        '''

        df = self.get_df()
        # proxy_func is called for its effect on df; its return value is not used
        self.proxy_func(df)
        return self._compute(df['pred'].values, df['gen_target'].values, df['gen_weight'].values if 'gen_weight' in df.columns else None)
Source code for lumin.nn.metrics.reg_eval

Docs

Tutorials