Shortcuts

Source code for lumin.nn.metrics.reg_eval

import numpy as np
from typing import Optional, Callable
import pandas as pd
from statsmodels.stats.weightstats import DescrStatsW

from ...utils.statistics import bootstrap_stats
from .eval_metric import EvalMetric
from ..data.fold_yielder import FoldYielder

__all__ = ['RegPull', 'RegAsProxyPull']


class RegPull(EvalMetric):
    r'''
    Evaluation metric returning either the mean or the standard deviation of the delta or pull of a feature which
    is being directly regressed to. The statistic may optionally be estimated via bootstrap resampling.

    Arguments:
        return_mean: whether to return the mean or the standard deviation
        use_bootstrap: whether to bootstrap resample the validation fold when computing the statistic
        use_weights: whether to actually use weights if wgt_name is set
        use_pull: whether to return the pull (differences / targets) or delta (differences)
        targ_name: name of group in fold file containing regression targets
        wgt_name: name of group in fold file containing datapoint weights

    Examples::
        >>> mean_pull = RegPull(return_mean=True, use_bootstrap=True,
        ...                     use_pull=True)
        >>>
        >>> std_delta = RegPull(return_mean=False, use_bootstrap=True,
        ...                     use_pull=False)
        >>>
        >>> mean_pull = RegPull(return_mean=True, use_bootstrap=False,
        ...                     use_pull=True, wgt_name='weights')
    '''

    # TODO: Check how this handles multi-target regression, may need to adjust averaging axis & DescrStatsW may not handle multi-dimensional data well.
    # TODO: Remove use_weights and rely on whether wgt_name is None

    def __init__(self, return_mean: bool, use_bootstrap: bool = False, use_weights: bool = True,
                 use_pull: bool = True, targ_name: str = 'targets', wgt_name: Optional[str] = None):
        super().__init__(targ_name=targ_name, wgt_name=wgt_name)
        self.use_bootstrap = use_bootstrap
        self.use_weights = use_weights
        self.return_mean = return_mean
        self.use_pull = use_pull

    def _compute(self, df: pd.DataFrame) -> float:
        r'''
        Compute the configured statistic from a DataFrame of predictions and targets.

        Reads the `pred` and `gen_target` columns (and `gen_weight`, when present and `use_weights` is set) and
        writes the per-row delta or pull into a `diff` column of the DataFrame that was passed in.

        Arguments:
            df: DataFrame containing at least `pred` and `gen_target` columns

        Returns:
            Mean or standard deviation of the `diff` column, according to `return_mean`
        '''

        residual = df['pred'] - df['gen_target']
        if self.use_pull:
            residual = residual / df['gen_target']
        df['diff'] = residual

        # Weights are normalised to unit sum; they stay None when weighting is disabled or unavailable
        weights = None
        if self.use_weights and 'gen_weight' in df.columns:
            raw_weights = df['gen_weight'].values.astype('float64')
            weights = raw_weights / raw_weights.sum()

        if self.use_bootstrap:
            bs_args = {'data': df['diff'], 'mean': self.return_mean, 'std': True, 'n': 100}
            if weights is not None:
                bs_args['weights'] = weights
            bs = bootstrap_stats(bs_args)
            stat_key = '_mean' if self.return_mean else '_std'
            return np.mean(bs[stat_key])

        if self.return_mean:
            return np.average(df['diff'], weights=weights)

        # Unit-sum weights are rescaled so that they sum to the number of rows before being given to DescrStatsW
        # NOTE(review): presumably so the ddof=1 correction sees the true sample size - confirm
        scaled_weights = None if weights is None else weights * len(weights)
        return DescrStatsW(df['diff'].values, ddof=1, weights=scaled_weights).std

    def evaluate(self, fy: FoldYielder, idx: int, y_pred: np.ndarray) -> float:
        r'''
        Compute statistic on fold using provided predictions.

        Arguments:
            fy: :class:`~lumin.nn.data.fold_yielder.FoldYielder` interfacing to data
            idx: fold index corresponding to fold for which y_pred was computed
            y_pred: predictions for fold

        Returns:
            Statistic set in initialisation computed on the chosen fold

        Examples::
            >>> mean = mean_pull.evaluate(train_fy, val_id, val_preds)
        '''

        fold_df = self.get_df(fy, idx, y_pred)
        return self._compute(fold_df)
class RegAsProxyPull(RegPull):
    r'''
    Evaluation metric returning either the mean or the standard deviation of the delta or pull of a feature which
    is being indirectly regressed to via a proxy function. The statistic may optionally be estimated via bootstrap
    resampling.

    Arguments:
        proxy_func: function which acts on regression predictions and adds pred and gen_target columns to the
            Pandas DataFrame it is passed which contains prediction columns pred_{i}
        return_mean: whether to return the mean or the standard deviation
        use_bootstrap: whether to bootstrap resample the validation fold when computing the statistic
        use_weights: whether to actually use weights if wgt_name is set
        use_pull: whether to return the pull (differences / targets) or delta (differences)
        targ_name: name of group in fold file containing regression targets
        wgt_name: name of group in fold file containing datapoint weights

    Examples::
        >>> def reg_proxy_func(df):
        >>>     df['pred'] = calc_pair_mass(df, (1.77682, 1.77682),
        ...                                 {targ[targ.find('_t')+3:]:
        ...                                 f'pred_{i}' for i, targ
        ...                                 in enumerate(targ_feats)})
        >>>     df['gen_target'] = 125
        >>>
        >>> std_delta = RegAsProxyPull(proxy_func=reg_proxy_func,
        ...                            return_mean=False, use_pull=False)
    '''

    def __init__(self, proxy_func: Callable[[pd.DataFrame], None], return_mean: bool, use_bootstrap: bool = False,
                 use_weights: bool = True, use_pull: bool = True, targ_name: str = 'targets',
                 wgt_name: Optional[str] = None):
        super().__init__(
            return_mean=return_mean,
            use_bootstrap=use_bootstrap,
            use_weights=use_weights,
            use_pull=use_pull,
            targ_name=targ_name,
            wgt_name=wgt_name,
        )
        self.proxy_func = proxy_func

    def evaluate(self, fy: FoldYielder, idx: int, y_pred: np.ndarray) -> float:
        r'''
        Compute statistic on fold using provided predictions, after the proxy function has been applied to them.

        Arguments:
            fy: :class:`~lumin.nn.data.fold_yielder.FoldYielder` interfacing to data
            idx: fold index corresponding to fold for which y_pred was computed
            y_pred: predictions for fold

        Returns:
            Statistic set in initialisation computed on the chosen fold

        Examples::
            >>> std = std_delta.evaluate(train_fy, val_id, val_preds)
        '''

        fold_df = self.get_df(fy, idx, y_pred)
        # proxy_func returns nothing: it is called for its in-place changes to the DataFrame
        self.proxy_func(fold_df)
        return self._compute(fold_df)
Read the Docs v: v0.4.0.1
Versions
latest
stable
v0.4.0.1
v0.3.1
Downloads
pdf
html
epub
On Read the Docs
Project Home
Builds

Free document hosting provided by Read the Docs.

Docs

Access comprehensive developer and user documentation for LUMIN

View Docs

Tutorials

Get tutorials for beginner and advanced researchers demonstrating many of the features of LUMIN

View Tutorials