Shortcuts

Source code for lumin.nn.metrics.reg_eval

from typing import Callable, Optional

import numpy as np
import pandas as pd
from fastcore.all import store_attr
from statsmodels.stats.weightstats import DescrStatsW

from ...utils.statistics import bootstrap_stats
from .eval_metric import EvalMetric

# Public API of this module: the two regression pull/delta metrics defined below.
__all__ = ["RegPull", "RegAsProxyPull"]


class RegPull(EvalMetric):
    r"""
    Compute mean or standard deviation of delta or pull of some feature which is being directly regressed to.
    Optionally, use bootstrap resampling on validation data.

    Arguments:
        return_mean: whether to return the mean or the standard deviation
        use_bootstrap: whether to bootstrap resample the validation fold when computing the statistic
        use_pull: whether to return the pull (differences / targets) or delta (differences)
        name: optional name for metric, otherwise will be inferred from `use_pull`
        main_metric: whether this metric should be treated as the primary metric for SaveBest and EarlyStopping
            Will automatically set the first EvalMetric to be main if multiple primary metrics are submitted

    Examples::
        >>> mean_pull = RegPull(return_mean=True, use_bootstrap=True,
        ...                     use_pull=True)
        >>>
        >>> std_delta = RegPull(return_mean=False, use_bootstrap=True,
        ...                     use_pull=False)
        >>>
        >>> mean_pull = RegPull(return_mean=True, use_bootstrap=False,
        ...                     use_pull=True)
    """

    # TODO: Check how this handles multi-target regression, may need to adjust averaging axis
    # & DescrStatsW may not handle multi-dimensional data well.

    def __init__(
        self,
        return_mean: bool,
        use_bootstrap: bool = False,
        use_pull: bool = True,
        name: Optional[str] = None,
        main_metric: bool = True,
    ) -> None:
        if name is None:
            name = "pull" if use_pull else "delta"
        super().__init__(name=name, lower_metric_better=True, main_metric=main_metric)
        # `name` and `main_metric` are handed to the parent above; everything else is stored on self.
        store_attr(but=["name", "main_metric"])

    def _compute(self, preds: np.ndarray, targets: np.ndarray, weights: Optional[np.ndarray] = None) -> float:
        r"""
        Compute the configured statistic (mean or standard deviation) of the delta or pull.

        Arguments:
            preds: predicted values
            targets: target values, used as the denominator when `use_pull` is set
            weights: optional per-sample weights; normalised to unit sum before use

        Returns:
            Mean or standard deviation of (preds - targets), divided by targets if `use_pull` is set
        """

        delta = preds - targets
        if self.use_pull:
            # Out-of-place division: an in-place `/=` raises a casting error for integer-typed arrays.
            delta = delta / targets
        if weights is not None:
            weights = weights.astype("float64")
            weights = weights / weights.sum()

        if self.use_bootstrap:
            bs = bootstrap_stats({"data": delta, "mean": True, "std": True, "n": 100, "weights": weights})
            return np.mean(bs["_mean"]) if self.return_mean else np.mean(bs["_std"])

        if self.return_mean:
            return np.average(delta, weights=weights)

        # Weights are rescaled to sum to the number of samples before being passed to DescrStatsW
        # (presumably so the ddof=1 correction is relative to the sample count - TODO confirm).
        return DescrStatsW(delta, ddof=1, weights=weights * len(weights) if weights is not None else None).std

    def evaluate(self) -> float:
        r"""
        Compute mean or width of regression error.

        Returns:
            Mean or width of regression error
        """

        return self._compute(self.preds, self.targets, self.weights)
class RegAsProxyPull(RegPull):
    r"""
    Compute mean or standard deviation of delta or pull of some feature which is being indirectly regressed to via a
    proxy function. Optionally, use bootstrap resampling on validation data.

    Arguments:
        proxy_func: function which acts on regression predictions and adds pred and gen_target columns to the Pandas
            DataFrame it is passed which contains prediction columns pred_{i}.
            If the DataFrame also contains a gen_weight column, it is used as per-sample weights.
        return_mean: whether to return the mean or the standard deviation
        targ_name: optional name of group in fold file containing regression targets
        use_bootstrap: whether to bootstrap resample the validation fold when computing the statistic
        use_pull: whether to return the pull (differences / targets) or delta (differences)
        name: optional name for metric, otherwise will be inferred from `use_pull`
        main_metric: whether this metric should be treated as the primary metric for SaveBest and EarlyStopping
            Will automatically set the first EvalMetric to be main if multiple primary metrics are submitted

    Examples::
        >>> def reg_proxy_func(df):
        >>>     df['pred'] = calc_pair_mass(df, (1.77682, 1.77682),
        ...                                 {targ[targ.find('_t')+3:]:
        ...                                 f'pred_{i}' for i, targ
        ...                                 in enumerate(targ_feats)})
        >>>     df['gen_target'] = 125
        >>>
        >>> std_delta = RegAsProxyPull(proxy_func=reg_proxy_func,
        ...                            return_mean=False, use_pull=False)
    """

    def __init__(
        self,
        proxy_func: Callable[[pd.DataFrame], None],
        return_mean: bool,
        targ_name: Optional[str] = None,
        use_bootstrap: bool = False,
        use_pull: bool = True,
        name: Optional[str] = None,
        main_metric: bool = True,
    ) -> None:
        if name is None:
            name = "pull" if use_pull else "delta"
        # Forward the real `return_mean` and `name` to the parent rather than relying on a later
        # store_attr call to overwrite wrongly-initialised values.
        super().__init__(
            return_mean=return_mean,
            use_bootstrap=use_bootstrap,
            use_pull=use_pull,
            name=name,
            main_metric=main_metric,
        )
        # Everything listed here is already handled by the parent; only proxy_func and targ_name remain.
        store_attr(but=["return_mean", "use_bootstrap", "use_pull", "name", "main_metric"])

    def evaluate(self) -> float:
        r"""
        Compute statistic on the current data after applying the proxy function.

        The DataFrame returned by `get_df` is passed to `proxy_func`, which is expected to add `pred` and
        `gen_target` columns in place. A `gen_weight` column, if present, is used as per-sample weights.

        Returns:
            Statistic set in initialisation, computed from the `pred` and `gen_target` columns
        """

        df = self.get_df()
        self.proxy_func(df)
        return self._compute(
            df["pred"].values,
            df["gen_target"].values,
            df["gen_weight"].values if "gen_weight" in df.columns else None,
        )

Docs

Access comprehensive developer and user documentation for LUMIN

View Docs

Tutorials

Get tutorials for beginner and advanced researchers demonstrating many of the features of LUMIN

View Tutorials