Source code for lumin.nn.metrics.reg_eval

from typing import Callable, Optional

import numpy as np
import pandas as pd
from fastcore.all import store_attr
from statsmodels.stats.weightstats import DescrStatsW

from ...utils.statistics import bootstrap_stats
from .eval_metric import EvalMetric

__all__ = ["RegPull", "RegAsProxyPull"]


[docs]class RegPull(EvalMetric):
    r"""
    Compute mean or standard deviation of delta or pull of some feature which is being directly regressed to.
    Optionally, use bootstrap resampling on validation data.

    Arguments:
        return_mean: whether to return the mean or the standard deviation
        use_bootstrap: whether to bootstrap resamples validation fold when computing statisitic
        use_pull: whether to return the pull (differences / targets) or delta (differences)
        name: optional name for metric, otherwise will be inferred from `use_pull`
        main_metric: whether this metic should be treated as the primary metric for SaveBest and EarlyStopping
            Will automatically set the first EvalMetric to be main if multiple primary metrics are submitted
    Examples::
        >>> mean_pull  = RegPull(return_mean=True, use_bootstrap=True,
        ...                      use_pull=True)
        >>>
        >>> std_delta  = RegPull(return_mean=False, use_bootstrap=True,
        ...                      use_pull=False)
        >>>
        >>> mean_pull  = RegPull(return_mean=True, use_bootstrap=False,
        ...                      use_pull=True, wgt_name='weights')
    """

    # TODO: Check how this handels multi-target regression, may need to adjust averaging axis & DescrStatsW may not handle multi-dimensional data well.

    def __init__(
        self,
        return_mean: bool,
        use_bootstrap: bool = False,
        use_pull: bool = True,
        name: Optional[str] = None,
        main_metric: bool = True,
    ):
        if name is None:
            name = "pull" if use_pull else "delta"
        super().__init__(name=name, lower_metric_better=True, main_metric=main_metric)
        store_attr(but=["name", "main_metric"])

    def _compute(self, preds: np.ndarray, targets: np.ndarray, weights: Optional[np.ndarray] = None) -> float:
        delta = preds - targets
        if self.use_pull:
            delta /= targets

        if weights is not None:
            weights = weights.astype("float64")
            weights = weights / weights.sum()

        if self.use_bootstrap:
            bs = bootstrap_stats({"data": delta, "mean": True, "std": True, "n": 100, "weights": weights})
            return np.mean(bs["_mean"]) if self.return_mean else np.mean(bs["_std"])
        else:
            if self.return_mean:
                return np.average(delta, weights=weights)
            else:
                return DescrStatsW(delta, ddof=1, weights=weights * len(weights) if weights is not None else None).std

[docs]    def evaluate(self) -> float:
        r"""
        Compute mean or width of regression error.

        Returns:
            Mean or width of regression error
        """

        return self._compute(self.preds, self.targets, self.weights)


[docs]class RegAsProxyPull(RegPull):
    r"""
    Compute mean or standard deviation of delta or pull of some feature which is being indirectly regressed to via a proxy function.
    Optionally, use bootstrap resampling on validation data.

    Arguments:
        proxy_func: function which acts on regression predictions and adds pred and gen_target columns to the Pandas DataFrame it is passed which contains
            prediction columns pred_{i}
        return_mean: whether to return the mean or the standard deviation
        use_bootstrap: whether to bootstrap resamples validation fold when computing statisitic
        use_weights: whether to actually use weights if wgt_name is set
        use_pull: whether to return the pull (differences / targets) or delta (differences)
        targ_name: optional name of group in fold file containing regression targets
        name: optional name for metric, otherwise will be inferred from `use_pull`
        main_metric: whether this metic should be treated as the primary metric for SaveBest and EarlyStopping
            Will automatically set the first EvalMetric to be main if multiple primary metrics are submitted

    Examples::
        >>> def reg_proxy_func(df):
        >>>     df['pred'] = calc_pair_mass(df, (1.77682, 1.77682),
        ...                                 {targ[targ.find('_t')+3:]:
        ...                                 f'pred_{i}' for i, targ
        ...                                 in enumerate(targ_feats)})
        >>>     df['gen_target'] = 125
        >>>
        >>> std_delta = RegAsProxyPull(proxy_func=reg_proxy_func,
        ...                            return_mean=False, use_pull=False)
    """

    def __init__(
        self,
        proxy_func: Callable[[pd.DataFrame], None],
        return_mean: bool,
        targ_name: Optional[str] = None,
        use_bootstrap: bool = False,
        use_pull: bool = True,
        name: Optional[str] = None,
        main_metric: bool = True,
    ):
        if name is None:
            name = "pull" if use_pull else "delta"
        super().__init__(
            use_bootstrap=use_bootstrap, return_mean=use_bootstrap, use_pull=use_pull, main_metric=main_metric
        )
        store_attr(but=["use_bootstrap", "use_bootstrap", "use_pull", "main_metric"])

[docs]    def evaluate(self) -> float:
        r"""
        Compute statisitic on fold using provided predictions.

        Arguments:
            fy: :class:`~lumin.nn.data.fold_yielder.FoldYielder` interfacing to data
            idx: fold index corresponding to fold for which y_pred was computed
            y_pred: predictions for fold

        Returns:
            Statistic set in initialisation computed on the chsoen fold

        Examples::
            >>> mean = mean_pull.evaluate(train_fy, val_id, val_preds)
        """

        df = self.get_df()
        self.proxy_func(df)
        return self._compute(
            df["pred"].values, df["gen_target"].values, df["gen_weight"].values if "gen_weight" in df.columns else None
        )
Source code for lumin.nn.metrics.reg_eval

Docs

Tutorials