Source code for lumin.nn.callbacks.data_callbacks

from abc import ABCMeta, abstractmethod
from typing import List, Optional, Tuple, Union

import numpy as np
import torch
from fastcore.all import is_listy, store_attr
from torch import Tensor

from ..data.batch_yielder import BatchYielder
from .callback import Callback

# Names exported by a wildcard import of this module; AbsWeightData, also defined in this file, is not listed.
__all__ = ["BinaryLabelSmooth", "BootstrapResample", "ParametrisedPrediction", "TargReplace"]


class BinaryLabelSmooth(Callback):
    r"""
    Callback for applying label smoothing to binary classes, based on https://arxiv.org/abs/1512.00567
    Applies smoothing during training only; folds in any other state are left untouched.

    Arguments:
        coefs: Smoothing coefficients: 0->coef[0] 1->1-coef[1]. if passed float, coef[0]=coef[1]

    Examples::
        >>> lbl_smooth = BinaryLabelSmooth(0.1)
        >>>
        >>> lbl_smooth = BinaryLabelSmooth((0.1, 0.02))
    """

    def __init__(self, coefs: Union[float, Tuple[float, float]] = 0):
        super().__init__()
        # A pair may arrive as a tuple or a list; a scalar is used for both classes
        self.coefs = tuple(coefs) if isinstance(coefs, (tuple, list)) else (coefs, coefs)

    def on_fold_begin(self) -> None:
        r"""
        Apply smoothing to the targets of the current fold, if the model is in the training state
        """

        fit_params = self.model.fit_params
        if fit_params.state != "train":
            return
        targets = fit_params.by.targets.astype(float)
        # Both masks are built before anything is written, so a smoothed zero can never be mistaken for a one
        is_zero, is_one = targets == 0, targets == 1
        targets[is_zero] = self.coefs[0]
        targets[is_one] = 1 - self.coefs[1]
        fit_params.by.targets = targets


class BootstrapResample(Callback):
    r"""
    Callback for bootstrap sampling new training datasets from original training data during (ensemble) training.

    Arguments:
        n_folds: the number of folds present in training :class:`~lumin.nn.data.fold_yielder.FoldYielder`
        bag_each_time: whether to sample a new set for each sub-epoch or to use the same sample each time
        reweight: whether to reweight the sampled data to match the weight sum (per class) of the original data

    Examples::
        >>> bs_resample = BootstrapResample(n_folds=len(train_fy))
    """

    def __init__(self, n_folds: int, bag_each_time: bool = False, reweight: bool = True):
        super().__init__()
        # NOTE(review): n_folds - 1 presumably accounts for one fold being reserved for validation - confirm
        self.n_trn_flds, self.bag_each_time, self.reweight = n_folds - 1, bag_each_time, reweight

    def _get_sample(self, length: int) -> np.ndarray:
        r"""
        Draw `length` indices from [0, length) uniformly at random, with replacement

        Arguments:
            length: number of rows in the fold being resampled

        Returns:
            Array of `length` row indices
        """

        # Passing the integer directly avoids materialising range(length) as an array first
        return np.random.choice(length, length, replace=True)

    def _resample(self, sample: np.ndarray, by: BatchYielder) -> None:
        r"""
        Overwrite, in place, the inputs, targets, and weights held by `by` with the rows selected by `sample`.
        If weights are present and `reweight` is set, the resampled weights are rescaled so that their sum
        (per class when 'class' is in the model objective, otherwise overall) equals the sum before resampling.

        Arguments:
            sample: row indices to select (repeats allowed)
            by: batch yielder whose data is overwritten
        """

        reweight = self.reweight and by.weights is not None
        per_class = reweight and "class" in self.model.objective

        # Get weight sums before resampling
        if per_class:
            flat_targets = by.targets.squeeze()
            # Works for both containers used for yielder data elsewhere in this module (NumPy arrays and Tensors)
            classes = torch.unique(flat_targets) if isinstance(flat_targets, Tensor) else np.unique(flat_targets)
            weight_sum = {c: by.weights[flat_targets == c].sum() for c in classes}
        elif reweight:
            weight_sum = by.weights.sum()

        # Resample: the right-hand sides are evaluated fully before being written back into the existing buffers
        by.inputs[...] = by.inputs[sample]
        by.targets[...] = by.targets[sample]
        if by.weights is None:
            return
        by.weights[...] = by.weights[sample]

        # Reweight
        if per_class:
            flat_targets = by.targets.squeeze()  # Targets have changed, so the class masks must be rebuilt
            for c, original_sum in weight_sum.items():
                mask = flat_targets == c
                if not mask.any():
                    continue  # Class absent from this bootstrap sample: nothing to rescale, and avoids 0-division
                by.weights[mask] *= original_sum / by.weights[mask].sum()
        elif reweight:
            by.weights *= weight_sum / by.weights.sum()

    def on_train_begin(self) -> None:
        r"""
        Resets internal parameters to prepare for a new training
        """

        super().on_train_begin()
        self.iter, self.samples = 0, []
        # NOTE(review): called without a seed, so NumPy's global RNG is re-seeded non-deterministically here;
        # the original author questioned whether this is necessary
        np.random.seed()

    def on_fold_begin(self) -> None:
        r"""
        Resamples training data for new epoch.
        A fresh sample is drawn when `bag_each_time` is set or while fewer than `n_folds - 1` samples have been stored;
        otherwise a stored sample is reused, cycling through them in the order they were created.
        """

        if self.model.fit_params.state != "train":
            return
        if self.bag_each_time or self.iter < self.n_trn_flds:
            sample = self._get_sample(len(self.model.fit_params.by.targets))
            if not self.bag_each_time:
                self.samples.append(sample)
        else:
            sample = self.samples[self.iter % self.n_trn_flds]
        self.iter += 1
        self._resample(sample, self.model.fit_params.by)


class ParametrisedPrediction(Callback):
    r"""
    Callback for running predictions for a parametrised network (https://arxiv.org/abs/1601.07913); one which has been trained using one or more inputs which
    represent e.g. different hypotheses for the classes such as an unknown mass of some new particle.
    In such a scenario, multiple signal datasets could be used for training, with background receiving a random mass. During prediction one then needs to set
    these parametrisation features all to the same values to evaluate the model's response for that hypothesis.
    This callback can be passed to the predict method of the model/ensemble to adjust the parametrisation features to the desired values.

    Arguments:
        feats: list of feature names used during training (in the same order)
        param_feat: the feature name which is to be adjusted, or a list of features to adjust
        param_val: the value to which to set the parametrisation feature, or the list of values to set the parametrisation features to
            (one value per feature, in the same order as `param_feat`)

    Raises:
        ValueError: if the number of features and values differ, or if a requested feature is not in `feats`

    Examples::
        >>> mass_param = ParametrisedPrediction(train_feats, 'res_mass', 300)
        >>> model.predict(fold_yielder, pred_name='pred_mass_300', callbacks=[mass_param])
        >>>
        >>> mass_param = ParametrisedPrediction(train_feats, 'res_mass', 300)
        >>> spin_param = ParametrisedPrediction(train_feats, 'spin', 1)
        >>> model.predict(fold_yielder, pred_name='pred_mass_300', callbacks=[mass_param, spin_param])
        >>>
        >>> mass_spin_param = ParametrisedPrediction(train_feats, ['res_mass', 'spin'], [300, 1])

    """

    def __init__(self, feats: List[str], param_feat: Union[List[str], str], param_val: Union[List[float], float]):
        super().__init__()
        # Materialise both as lists so that their lengths can be compared
        param_feat = list(param_feat) if is_listy(param_feat) else [param_feat]
        param_val = list(param_val) if is_listy(param_val) else [param_val]
        if len(param_feat) != len(param_val):
            # zip() would silently drop the unmatched entries, leaving some features at their original values
            raise ValueError(
                f"Received {len(param_feat)} parametrisation feature(s) but {len(param_val)} value(s); "
                "exactly one value is required per feature"
            )
        self.param_val = param_val
        # Column positions of the parametrisation features in the input data; list.index raises ValueError if absent
        self.param_idx = [feats.index(f) for f in param_feat]

    def on_pred_begin(self) -> None:
        r"""
        Adjusts the data to be passed to the model by setting in place the parametrisation feature to the preset value
        """

        inputs = self.model.fit_params.by.inputs
        for f, v in zip(self.param_idx, self.param_val):
            inputs[:, f] = v


class TargReplace(Callback):
    r"""
    Callback to replace target data with requested data from foldfile, allowing one to e.g. train two models simultaneously with the same inputs but different targets for e.g. adversarial training.
    At the end of validation epochs, the target data is swapped back to the original target data, to allow for the correct computation of any metrics

    Arguments:
        targ_feats: list of column names in foldfile to get and horizontally stack to replace target data in current :class:`~lumin.nn.data.batch_yielder.BatchYielder`;
            a single column name is also accepted

    Examples::
        >>> targ_replace = TargReplace(['is_fake'])
        >>> targ_replace = TargReplace(['class', 'is_fake'])

    """

    def __init__(self, targ_feats: Union[List[str], str]):
        store_attr()
        super().__init__()
        if not is_listy(self.targ_feats):
            # Wrap instead of calling list(): list('is_fake') would split the column name into single characters
            self.targ_feats = [self.targ_feats]

    def on_fold_begin(self) -> None:
        r"""
        Stack new target datasets and replace in target data in current :class:`~lumin.nn.data.batch_yielder.BatchYielder`
        """

        fit_params = self.model.fit_params
        # Read from the training fold when training, otherwise from the validation fold
        idx = fit_params.trn_idx if fit_params.state == "train" else fit_params.val_idx
        fit_params.by.targets = np.hstack(
            [fit_params.fy.get_column(t, n_folds=1, fold_idx=idx, add_newaxis=True) for t in self.targ_feats]
        )

    def on_epoch_end(self) -> None:
        r"""
        Swap target data back at the end of validation epochs, by re-reading the "targets" column of the validation fold
        """

        fit_params = self.model.fit_params
        if fit_params.state != "valid":
            return
        fit_params.by.targets = fit_params.fy.get_column(
            "targets", n_folds=1, fold_idx=fit_params.val_idx, add_newaxis=True
        )


class AbsWeightData(Callback, metaclass=ABCMeta):
    r"""
    Abstract callback which replaces the weights of each fold of data with the output of a function of the inputs, targets, and current weights.
    Subclass it and implement the `weight_func` method according to your task.

    Arguments:
        on_eval: if true, also weight data during validation and testing
    """

    def __init__(self, on_eval: bool):
        super().__init__()
        self.on_eval = on_eval

    @abstractmethod
    def weight_func(
        self,
        x: Union[np.ndarray, Tensor],
        mx: Optional[Union[np.ndarray, Tensor]],
        y: Union[np.ndarray, Tensor],
        w: Union[np.ndarray, Tensor],
    ) -> Union[np.ndarray, Tensor]:
        r"""
        Compute the new weights for a fold of data; must be implemented by subclasses.

        Arguments:
            x: inputs of the current batch yielder
            mx: matrix inputs of the current batch yielder
            y: targets of the current batch yielder
            w: weights currently held by the batch yielder

        Returns:
            The weights to store in the batch yielder
        """

    def on_fold_begin(self) -> None:
        r"""
        Reweight every data point in the fold; skipped outside of training unless `on_eval` is set.
        """

        fit_params = self.model.fit_params
        if not self.on_eval and fit_params.state != "train":
            return
        by = fit_params.by
        by.weights = self.weight_func(x=by.inputs, mx=by.matrix_inputs, y=by.targets, w=by.weights)
Source code for lumin.nn.callbacks.data_callbacks

Docs

Tutorials