# Source code for lumin.nn.data.batch_yielder

from typing import Iterator, List, Optional, Tuple

import numpy as np
from torch.tensor import Tensor

from ...utils.misc import to_device

__all__ = ['BatchYielder']


'''
Todo
- Improve this/change to dataloader
'''


class BatchYielder:
    r'''
    Yields minibatches to model during training. Iteration provides one minibatch as tuple of tensors of inputs, targets, and weights.
    Only complete minibatches are provided: if the number of data is not a multiple of `bs`, the trailing data are skipped for that iteration.

    Arguments:
        inputs: input array for (sub-)epoch
        targets: target array for (sub-)epoch
        bs: batchsize, number of data to include per minibatch
        objective: 'classification', 'multiclass classification', or 'regression'. Used for casting target dtype.
        weights: Optional weight array for (sub-)epoch
        shuffle: whether to shuffle the data at the beginning of an iteration
        use_weights: if passed weights, whether to actually pass them to the model
        bulk_move: whether to move all data to device at once. Default is true (saves time), but if device has low memory you can set to False.
    '''

    def __init__(self, inputs:np.ndarray, targets:np.ndarray, bs:int, objective:str,
                 weights:Optional[np.ndarray]=None, shuffle:bool=True, use_weights:bool=True, bulk_move:bool=True):
        self.inputs,self.targets,self.weights,self.bs,self.objective,self.shuffle,self.use_weights,self.bulk_move = \
            inputs,targets,weights,bs,objective,shuffle,use_weights,bulk_move

    @staticmethod
    def _as_class_indices(targets) -> Tensor:
        r'''
        Convert targets to an integer (long) tensor of class indices, removing singleton axes but always keeping the leading (batch) axis.

        A plain `squeeze()` would also remove the batch axis when only one datum is present, leaving a 0-dimensional tensor.

        Arguments:
            targets: array-like of class indices, e.g. of shape (N,) or (N,1)

        Returns:
            long tensor whose first dimension is N
        '''

        y = Tensor(targets).long()
        if y.dim() < 2: return y
        return y.reshape(len(y), *[d for d in y.shape[1:] if d != 1])

    def __iter__(self) -> Iterator[Tuple[Tensor, Tensor, Optional[Tensor]]]:
        r'''
        Iterate through data in batches.

        Returns:
            tuple of batches of inputs, targets, and weights as tensors passed through `to_device`. Weights are None if no weights were supplied or `use_weights` is False.
        '''

        use_weights = self.weights is not None and self.use_weights
        inputs, targets = self.inputs, self.targets
        weights = self.weights if use_weights else None

        if self.shuffle:
            # One shared permutation keeps inputs, targets, and weights aligned, and avoids building per-row Python tuples
            order = np.random.permutation(len(inputs))
            inputs, targets = np.asarray(inputs)[order], np.asarray(targets)[order]
            if use_weights: weights = np.asarray(weights)[order]

        # Multiclass targets must be integer class indices; other objectives keep the default tensor dtype
        to_targets = self._as_class_indices if 'multiclass' in self.objective else Tensor
        # Start indices of complete minibatches only; a trailing partial batch is skipped (consistent with __len__)
        starts = range(0, len(inputs)-self.bs+1, self.bs)

        if self.bulk_move:
            # Convert and move everything once, then yield slices of the resulting tensors
            inputs, targets = to_device(Tensor(inputs)), to_device(to_targets(targets))
            if use_weights: weights = to_device(Tensor(weights))
            for i in starts:
                stop = i+self.bs
                yield inputs[i:stop], targets[i:stop], (weights[i:stop] if use_weights else None)

        else:
            # Convert and move one minibatch at a time
            for i in starts:
                stop = i+self.bs
                x, y = to_device(Tensor(inputs[i:stop])), to_device(to_targets(targets[i:stop]))
                w = to_device(Tensor(weights[i:stop])) if use_weights else None
                yield x, y, w

    def __len__(self) -> int:
        r'''
        Number of complete minibatches provided per iteration.
        '''

        return len(self.inputs)//self.bs
# Source code for lumin.nn.data.batch_yielder
#
# Docs
#
# Tutorials