Shortcuts

Source code for lumin.nn.models.helpers

from pathlib import Path
from typing import Iterator, List, Optional, Tuple, Union

import numpy as np

from ..data.fold_yielder import FoldYielder

# Public names exported by a star-import of this module.
__all__ = ["CatEmbedder"]


class CatEmbedder:
    r"""
    Helper class for embedding categorical features.
    Designed to be passed to :class:`~lumin.nn.models.model_builder.ModelBuilder`.
    Note that the classmethod :meth:`~lumin.nn.models.helpers.CatEmbedder.from_fy` may be used to instantiate an
    :class:`~lumin.nn.models.helpers.CatEmbedder` from a :class:`~lumin.nn.data.fold_yielder.FoldYielder`.

    Arguments:
        cat_names: list of names of categorical features in order in which they will be passed as inputs columns
        cat_szs: list of cardinalities (number of unique elements) for each feature
        emb_szs: Optional list of embedding sizes for each feature. If None, will use min(max_emb_sz, (1+sz)//2)
        max_emb_sz: Maximum size of embedding if emb_szs is None
        emb_load_path: if not None, will cause :class:`~lumin.nn.models.model_builder.ModelBuilder` to attempt to
            load pretrained embeddings from path

    Examples::
        >>> cat_embedder = CatEmbedder(cat_names=['n_jets', 'channel'], cat_szs=[5, 3])
        >>>
        >>> cat_embedder = CatEmbedder(cat_names=['n_jets', 'channel'], cat_szs=[5, 3], emb_szs=[2, 2])
        >>>
        >>> cat_embedder = CatEmbedder(cat_names=['n_jets', 'channel'], cat_szs=[5, 3], emb_szs=[2, 2],
        ...                            emb_load_path=Path('weights'))
    """

    # TODO: load pretrained embeddings to check sizes

    def __init__(
        self,
        cat_names: List[str],
        cat_szs: List[int],
        emb_szs: Optional[List[int]] = None,
        max_emb_sz: int = 50,
        emb_load_path: Optional[Union[Path, str]] = None,
    ):
        # Assertions are kept (rather than raising a different exception type) so that
        # callers see the same error type as before.
        assert len(cat_names) == len(cat_szs), "Different number of feature names and feature cardinalities received"
        if emb_szs is not None:
            assert len(cat_szs) == len(emb_szs), "Different number of features and embedding sizes received"

        self.cat_names = cat_names
        self.cat_szs = cat_szs
        self.emb_szs = emb_szs
        self.max_emb_sz = max_emb_sz
        self.emb_load_path = emb_load_path

        # No explicit sizes given: derive them from the cardinalities.
        if self.emb_szs is None:
            self.calc_emb_szs()
        # Normalise string paths to Path objects.
        if self.emb_load_path is not None and not isinstance(self.emb_load_path, Path):
            self.emb_load_path = Path(self.emb_load_path)
        self.n_cat_in = len(self.cat_szs)

    def __repr__(self) -> str:
        r"""
        Returns one line per feature of the form ``name:\tcardinality --> embedding size``,
        followed by the embedding load path if one was set.
        """

        rep = "".join(
            f"{name}:\t{sz} --> {emb_sz}\n" for name, sz, emb_sz in zip(self.cat_names, self.cat_szs, self.emb_szs)
        )
        if self.emb_load_path is not None:
            rep += f"\nLoading pretrained embeddings from: {self.emb_load_path}"
        return rep

    def __getitem__(self, key: Union[int, str]) -> Tuple[int, int]:
        r"""
        Look up a feature by position or by name.

        Arguments:
            key: integer position (Python or NumPy integer) of the feature, or the feature's name

        Returns:
            Tuple of (cardinality, embedding size) for the requested feature

        Raises:
            IndexError: if an integer key is out of range
            ValueError: if a non-integer key is not present in cat_names
        """

        # NumPy integers are not instances of int, so accept them explicitly;
        # otherwise they would be treated as feature names.
        if isinstance(key, (int, np.integer)):
            return (self.cat_szs[key], self.emb_szs[key])
        return self[self.cat_names.index(key)]

    def __iter__(self) -> Iterator[Tuple[str, int, int]]:
        r"""
        Yields a (name, cardinality, embedding size) tuple for each feature, in input order.
        """

        yield from zip(self.cat_names, self.cat_szs, self.emb_szs)

    @classmethod
    def from_fy(
        cls,
        fy: "FoldYielder",
        emb_szs: Optional[List[int]] = None,
        max_emb_sz: int = 50,
        emb_load_path: Optional[Union[Path, str]] = None,
    ) -> "CatEmbedder":
        r"""
        Instantiate an :class:`~lumin.nn.models.helpers.CatEmbedder` from a :class:`~lumin.nn.data.fold_yielder.FoldYielder`,
        i.e. avoid having to pass cat_names and cat_szs.

        Arguments:
            fy: :class:`~lumin.nn.data.fold_yielder.FoldYielder` with training data
            emb_szs: Optional list of embedding sizes for each feature. If None, will use min(max_emb_sz, (1+sz)//2)
            max_emb_sz: Maximum size of embedding if emb_szs is None
            emb_load_path: if not None, will cause :class:`~lumin.nn.models.model_builder.ModelBuilder` to attempt to
                load pretrained embeddings from path

        Returns:
            :class:`~lumin.nn.models.helpers.CatEmbedder`

        Raises:
            ValueError: if `fy` contains no folds, since cardinalities cannot then be inferred

        Examples::
            >>> cat_embedder = CatEmbedder.from_fy(train_fy)
            >>>
            >>> cat_embedder = CatEmbedder.from_fy(train_fy, emb_szs=[2, 2])
            >>>
            >>> cat_embedder = CatEmbedder.from_fy(
            ...     train_fy, emb_szs=[2, 2], emb_load_path=Path('weights'))
        """

        cat_names = fy.get_use_cat_feats()

        # Get cardinalities: track the element-wise maximum category code across all folds
        max_codes = None
        for fld_id in range(len(fy)):
            fold_df = fy.get_df(n_folds=1, fold_idx=fld_id, inc_inputs=True, verbose=False, suppress_warn=True)
            fold_max = fold_df[cat_names].max().values.astype(int)
            max_codes = fold_max if max_codes is None else np.maximum(max_codes, fold_max)

        if max_codes is None:
            raise ValueError("FoldYielder contains no folds; cannot infer categorical cardinalities")

        # zero-ordered, therefore cardinality is 1+max; cast to plain Python ints so that
        # NumPy scalar types do not leak into cat_szs and the derived emb_szs
        cat_szs = [int(code) + 1 for code in max_codes]
        return cls(
            cat_names=cat_names, cat_szs=cat_szs, emb_szs=emb_szs, max_emb_sz=max_emb_sz, emb_load_path=emb_load_path
        )

    def calc_emb_szs(self) -> None:
        r"""
        Method used to set sizes of embeddings for each categorical feature when no embedding sizes are explicitly passed.
        Uses rule of thumb of min(max_emb_sz, (1+cardinality)//2), and stores the result in `emb_szs`.
        """

        self.emb_szs = [min(self.max_emb_sz, (1 + sz) // 2) for sz in self.cat_szs]

Docs

Access comprehensive developer and user documentation for LUMIN

View Docs

Tutorials

Get tutorials for beginner and advanced researchers demonstrating many of the features of LUMIN

View Tutorials