Source code for lumin.optimisation.hyper_param
import timeit
from collections import OrderedDict
from functools import partial
from typing import Dict, List, Optional, Tuple, Union
import matplotlib.pyplot as plt
import numpy as np
from fastcore.all import is_listy
from fastprogress import master_bar, progress_bar
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from ..nn.callbacks.opt_callbacks import LRFinder
from ..nn.data.fold_yielder import FoldYielder
from ..nn.models.model import Model
from ..nn.models.model_builder import ModelBuilder
from ..plotting.plot_settings import PlotSettings
from ..plotting.training import plot_lr_finders
__all__ = ["get_opt_rf_params", "lr_find"]
def get_opt_rf_params(
    x_trn: np.ndarray,
    y_trn: np.ndarray,
    x_val: np.ndarray,
    y_val: np.ndarray,
    objective: str,
    w_trn: Optional[np.ndarray] = None,
    w_val: Optional[np.ndarray] = None,
    params: Optional[OrderedDict] = None,
    n_estimators: int = 40,
    verbose: bool = True,
) -> Tuple[Dict[str, float], Union[RandomForestRegressor, RandomForestClassifier]]:
    r"""
    Use an ordered parameter-scan to roughly optimise Random Forest hyper-parameters.

    Parameters are scanned one at a time, in the order in which `params` yields them. Every candidate value is
    tried on top of the best settings found so far, and a value is only kept if the resulting validation score
    beats the best score seen at any earlier point of the whole scan.

    Arguments:
        x_trn: training input data
        y_trn: training target data
        x_val: validation input data
        y_val: validation target data
        objective: string representation of objective: either 'classification' or 'regression'.
            A classifier is used if the string contains 'class' (case-insensitive), otherwise a regressor
        w_trn: training weights
        w_val: validation weights
        params: ordered dictionary mapping parameters to optimise to list of values to consider.
            If None, 'min_samples_leaf' and then 'max_features' are scanned over built-in value lists
        n_estimators: number of trees to use in each forest
        verbose: Print extra information and show a live plot of model performance

    Returns:
        params: dictionary mapping parameters to their optimised values
        rf: best performing Random Forest

    Raises:
        ValueError: if `params` is empty or maps any parameter to an empty list of values
    """

    if params is None:
        params = OrderedDict({"min_samples_leaf": [1, 3, 5, 10, 25, 50, 100], "max_features": [0.3, 0.5, 0.7, 0.9]})
    # Fail early with a clear message: without at least one candidate value no forest is ever fitted,
    # so there would be no model to return.
    if len(params) == 0 or any(len(params[p]) == 0 for p in params):
        raise ValueError("params must map at least one parameter to a non-empty list of values to consider")

    rf = RandomForestClassifier if "class" in objective.lower() else RandomForestRegressor
    # Baseline settings; scanned parameters are overridden here as soon as a value improves the score
    best_params = {"n_estimators": n_estimators, "n_jobs": -1, "max_features": "sqrt"}
    best_scores = []  # running best validation score, one entry per fitted forest
    scores = []  # raw validation score of every fitted forest
    best_m = None

    mb = master_bar(params)
    mb.names = ["Best", "Scores"]
    if verbose:
        mb.update_graph([[[], []], [[], []]])

    for param in mb:
        values = params[param]
        pb = progress_bar(values, parent=mb)
        pb.comment = f"{param} = {values[0]}"
        for i, value in enumerate(pb):
            # The comment shows the value that will be tried next (clamped to the last one)
            pb.comment = f"{param} = {values[min(i + 1, len(values) - 1)]}"
            m = rf(**{**best_params, param: value})
            m.fit(X=x_trn, y=y_trn, sample_weight=w_trn)
            scores.append(m.score(X=x_val, y=y_val, sample_weight=w_val))

            if not best_scores or scores[-1] > best_scores[-1]:
                best_scores.append(scores[-1])
                best_params[param] = value
                best_m = m
                if verbose:
                    print(f"Better score achieved: {param} @ {value} = {best_scores[-1]:.4f}")
            else:
                # No improvement: carry the previous best forward so both curves stay the same length
                best_scores.append(best_scores[-1])

            if verbose:
                mb.update_graph([[range(len(best_scores)), best_scores], [range(len(scores)), scores]])

    if verbose:
        # Only drop the figure handle if the master bar actually created one
        # NOTE(review): console (non-notebook) master bars appear not to define `fig` - confirm against fastprogress
        if hasattr(mb, "fig"):
            delattr(mb, "fig")
        plt.clf()
    return best_params, best_m
def lr_find(
    fy: FoldYielder,
    model_builder: ModelBuilder,
    bs: int,
    n_epochs: int = 1,
    train_on_weights: bool = True,
    n_repeats: int = -1,
    lr_bounds: Tuple[float, float] = (1e-5, 10),
    cb_partials: Optional[List[partial]] = None,
    plot_settings: PlotSettings = PlotSettings(),
    bulk_move: bool = True,
    plot_savename: Optional[str] = None,
    show_plot: bool = True,
) -> List[LRFinder]:
    r"""
    Wrapper function for training using :class:`~lumin.nn.callbacks.opt_callbacks.LRFinder` which runs a Smith LR range test (https://arxiv.org/abs/1803.09820)
    using folds in :class:`~lumin.nn.data.fold_yielder.FoldYielder`.
    Trains models for a set number of repeats, interpolating LR between set bounds. Each repeat holds out a different fold of the
    :class:`~lumin.nn.data.fold_yielder.FoldYielder` and trains a fresh model on the remaining folds; the resulting LR finders are
    then plotted together via :func:`~lumin.plotting.training.plot_lr_finders`.

    Arguments:
        fy: :class:`~lumin.nn.data.fold_yielder.FoldYielder` providing training data
        model_builder: :class:`~lumin.nn.models.model_builder.ModelBuilder` providing networks and optimisers
        bs: batch size
        n_epochs: number of epochs to train per model
        train_on_weights: If weights are present, whether to use them for training
        n_repeats: if >= 1, will only train min(n_repeats, fy.n_folds) models, otherwise will train one model per fold
        lr_bounds: starting and ending LR values
        cb_partials: optional list of functools.partial, each of which will instantiate a :class:`~lumin.nn.callbacks.callback.Callback` when called.
            A single non-list object is wrapped into a list
        plot_settings: :class:`~lumin.plotting.plot_settings.PlotSettings` class to control figure appearance
        bulk_move: forwarded unchanged to the `bulk_move` argument of `Model.fit`
        plot_savename: Optional name of file to which to save the plot
        show_plot: whether to show the plot, or just save them

    Returns:
        List of :class:`~lumin.nn.callbacks.opt_callbacks.LRFinder` which were used for each model trained
    """

    if cb_partials is None:
        cb_partials = []
    if not is_listy(cb_partials):
        cb_partials = [cb_partials]

    # Number of LR steps given to every LRFinder: epochs * training folds * size of fold 0, floor-divided by batch size.
    # NOTE(review): uses the size of fold 0 for every fold, i.e. presumably assumes equally sized folds - confirm
    nb = n_epochs * (fy.n_folds - 1) * fy.get_data_count(0) // bs

    lr_finders = []
    tmr = timeit.default_timer()
    n_models = fy.n_folds if n_repeats < 1 else min(n_repeats, fy.n_folds)
    mb = master_bar(range(n_models))
    for idx in mb:
        # Fresh model, callbacks, and LR finder for every repeat
        model = Model(model_builder)
        cbs = [c() for c in cb_partials]
        lrf = LRFinder(lr_bounds=lr_bounds, nb=nb)
        # Train on every fold except the one indexed by this repeat
        trn_idxs = [i for i in range(fy.n_folds) if i != idx]
        model.fit(
            n_epochs=n_epochs,
            fy=fy,
            bs=bs,
            bulk_move=bulk_move,
            train_on_weights=train_on_weights,
            trn_idxs=trn_idxs,
            cbs=cbs + [lrf],
            model_bar=mb,
        )
        lr_finders.append(lrf)
        del model  # drop the reference before the next model is built

    print(f"LR finder took {timeit.default_timer() - tmr:.3f}s ")
    plot_lr_finders(
        lr_finders,
        loss_range="auto",
        settings=plot_settings,
        # Automatic log-scaling of the loss axis is only requested for regression objectives
        log_y="auto" if "regress" in model_builder.objective.lower() else False,
        savename=plot_savename,
        show_plot=show_plot,
    )
    return lr_finders