Source code for lumin.optimisation.features

import pandas as pd
import numpy as np
from typing import List, Optional
from fastprogress import progress_bar
from rfpimp import importances

from sklearn.ensemble.forest import ForestRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from .hyper_param import get_opt_rf_params
from ..plotting.interpretation import plot_importance
from ..plotting.plot_settings import PlotSettings

__all__ = ['get_rf_feat_importance', 'rf_rank_features']

def get_rf_feat_importance(rf:ForestRegressor, inputs:pd.DataFrame, targets:np.ndarray, weights:Optional[np.ndarray]=None) -> pd.DataFrame:
    r'''
    Evaluate the importance of every input column for an already-trained Random Forest by delegating to rfpimp's `importances`.

    Arguments:
        rf: trained Random Forest model
        inputs: input data as Pandas DataFrame; all of its columns are passed as the features to evaluate
        targets: target data as Numpy array
        weights: Optional data weights as Numpy array, forwarded to rfpimp as `sample_weights`

    Returns:
        The rfpimp importance table with its index reset, so that the index becomes an ordinary column
    '''

    importance_table = importances(rf, inputs, targets, features=inputs.columns, sample_weights=weights)
    return importance_table.reset_index()
def rf_rank_features(train_df:pd.DataFrame, val_df:pd.DataFrame, objective:str,
                     train_feats:List[str], targ_name:str='gen_target', wgt_name:Optional[str]=None,
                     importance_cut:float=0.0, n_estimators:int=40, n_rfs:int=1,
                     savename:Optional[str]=None, plot_settings:PlotSettings=PlotSettings()) -> List[str]:
    r'''
    Compute relative permutation importance of input features using Random Forests.
    A reduced set of 'important features' is obtained by cutting on relative importance and a new model is trained
    and evaluated on this reduced set.
    RFs will have their hyper-parameters roughly optimised, both when training on all features and once when
    training on important features.
    Relative importances may be computed multiple times (via n_rfs) and averaged, in which case the standard error
    on the mean importance is also computed and stored in an 'Uncertainty' column.

    Arguments:
        train_df: training data as Pandas DataFrame
        val_df: validation data as Pandas DataFrame
        objective: string representation of objective: either 'classification' or 'regression'
        train_feats: complete list of training features
        targ_name: name of column containing target data
        wgt_name: name of column containing weight data. If set, will use weights for training and evaluation,
            otherwise will not
        importance_cut: minimum importance required to be considered an 'important feature'
            (features must be strictly greater than this value)
        n_estimators: number of trees to use in each forest.
            NOTE(review): here it is only forwarded explicitly to the optimisation of the reduced-feature forest;
            the all-features optimisation is called without it - confirm this is intended
        n_rfs: number of trainings to perform on all training features in order to compute importances
        savename: Optional name of file to which to save the plot of feature importances
        plot_settings: :class:`~lumin.plotting.plot_settings.PlotSettings` class to control figure appearance

    Returns:
        List of features passing importance_cut, ordered by importance.
        If no feature passes the cut, train_feats is returned unchanged
    '''

    w_trn = None if wgt_name is None else train_df[wgt_name]
    w_val = None if wgt_name is None else val_df[wgt_name]

    print("Optimising RF")
    opt_params, rf = get_opt_rf_params(train_df[train_feats], train_df[targ_name], val_df[train_feats], val_df[targ_name],
                                       objective, w_trn=w_trn, w_val=w_val, verbose=False)

    print("Evalualting importances")
    fi = get_rf_feat_importance(rf, train_df[train_feats], train_df[targ_name], w_trn)
    orig_score = rf.score(val_df[train_feats], val_df[targ_name], w_val)

    if n_rfs > 1:
        m = RandomForestClassifier if 'class' in objective.lower() else RandomForestRegressor
        # Give every run its own uniquely named column: relying on merge's default _x/_y suffixes
        # produces clashing column names once more than three tables have been merged.
        fi = fi.rename(columns={'Importance': 'Importance_0'})
        for i in progress_bar(range(1, n_rfs)):
            rf = m(**opt_params)
            rf.fit(train_df[train_feats], train_df[targ_name], w_trn)
            run_fi = get_rf_feat_importance(rf, train_df[train_feats], train_df[targ_name], w_trn)
            run_fi = run_fi.rename(columns={'Importance': f'Importance_{i}'})
            fi = pd.merge(fi, run_fi, on='Feature', how='left')
            orig_score += rf.score(val_df[train_feats], val_df[targ_name], w_val)

        # Take the per-run values once, *before* writing the mean back, so that the averaged
        # 'Importance' column can never leak into the spread used for the uncertainty.
        run_vals = fi[[f'Importance_{i}' for i in range(n_rfs)]].values
        fi['Importance'] = np.mean(run_vals, axis=1)
        fi['Uncertainty'] = np.std(run_vals, ddof=1, axis=1)/np.sqrt(n_rfs)
        orig_score /= n_rfs

    fi.sort_values(by='Importance', ascending=False, inplace=True)
    print("Top ten most important features:\n", fi[['Feature', 'Importance']][:10])
    plot_importance(fi[:30], savename=savename, settings=plot_settings)

    top_feats = list(fi[fi.Importance > importance_cut].Feature)
    print(f"\n{len(top_feats)} features found with importance greater than {importance_cut}:\n", top_feats, '\n')

    if not top_feats:
        print(f"Model score: :\t{orig_score:.5f}")
        print('No features found to be important, returning all training features. Good luck.')
        return train_feats

    if len(top_feats) < len(train_feats):
        # Some features were cut: re-optimise on the reduced set and compare validation scores
        print("\nOptimising new RF")
        _, rf_new = get_opt_rf_params(train_df[top_feats], train_df[targ_name], val_df[top_feats], val_df[targ_name],
                                      objective, w_trn=w_trn, w_val=w_val, n_estimators=n_estimators, verbose=False)
        print("Comparing RF scores, higher = better")
        print(f"All features:\t{orig_score:.5f}")
        print(f"Top features:\t{rf_new.score(val_df[top_feats], val_df[targ_name], w_val):.5f}")
    else:
        print('All training features found to be important')
    return top_feats
Read the Docs v: v0.3.1
On Read the Docs
Project Home

Free document hosting provided by Read the Docs.


Access comprehensive developer and user documentation for LUMIN

View Docs


Get tutorials for beginner and advanced researchers demonstrating many of the features of LUMIN

View Tutorials