Source code for lumin.optimisation.features

import pandas as pd
import numpy as np
from typing import List, Optional
from fastprogress import progress_bar
from rfpimp import importances

from sklearn.ensemble.forest import ForestRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from .hyper_param import get_opt_rf_params
from ..plotting.interpretation import plot_importance
from ..plotting.plot_settings import PlotSettings

__all__ = ['get_rf_feat_importance', 'rf_rank_features']


def get_rf_feat_importance(rf:ForestRegressor, inputs:pd.DataFrame, targets:np.ndarray, weights:Optional[np.ndarray]=None) -> pd.DataFrame:
    r'''
    Compute feature importance for a Random Forest model using rfpimp.

    Arguments:
        rf: trained Random Forest model
        inputs: input data as Pandas DataFrame
        targets: target data as Numpy array
        weights: Optional data weights as Numpy array

    Returns:
        The DataFrame returned by rfpimp's `importances`, with its index moved into a regular column via `reset_index`.
        Callers in this module read the 'Feature' and 'Importance' columns of the result.
    '''

    # Importances are requested for every column of `inputs`; `weights` is forwarded unchanged as rfpimp's sample_weights
    fi = importances(rf, inputs, targets, features=inputs.columns, sample_weights=weights)
    return fi.reset_index()


def rf_rank_features(train_df:pd.DataFrame, val_df:pd.DataFrame, objective:str,
                     train_feats:List[str], targ_name:str='gen_target', wgt_name:Optional[str]=None,
                     importance_cut:float=0.0, n_estimators:int=40, n_rfs:int=1,
                     savename:Optional[str]=None, plot_settings:PlotSettings=PlotSettings()) -> List[str]:
    r'''
    Compute relative permutation importance of input features via using Random Forests.
    A reduced set of 'important features' is obtained by cutting on relative importance and a new model is trained and evaluated on this reduced set.
    RFs will have their hyper-parameters roughly optimised, both when training on all features and once when training on important features.
    Relative importances may be computed multiple times (via n_rfs) and averaged. In which case the standard error is also computed.

    Arguments:
        train_df: training data as Pandas DataFrame
        val_df: validation data as Pandas DataFrame
        objective: string representation of objective: either 'classification' or 'regression'
        train_feats: complete list of training features
        targ_name: name of column containing target data
        wgt_name: name of column containing weight data. If set, will use weights for training and evaluation, otherwise will not
        importance_cut: minimum importance required to be considered an 'important feature'
        n_estimators: number of trees to use in each forest (forwarded to both hyper-parameter optimisations)
        n_rfs: number of trainings to perform on all training features in order to compute importances.
            Values of 1 or less result in a single training, with no averaging and no 'Uncertainty' column
        savename: Optional name of file to which to save the plot of feature importances
        plot_settings: :class:`~lumin.plotting.plot_settings.PlotSettings` class to control figure appearance

    Returns:
        List of features passing importance_cut, ordered by importance.
        If no feature passes the cut, the original `train_feats` list is returned instead
    '''

    w_trn = None if wgt_name is None else train_df[wgt_name]
    w_val = None if wgt_name is None else val_df[wgt_name]
    print("Optimising RF")
    # n_estimators is forwarded here as well as to the reduced-feature optimisation below, so that both use the requested forest size
    opt_params, rf = get_opt_rf_params(train_df[train_feats], train_df[targ_name], val_df[train_feats], val_df[targ_name],
                                       objective, w_trn=w_trn, w_val=w_val, n_estimators=n_estimators, verbose=False)
    print("Evalualting importances")
    fi = get_rf_feat_importance(rf, train_df[train_feats], train_df[targ_name], w_trn)
    orig_score = rf.score(val_df[train_feats], val_df[targ_name], w_val)
    if n_rfs > 1:
        m = RandomForestClassifier if 'class' in objective.lower() else RandomForestRegressor
        # Keep one importance Series per forest, all aligned to the feature order of the first forest.
        # This avoids repeated merges, whose auto-generated suffixes collide once more than three forests are combined,
        # and keeps the per-forest values separate from the averaged column written below.
        per_rf = [fi.set_index('Feature')['Importance']]
        feat_order = per_rf[0].index
        for _ in progress_bar(range(n_rfs-1)):
            rf = m(**opt_params)  # presumably opt_params holds the optimised constructor arguments; verify against get_opt_rf_params
            rf.fit(train_df[train_feats], train_df[targ_name], w_trn)
            new_fi = get_rf_feat_importance(rf, train_df[train_feats], train_df[targ_name], w_trn)
            # Features missing from a later forest become NaN, matching the previous left-merge semantics
            per_rf.append(new_fi.set_index('Feature')['Importance'].reindex(feat_order))
            orig_score += rf.score(val_df[train_feats], val_df[targ_name], w_val)
        # Rows follow the current row order of fi, columns are the individual forests
        all_imps = np.column_stack([s.values for s in per_rf])
        # Both statistics are computed from the per-forest matrix only, so the mean never leaks into the standard error
        fi['Importance']  = np.mean(all_imps, axis=1)
        fi['Uncertainty'] = np.std(all_imps, ddof=1, axis=1)/np.sqrt(n_rfs)
        orig_score /= n_rfs
        fi.sort_values(by='Importance', ascending=False, inplace=True)
    print("Top ten most important features:\n", fi[['Feature', 'Importance']][:10])
    plot_importance(fi[:30], savename=savename, settings=plot_settings)

    top_feats = list(fi[fi.Importance > importance_cut].Feature)
    print(f"\n{len(top_feats)} features found with importance greater than {importance_cut}:\n", top_feats, '\n')
    if not top_feats:
        print(f"Model score: :\t{orig_score:.5f}")
        print('No features found to be important, returning all training features. Good luck.')
        return train_feats
    if len(top_feats) < len(train_feats):
        # Only retrain when the cut actually removed something; otherwise the comparison would be against the same feature set
        print("\nOptimising new RF")
        _, rf_new = get_opt_rf_params(train_df[top_feats], train_df[targ_name], val_df[top_feats], val_df[targ_name],
                                      objective, w_trn=w_trn, w_val=w_val, n_estimators=n_estimators, verbose=False)
        print("Comparing RF scores, higher = better")
        print(f"All features:\t{orig_score:.5f}")
        print(f"Top features:\t{rf_new.score(val_df[top_feats], val_df[targ_name], w_val):.5f}")
    else:
        print('All training features found to be important')
    return top_feats
Source code for lumin.optimisation.features

Docs

Tutorials