Shortcuts

Source code for lumin.data_processing.pre_proc

import pickle
from collections import OrderedDict
from typing import List, Optional, Tuple, Union

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Public names exported by this module
__all__ = ["get_pre_proc_pipes", "fit_input_pipe", "fit_output_pipe", "proc_cats"]


def get_pre_proc_pipes(
    norm_in: bool = True,
    norm_out: bool = False,
    pca: bool = False,
    whiten: bool = False,
    with_mean: bool = True,
    with_std: bool = True,
    n_components: Optional[int] = None,
) -> Tuple[Pipeline, Pipeline]:
    r"""
    Build a pair of unfitted SKLearn Pipelines, one for input features and one for targets, containing the requested transformations.

    Arguments:
        norm_in: whether to apply StandardScaler to inputs
        norm_out: whether to apply StandardScaler to outputs
        pca: whether to apply PCA to inputs. Performed prior to StandardScaler.
            Unless n_components is set, no explicit dimensionality reduction is requested.
        whiten: whether PCA should whiten inputs.
        with_mean: whether StandardScalers should shift means to 0
        with_std: whether StandardScalers should scale standard deviations to 1
        n_components: if set, causes PCA to reduce the dimensionality of the input data

    Returns:
        Pipeline for input data
        Pipeline for target data
    """

    def _identity_step() -> Tuple[str, StandardScaler]:
        # A StandardScaler with shifting and scaling both disabled leaves data unchanged;
        # it is used so that a Pipeline always contains at least one step (for compatibility)
        return ("ident", StandardScaler(with_mean=False, with_std=False))

    # Input pipeline: optional PCA first, then optional normalisation
    input_steps = []
    if pca:
        input_steps.append(("pca", PCA(n_components=n_components, whiten=whiten)))
    if norm_in:
        input_steps.append(("norm_in", StandardScaler(with_mean=with_mean, with_std=with_std)))
    if not input_steps:
        input_steps.append(_identity_step())

    # Output pipeline: either a normalisation step or the pass-through step
    if norm_out:
        output_steps = [("norm_out", StandardScaler(with_mean=with_mean, with_std=with_std))]
    else:
        output_steps = [_identity_step()]

    return Pipeline(steps=input_steps), Pipeline(steps=output_steps)


def fit_input_pipe(
    df: pd.DataFrame,
    cont_feats: Union[str, List[str]],
    savename: Optional[str] = None,
    input_pipe: Optional[Pipeline] = None,
    norm_in: bool = True,
    pca: bool = False,
    whiten: bool = False,
    with_mean: bool = True,
    with_std: bool = True,
    n_components: Optional[int] = None,
) -> Pipeline:
    r"""
    Fit input pipeline to continuous features and optionally save.

    Arguments:
        df: DataFrame with data to fit pipeline
        cont_feats: (list of) column(s) to use as input data for fitting.
            A single column name is treated as a one-element list.
        savename: if set will save the fitted Pipeline to a file with that name as Pickle (.pkl extension added automatically)
        input_pipe: if set will fit, otherwise will instantiate a new Pipeline
        norm_in: whether to apply StandardScaler to inputs. Only used if input_pipe is not set.
        pca: whether to apply PCA to inputs. Performed prior to StandardScaler. Only used if input_pipe is not set.
        whiten: whether PCA should whiten inputs. Only used if input_pipe is not set.
        with_mean: whether StandardScalers should shift means to 0. Only used if input_pipe is not set.
        with_std: whether StandardScalers should scale standard deviations to 1. Only used if input_pipe is not set.
        n_components: if set, causes PCA to reduce the dimensionality of the input data. Only used if input_pipe is not set.

    Returns:
        Fitted Pipeline
    """

    if input_pipe is None:
        input_pipe, _ = get_pre_proc_pipes(
            norm_in=norm_in, pca=pca, whiten=whiten, with_mean=with_mean, with_std=with_std, n_components=n_components
        )

    # Selecting with a bare column name yields a Series, whose values are 1D; SKLearn estimators
    # expect a 2D (samples, features) array, so a single name is wrapped in a list first
    columns = [cont_feats] if isinstance(cont_feats, str) else cont_feats
    input_pipe.fit(X=df[columns].values.astype("float32"))

    if savename is not None:
        with open(f"{savename}.pkl", "wb") as fout:
            pickle.dump(input_pipe, fout)
    return input_pipe


def fit_output_pipe(
    df: pd.DataFrame,
    targ_feats: Union[str, List[str]],
    savename: Optional[str] = None,
    output_pipe: Optional[Pipeline] = None,
    norm_out: bool = True,
) -> Pipeline:
    r"""
    Fit output pipeline to target features and optionally save.
    Have you thought about using a y_range for regression instead?

    Arguments:
        df: DataFrame with data to fit pipeline
        targ_feats: (list of) column(s) to use as input data for fitting.
            A single column name is treated as a one-element list.
        savename: if set will save the fitted Pipeline to a file with that name as Pickle (.pkl extension added automatically)
        output_pipe: if set will fit, otherwise will instantiate a new Pipeline
        norm_out: whether to apply StandardScaler to outputs. Only used if output_pipe is not set.

    Returns:
        Fitted Pipeline
    """

    if output_pipe is None:
        # Forward the caller's choice; previously this was hard-coded to True and norm_out was ignored
        _, output_pipe = get_pre_proc_pipes(norm_out=norm_out)

    # Selecting with a bare column name yields a Series, whose values are 1D; SKLearn estimators
    # expect a 2D (samples, features) array, so a single name is wrapped in a list first
    columns = [targ_feats] if isinstance(targ_feats, str) else targ_feats
    output_pipe.fit(X=df[columns].values.astype("float32"))

    if savename is not None:
        with open(f"{savename}.pkl", "wb") as fout:
            pickle.dump(output_pipe, fout)
    return output_pipe


def proc_cats(
    train_df: pd.DataFrame,
    cat_feats: List[str],
    val_df: Optional[pd.DataFrame] = None,
    test_df: Optional[pd.DataFrame] = None,
) -> Tuple[OrderedDict, OrderedDict]:
    r"""
    Process categorical features in train_df to be valued 0->cardinality-1, with codes assigned in sorted order of the categories.
    Applied inplace: each categorical column is replaced by its integer codes.
    Applies the same transformation to validation and testing data if passed.
    Will complain if the set of categories in the validation or testing data is not identical to that of the training data.

    Arguments:
        train_df: DataFrame with the training data, which will also be used to specify all the categories to consider
        cat_feats: list of columns to use as categorical features
        val_df: if set will apply the same category to code mapping to the validation data as was performed on the training data
        test_df: if set will apply the same category to code mapping to the testing data as was performed on the training data

    Returns:
        ordered dictionary mapping categorical features to dictionaries mapping codes to categories
        ordered dictionary mapping categorical features to their cardinalities

    Raises:
        ValueError: if the validation or testing data contain a different set of categories to the training data
    """

    cat_maps = OrderedDict()
    cat_szs = OrderedDict()
    other_sets = (("validation", val_df), ("testing", test_df))

    for feat in cat_feats:
        vals = sorted(set(train_df[feat]))
        cat_szs[feat] = len(vals)

        # Check every supplied dataset before modifying any of them for this feature
        for set_name, other_df in other_sets:
            if other_df is not None and sorted(set(other_df[feat])) != vals:
                raise ValueError(
                    f"Feature {feat} declared categorical, but {set_name} set contains categories different to the training set"
                )

        cat_maps[feat] = dict(enumerate(vals))  # code -> category
        code_of = {val: code for code, val in cat_maps[feat].items()}  # category -> code

        # Map all values in a single pass. Replacing one value at a time in place would merge
        # categories whenever an already-assigned code equals a raw value that has yet to be
        # replaced (e.g. categories [-1, 0]: -1 becomes 0, then every 0 becomes 1).
        train_df[feat] = train_df[feat].map(code_of)
        for _, other_df in other_sets:
            if other_df is not None:
                other_df[feat] = other_df[feat].map(code_of)

    return cat_maps, cat_szs

Docs

Access comprehensive developer and user documentation for LUMIN

View Docs

Tutorials

Get tutorials for beginner and advanced researchers demonstrating many of the features of LUMIN

View Tutorials