Shortcuts

Source code for lumin.data_processing.file_proc

import h5py
import numpy as np
import pandas as pd
from typing import List, Union, Optional, Any, Tuple, Dict
import os
from pathlib import Path
import json

from sklearn.model_selection import StratifiedKFold, KFold

__all__ = ['save_to_grp', 'fold2foldfile', 'df2foldfile', 'add_meta_data']


def save_to_grp(arr:np.ndarray, grp:h5py.Group, name:str, compression:Optional[str]=None) -> None:
    r'''
    Save Numpy array as a dataset in an h5py Group.

    Arrays of Python objects and arrays of unicode strings (of any width) are converted to fixed-width
    byte strings of length 64 ('S64') before being written; every other array is written with its own dtype.
    Strings longer than 64 bytes are therefore truncated by the conversion.

    Arguments:
        arr: array to be saved
        grp: group in which to save arr
        name: name of dataset to create
        compression: optional compression argument for h5py, e.g. 'lzf'
    '''

    # TODO Option for string length
    # Select on dtype *kind* rather than dtype *name*: the name of a unicode dtype encodes its width
    # (e.g. '<U27' is 'str864'), so a name comparison only ever matches one specific string length.
    if arr.dtype.kind in ('O', 'U'):
        data, dtype = arr.astype('S64'), 'S64'
    else:
        data, dtype = arr, arr.dtype.name
    grp.create_dataset(name, shape=arr.shape, dtype=dtype, data=data, compression=compression)
def _build_matrix_lookups(feats:List[str], vecs:List[str], feats_per_vec:List[str], row_wise:bool) -> Tuple[List[str],np.ndarray,Tuple[int,int]]: shape = (len(vecs),len(feats_per_vec)) if row_wise else (len(feats_per_vec),len(vecs)) lookup,missing = np.zeros(shape, dtype=np.array(feats).dtype),np.zeros(shape, dtype=np.bool) if row_wise: for i, v in enumerate(vecs): for j, c in enumerate(feats_per_vec): f = f'{v}_{c}' if f in feats: lookup[i,j] = f else: lookup[i,j] = feats[0] # Temp value, to be set to null later using missing missing[i,j] = True else: for j, v in enumerate(vecs): for i, c in enumerate(feats_per_vec): f = f'{v}_{c}' if f in feats: lookup[i,j] = f else: lookup[i,j] = feats[0] # Temp value, to be set to null later using missing missing[i,j] = True return list(lookup.flatten()),missing.flatten(),shape
def fold2foldfile(df:pd.DataFrame, out_file:h5py.File, fold_idx:int,
                  cont_feats:List[str], cat_feats:List[str], targ_feats:Union[str,List[str]], targ_type:Any,
                  misc_feats:Optional[List[str]]=None, wgt_feat:Optional[str]=None,
                  matrix_lookup:Optional[List[str]]=None, matrix_missing:Optional[np.ndarray]=None, matrix_shape:Optional[Tuple[int,int]]=None,
                  tensor_data:Optional[np.ndarray]=None, compression:Optional[str]=None) -> None:
    r'''
    Save fold of data into an h5py Group

    The group is named 'fold_{fold_idx}' and receives the datasets 'inputs' (continuous features followed by
    categorical features, as float32), 'targets', optionally 'weights', one dataset per entry of `misc_feats`,
    and optionally 'matrix_inputs'.

    Arguments:
        df: Dataframe from which to save data
        out_file: h5py file to save data in
        fold_idx: ID for the fold; used name h5py group according to 'fold_{fold_idx}'
        cont_feats: list of columns in df to save as continuous variables
        cat_feats: list of columns in df to save as discrete variables
        targ_feats: (list of) column(s) in df to save as target feature(s)
        targ_type: type of target feature, e.g. int,'float32'
        misc_feats: any extra columns to save; columns not present in df are reported and skipped
        wgt_feat: column to save as data weights; reported and skipped if not present in df
        matrix_lookup: flattened list of columns in df, one per matrix element, from which to build the matrix data
        matrix_missing: flattened Boolean mask over `matrix_lookup`; elements flagged True are set to NaN in the saved matrix
        matrix_shape: shape of the matrix per datapoint; the saved array has shape (len(df), *matrix_shape)
        tensor_data: data of higher order than a matrix can be passed directly as a numpy array, rather than being extracted and reshaped from the DataFrame.
            The array will be saved under matrix data, and this is incompatible with also setting `matrix_lookup`, `matrix_missing`, and `matrix_shape`.
            The first dimension of the array must be compatible with the length of the data frame.
        compression: optional compression argument for h5py, e.g. 'lzf'

    Raises:
        ValueError: if both `matrix_lookup` and `tensor_data` are set
    '''

    # TODO infer target type automatically

    # Check for conflicting arguments before touching the file, so that no partially filled group is left behind
    if matrix_lookup is not None and tensor_data is not None:
        raise ValueError("The saving of both matrix and tensor data is requested. This is ambiguous. Please only set one of the other.")

    grp = out_file.create_group(f'fold_{fold_idx}')

    save_to_grp(np.hstack((df[cont_feats].values.astype('float32'),
                           df[cat_feats].values.astype('float32'))),
                grp, 'inputs', compression=compression)
    save_to_grp(df[targ_feats].values.astype(targ_type), grp, 'targets', compression=compression)

    if wgt_feat is not None:
        if wgt_feat in df.columns: save_to_grp(df[wgt_feat].values.astype('float32'), grp, 'weights', compression=compression)
        else:                      print(f'{wgt_feat} not found in file')

    if misc_feats is not None:
        for f in misc_feats:
            if f in df.columns: save_to_grp(df[f].values, grp, f, compression=compression)
            else:               print(f'{f} not found in file')

    if matrix_lookup is not None:
        mat = df[matrix_lookup].values.astype('float32')
        mat[:,matrix_missing] = np.nan  # Placeholder columns stand in for features absent from df
        mat = mat.reshape((len(df),*matrix_shape))
        save_to_grp(mat, grp, 'matrix_inputs', compression=compression)
    elif tensor_data is not None:
        save_to_grp(tensor_data.astype('float32'), grp, 'matrix_inputs', compression=compression)
def df2foldfile(df:pd.DataFrame, n_folds:int, cont_feats:List[str], cat_feats:List[str], targ_feats:Union[str,List[str]], savename:Union[Path,str],
                targ_type:str, strat_key:Optional[str]=None, misc_feats:Optional[List[str]]=None, wgt_feat:Optional[str]=None,
                cat_maps:Optional[Dict[str,Dict[int,Any]]]=None,
                matrix_vecs:Optional[List[str]]=None, matrix_feats_per_vec:Optional[List[str]]=None, matrix_row_wise:Optional[bool]=None,
                tensor_data:Optional[np.ndarray]=None, tensor_name:Optional[str]=None, tensor_is_sparse:bool=False, compression:Optional[str]=None) -> None:
    r'''
    Convert dataframe into h5py file by splitting data into sub-folds to be accessed by a :class:`~lumin.nn.data.fold_yielder.FoldYielder`

    The data are split at random (shuffled K-fold, stratified if `strat_key` is set), each fold is written to its own group,
    and meta data describing the features are added at the end. Any existing file of the same name is replaced.

    Arguments:
        df: Dataframe from which to save data
        n_folds: number of folds to split df into
        cont_feats: list of columns in df to save as continuous variables
        cat_feats: list of columns in df to save as discrete variables
        targ_feats: (list of) column(s) in df to save as target feature(s)
        savename: name of h5py file to create; '.hdf5' is appended, so the extension should not be included. Missing parent directories are created.
        targ_type: type of target feature, e.g. int,'float32'
        strat_key: column to use for stratified splitting; if not found in df, a message is printed and non-stratified splitting is used
        misc_feats: any extra columns to save
        wgt_feat: column to save as data weights
        cat_maps: Dictionary mapping categorical features to dictionary mapping codes to categories
        matrix_vecs: list of objects for matrix encoding, i.e. feature prefixes
        matrix_feats_per_vec: list of features per vector for matrix encoding, i.e. feature suffixes.
            Features listed but not present in df will be replaced with NaN.
        matrix_row_wise: whether objects encoded as a matrix should be encoded row-wise (i.e. all the features associated with an object are in their own row),
            or column-wise (i.e. all the features associated with an object are in their own column)
        tensor_data: data of higher order than a matrix can be passed directly as a numpy array, rather than being extracted and reshaped from the DataFrame.
            The array will be saved under matrix data, and this is incompatible with also setting `matrix_vecs`, `matrix_feats_per_vec`, and `matrix_row_wise`.
            The first dimension of the array must be compatible with the length of the data frame.
        tensor_name: if `tensor_data` is set, then this is the name that will be written to the foldfile's metadata.
        tensor_is_sparse: Set to True if the matrix is in sparse COO format and should be densified later on
            The format expected is `coo_x = sparse.as_coo(x); m = np.vstack((coo_x.data, coo_x.coords))`, where `m` is the tensor passed to `tensor_data`.
        compression: optional compression argument for h5py, e.g. 'lzf'

    Raises:
        ValueError: if both `matrix_vecs` and `tensor_data` are set
    '''

    # Validate and prepare everything before touching the file system, so a bad call cannot destroy an existing file
    lookup,missing,shape = None,None,None
    if matrix_vecs is not None:
        if tensor_data is not None:
            raise ValueError("The saving of both matrix and tensor data is requested. This is ambiguous. Please only set one of the other.")
        lookup,missing,shape = _build_matrix_lookups(df.columns, matrix_vecs, matrix_feats_per_vec, matrix_row_wise)
        mat_feats = set(np.array(lookup)[np.logical_not(missing)])  # Only features present in data
        dup = [f for f in cont_feats if f in mat_feats]
        if dup:  # Even a single duplicated feature must be removed
            print(f'{dup} present in both matrix features and continuous features; removing from continuous features')
            cont_feats = [f for f in cont_feats if f not in mat_feats]

    if strat_key is not None and strat_key not in df.columns:
        print(f'{strat_key} not found in DataFrame')
        strat_key = None
    if strat_key is None:
        folds = KFold(n_splits=n_folds, shuffle=True).split(X=df)
    else:
        folds = StratifiedKFold(n_splits=n_folds, shuffle=True).split(X=df, y=df[strat_key])

    # The extension is appended to the string, not set via Path.with_suffix, so dots in savename are preserved
    out_path = Path(f'{str(savename)}.hdf5')
    out_path.parent.mkdir(parents=True, exist_ok=True)  # Parent is '.' when savename has no directory part
    if out_path.exists(): out_path.unlink()

    # Context manager ensures the file is flushed and closed even if saving a fold fails
    with h5py.File(str(out_path), "w") as out_file:
        for fold_idx, (_, fold) in enumerate(folds):
            print(f"Saving fold {fold_idx} with {len(fold)} events")
            fold2foldfile(df.iloc[fold].copy(), out_file, fold_idx, cont_feats=cont_feats, cat_feats=cat_feats, targ_feats=targ_feats,
                          targ_type=targ_type, misc_feats=misc_feats, wgt_feat=wgt_feat,
                          matrix_lookup=lookup, matrix_missing=missing, matrix_shape=shape,
                          tensor_data=tensor_data[fold] if tensor_data is not None else None,
                          compression=compression)
        add_meta_data(out_file=out_file, feats=df.columns, cont_feats=cont_feats, cat_feats=cat_feats, cat_maps=cat_maps, targ_feats=targ_feats, wgt_feat=wgt_feat,
                      matrix_vecs=matrix_vecs, matrix_feats_per_vec=matrix_feats_per_vec, matrix_row_wise=matrix_row_wise,
                      tensor_name=tensor_name, tensor_shp=tensor_data[0].shape if tensor_data is not None else None, tensor_is_sparse=tensor_is_sparse)
def add_meta_data(out_file:h5py.File, feats:List[str], cont_feats:List[str], cat_feats:List[str], cat_maps:Optional[Dict[str,Dict[int,Any]]],
                  targ_feats:Union[str,List[str]], wgt_feat:Optional[str]=None,
                  matrix_vecs:Optional[List[str]]=None, matrix_feats_per_vec:Optional[List[str]]=None, matrix_row_wise:Optional[bool]=None,
                  tensor_name:Optional[str]=None, tensor_shp:Optional[Tuple[int]]=None, tensor_is_sparse:bool=False) -> None:
    r'''
    Adds meta data to foldfile containing information about the data: feature names, matrix information, etc.
    :class:`~lumin.nn.data.fold_yielder.FoldYielder` objects will access this and automatically extract it to save the user from having to manually pass lists
    of features.

    Each entry is stored as a JSON-encoded string in its own dataset inside a new 'meta_data' group.
    Matrix information takes precedence over tensor information: the tensor entry is only written when `matrix_vecs` is None and `tensor_name` is set.

    Arguments:
        out_file: h5py file to save data in
        feats: list of all features in data
        cont_feats: list of continuous features
        cat_feats: list of categorical features
        cat_maps: Dictionary mapping categorical features to dictionary mapping codes to categories
        targ_feats: (list of) target feature(s)
        wgt_feat: name of weight feature
        matrix_vecs: list of objects for matrix encoding, i.e. feature prefixes
        matrix_feats_per_vec: list of features per vector for matrix encoding, i.e. feature suffixes.
            Features listed but not present in df will be replaced with NaN.
        matrix_row_wise: whether objects encoded as a matrix should be encoded row-wise (i.e. all the features associated with an object are in their own row),
            or column-wise (i.e. all the features associated with an object are in their own column)
        tensor_name: Name used to refer to the tensor when displaying model information
        tensor_shp: The shape of the tensor data (excluding batch dimension)
        tensor_is_sparse: Whether the tensor is sparse (COO format) and should be densified prior to use
    '''

    meta = out_file.create_group('meta_data')

    def _store(key:str, payload:Any) -> None:
        # Every piece of meta data is serialised to JSON and written as its own dataset
        meta.create_dataset(key, data=json.dumps(payload))

    for key, payload in (('cont_feats', cont_feats), ('cat_feats', cat_feats), ('targ_feats', targ_feats)):
        _store(key, payload)
    if wgt_feat is not None: _store('wgt_feat', wgt_feat)
    if cat_maps is not None: _store('cat_maps', cat_maps)

    if matrix_vecs is None:
        if tensor_name is not None:
            # Tensor data are described as a single pseudo-vector named after the tensor
            _store('matrix_feats', {'present_feats': [tensor_name], 'vecs': [tensor_name], 'missing': [],
                                    'feats_per_vec': [''], 'row_wise': None, 'shape': tensor_shp, 'is_sparse':tensor_is_sparse})
        return

    names,absent,mat_shape = _build_matrix_lookups(feats, matrix_vecs, matrix_feats_per_vec, matrix_row_wise)
    present = list(np.array(names)[~absent])  # Only features present in data
    _store('matrix_feats', {'present_feats': present, 'vecs': matrix_vecs, 'missing': list(map(int, absent)),
                            'feats_per_vec': matrix_feats_per_vec, 'row_wise': matrix_row_wise, 'shape': mat_shape})
Read the Docs v: v0.8.0
Versions
latest
stable
v0.8.0
v0.7.2
v0.7.1
v0.7.0
v0.6.0
v0.5.1
v0.5.0
v0.4.0.1
v0.3.1
Downloads
On Read the Docs
Project Home
Builds

Free document hosting provided by Read the Docs.

Docs

Access comprehensive developer and user documentation for LUMIN

View Docs

Tutorials

Get tutorials for beginner and advanced researchers demonstrating many of the features of LUMIN

View Tutorials