Source code for lumin.utils.statistics
import math
import multiprocessing as mp
from typing import Any, Dict, Optional, Tuple, Union
import numpy as np
from statsmodels.nonparametric.kde import KDEUnivariate
__all__ = ["bootstrap_stats", "get_moments", "uncert_round"]
[docs]def bootstrap_stats(args: Dict[str, Any], out_q: Optional[mp.Queue] = None) -> Union[None, Dict[str, Any]]:
r"""
Computes statistics and KDEs of data via sampling with replacement
Arguments:
args: dictionary of arguments. Possible keys are:
data - data to resample
name - name prepended to returned keys in result dict
weights - array of weights matching length of data to use for weighted resampling
n - number of times to resample data
x - points at which to compute the kde values of resample data
kde - whether to compute the kde values at x-points for resampled data
mean - whether to compute the means of the resampled data
std - whether to compute standard deviation of resampled data
c68 - whether to compute the width of the absolute central 68.2 percentile of the resampled data
out_q: if using multiporcessing can place result dictionary in provided queue
Returns:
Result dictionary if `out_q` is `None` else `None`.
"""
out_dict, mean, median, std, c68, boot = {}, [], [], [], [], []
name = "" if "name" not in args else args["name"]
weights = None if "weights" not in args else args["weights"]
if "n" not in args:
args["n"] = 100
if "kde" not in args:
args["kde"] = False
if "mean" not in args:
args["mean"] = False
if "median" not in args:
args["median"] = False
if "std" not in args:
args["std"] = False
if "c68" not in args:
args["c68"] = False
if args["kde"] and args["data"].dtype != "float64":
data = np.array(args["data"], dtype="float64")
else:
data = args["data"]
len_d = len(data)
np.random.seed()
for i in range(args["n"]):
points = np.random.choice(data, len_d, replace=True, p=weights)
if args["kde"]:
kde = KDEUnivariate(points)
kde.fit()
boot.append([kde.evaluate(x) for x in args["x"]])
if args["mean"]:
mean.append(np.mean(points))
if args["median"]:
median.append(np.median(points))
if args["std"]:
std.append(np.std(points, ddof=1))
if args["c68"]:
c68.append((np.percentile(points, 84.135) - np.percentile(points, 15.865)) / 2)
if args["kde"]:
out_dict[f"{name}_kde"] = boot
if args["mean"]:
out_dict[f"{name}_mean"] = mean
if args["median"]:
out_dict[f"{name}_median"] = median
if args["std"]:
out_dict[f"{name}_std"] = std
if args["c68"]:
out_dict[f"{name}_c68"] = c68
if out_q is not None:
out_q.put(out_dict)
else:
return out_dict
[docs]def get_moments(arr: np.ndarray) -> Tuple[float, float, float, float]:
r"""
Computes mean and std of data, and their associated uncertainties
Arguments:
arr: univariate data
Returns:
- mean
- statistical uncertainty of mean
- standard deviation
- statistical uncertainty of standard deviation
"""
n = len(arr)
m = np.mean(arr)
m_4 = np.mean((arr - m) ** 4)
s = np.std(arr, ddof=1)
s4 = s**4
se_s2 = ((m_4 - (s4 * (n - 3) / (n - 1))) / n) ** 0.25
se_s = se_s2 / (2 * s)
return m, s / np.sqrt(n), s, se_s
[docs]def uncert_round(value: float, uncert: float) -> Tuple[float, float]:
r"""
Round value according to given uncertainty using one significant figure of the uncertainty
Arguments:
value: value to round
uncert: uncertainty of value
Returns:
- rounded value
- rounded uncertainty
"""
if uncert == math.inf:
return value, uncert
uncert = np.nan_to_num(uncert)
if uncert == 0:
return value, uncert
factor = 1.0
while uncert / factor > 1:
factor *= 10.0
value /= factor
uncert /= factor
i = 0
while uncert * (10**i) <= 1:
i += 1
round_uncert = factor * round(uncert, i)
round_value = factor * round(value, i)
if int(round_uncert) == round_uncert:
round_uncert = int(round_uncert)
round_value = int(round_value)
return round_value, round_uncert