Source code for pyhdtoolkit.maths.stats_fitting

"""
.. _maths-stats-fitting:

Stats Fitting
-------------

Module implementing methods to find the best
fit of statistical distributions to data.
"""

from __future__ import annotations

import warnings

from typing import TYPE_CHECKING

import matplotlib.pyplot as plt  # noqa: F401 | if omitted, get AttributeError: module 'matplotlib' has no attribute 'axes'
import numpy as np
import pandas as pd
import scipy.stats as st

from loguru import logger

if TYPE_CHECKING:
    from matplotlib.axes import Axes


# Distributions to check #
DISTRIBUTIONS: dict[st.rv_continuous, str] = {
    st.chi: "Chi",
    st.chi2: "Chi-Square",
    st.expon: "Exponential",
    st.laplace: "Laplace",
    st.lognorm: "LogNorm",
    st.norm: "Normal",
}


[docs] def set_distributions_dict(dist_dict: dict[st.rv_continuous, str]) -> None: """ .. versionadded:: 0.5.0 Sets ``DISTRIBUTIONS`` as the provided `dict`. This allows the user to define the distributions to try and fit against the data. One can find an example use of this function in the :ref:`distribution fitting gallery <demo-distributions-fitting>`. Warning ------- This function modifies the global `DISTRIBUTIONS` `dict` that is used by other functions in this module. It's not the cleanest way to do things that you'll ever see. Parameters ---------- dist_dict : dict[st.rv_continuous, str] dictionnary with the wanted distributions, in the same format as the ``DISTRIBUTIONS`` dict, aka with a `scipy.stats` generator object as key, and a string representation of their name as value. Example ------- .. code-block:: python import scipy.stats as st tested_dists = {st.chi: "Chi", st.expon: "Exponential", st.laplace: "Laplace"} set_distributions_dict(tested_dists) """ # pylint: disable=global-statement logger.debug("Setting tested distributions") global DISTRIBUTIONS # noqa: PLW0603 DISTRIBUTIONS = dist_dict
[docs] def best_fit_distribution( data: pd.Series | np.ndarray, bins: int = 200, ax: Axes = None ) -> tuple[st.rv_continuous, tuple[float, ...]]: r""" .. versionadded:: 0.5.0 Model data by finding the best fit candidate distribution among those in ``DISTRIBUTIONS``. One can find an example use of this function in the :ref:`gallery <demo-distributions-fitting>`. Parameters ---------- data : Union[pd.Series, np.ndarray] A `pandas.Series` or `numpy.ndarray` with your distribution data. bins : int The number of bins to decompose your data in before fitting. ax : matplotlib.axes.Axes, optional The `matplotlib.axes.Axes` on which to plot the probability density function of the different fitted distributions. This should be provided as the axis on which the distribution data is plotted, as it will add to that plot. If not provided, no plotting will be done. Returns ------- tuple[st.rv_continuous, tuple[float, ...]] A `tuple` containing the `scipy.stats` generator corresponding to the best fit to the data among the provided candidates, and the parameters for said generator to best fit the data. Example ------- .. code-block:: python best_fit_func, best_fit_params = best_fit_distribution(data, 200, axis) """ # pylint: disable=too-many-locals logger.debug(f"Getting histogram of original data, in {bins} bins") y, x = np.histogram(data, bins=bins, density=True) x = (x + np.roll(x, -1))[:-1] / 2.0 logger.debug("Creating initial guess") best_distribution = st.norm best_params = (0.0, 1.0) best_sse = np.inf # Estimate distribution parameters from data for distribution, distname in DISTRIBUTIONS.items(): try: with warnings.catch_warnings(): # Ignore warnings from data that can't be fit warnings.filterwarnings("ignore") logger.debug(f"Trying to fit distribution '{distname}'") params = distribution.fit(data) *args, loc, scale = params logger.debug(f"Calculating PDF goodness of fit and error for distribution '{distname}'") pdf = distribution.pdf(x, *args, loc=loc, scale=scale) sse = np.sum(np.power(y - pdf, 2.0)) try: if ax: # pragma: no cover logger.debug(f"Plotting fitted PDF for distribution '{distname}'") pd.Series(pdf, x).plot(ax=ax, label=f"{distname} fit", alpha=1) except Exception: # pragma: no cover # noqa: BLE001 logger.exception(f"Plotting distribution '{distname}' failed") logger.debug(f"Identifying if distribution '{distname}' is a better fit than previous tries") if best_sse > sse > 0: best_distribution = distribution best_params = params best_sse = sse except Exception: # noqa: BLE001 # pragma: no cover logger.exception(f"Trying to fit distribution '{distname}' failed and aborted") logger.info(f"Found a best fit: '{DISTRIBUTIONS[best_distribution]}' distribution") return best_distribution, best_params
[docs] def make_pdf(distribution: st.rv_continuous, params: tuple[float, ...], size: int = 25_000) -> pd.Series: """ .. versionadded:: 0.5.0 Generates a `pandas.Series` for the distributions's Probability Distribution Function. This Series will have axis values as index, and PDF values as column. One can find an example use of this function in the :ref:`distribution fitting gallery <demo-distributions-fitting>`. Parameters ---------- distribution : scipy.stats.rv_continuous The `scipy.stats` generator for the distribution to generate the PDF from. params : tuple[float, ...] The parameters for this generator, as given back by the fit to data or as guessed from the user. size : int The number of points to evaluate the PDF on. Returns ------- pandas.Series A `pandas.Series` with the PDF as values, and the corresponding axis values as index. Example ------- .. code-block:: python best_fit_func, best_fit_params = best_fit_distribution(data, 200, axis) pdf = fitting.make_pdf(best_fit_func, best_fit_params) """ # Separate parts of parameters *args, loc, scale = params logger.debug("Getting sane start and end points of distribution") if args: start = distribution.ppf(0.01, *args, loc=loc, scale=scale) end = distribution.ppf(0.99, *args, loc=loc, scale=scale) else: # pragma: no cover start = distribution.ppf(0.01, loc=loc, scale=scale) end = distribution.ppf(0.99, loc=loc, scale=scale) logger.debug("Building PDF") x = np.linspace(start, end, size) y = distribution.pdf(x, *args, loc=loc, scale=scale) return pd.Series(y, x)