Source code for clusterking.data.dwe

#!/usr/bin/env python3

# 3rd
import numpy as np
from typing import List, Optional, Dict, Any

# ours
from clusterking.data.data import Data
from clusterking.maths.statistics import (
    cov2err,
    cov2corr,
    abs2rel_cov,
    corr2cov,
)


[docs]class DataWithErrors(Data):
    """ This class extends the :py:class:`~clusterking.data.Data` class by
    convenient and performant ways to add errors to the distributions.

    See the description of the :py:class:`~clusterking.data.Data` class for more
    information about the data structure itself.

    There are three basic ways to add errors:

    1. Add relative errors (with correlation) relative to the bin content of
       each bin in the distribution: ``add_rel_err_...``
       (:math:`\\mathrm{Cov}^{(k)}_{\\text{rel}}(i, j)`)
    2. Add absolute errors (with correlation): ``add_err_...``
       (:math:`\\mathrm{Cov}^{(k)}_{\\text{abs}}(i, j)`)
    3. Add poisson errors:
       :py:meth:`.add_err_poisson`

    The covariance matrix for bin i and j of distribution n
    (with contents :math:`d^{(n)}_i`) will then
    be

    .. math::

        \\mathrm{Cov}(d^{(n)}_i, d^{(n)}_j)  =
                &\\sum_{k}\\mathrm{Cov}_{\\text{rel}}^{(k)}(i, j)
                \\cdot d^{(n)}_i d^{(n)}_j + \\\\
            + &\\sum_k\\mathrm{Cov}_{\\text{abs}}^{(k)}(i, j) + \\\\
            + &\\delta_{ij} \\sqrt{d^{(n)}_i d^{(n)}_j} / \\sqrt{s}

    .. note::
        All of these methods add the errors in a consistent way for all sample
        points/distributions, i.e. it is impossible to add a certain error
        specifically to one sample point only!

    Afterwards, you can get errors, correlation and covariance matrices for
    every data point by using one of the methods such as
    :meth:`.cov`, :meth:`.corr`, :meth:`err`.

    .. note::
        When saving your dataset, your error configuration is saved as well,
        so you can reload it like any other :class:`~clusterking.data.Data`
        or :class:`~clusterking.data.DFMD` object.

    .. warning::
        The appendix of our paper mistakenly hinted at the unit of the
        relative uncertainties being in percent. This is not the case.
        That means that ``d.add_rel_err_uncorr(0.1)`` adds a 10% relative
        uncertainty, not 0.1%.

    Args:
        data: n x nbins matrix
    """

[docs]    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Initialize some values to their default
        # [Defined as properties, documented below]
        self.rel_cov = None
        self.abs_cov = None
        self.poisson_errors = False
        self.poisson_errors_scale = 1.0

    # **************************************************************************
    # Properties
    # **************************************************************************

    @property
    def rel_cov(self):
        """Relative covariance matrix that will be later applied to the data
        (see class documentation).

        .. math::

            \\mathrm{Cov}_{\\text{rel}}(i, j)
            = \\sum_k\\mathrm{Cov}_{\\text{rel}}^{(k)}(i, j)

        If no errors have been added, this is defined to be a zero matrix.

        Returns:
            ``self.nbins * self.nbins`` matrix
        """
        value = self.md["errors"]["rel_cov"]
        if value is None:
            if self.nbins:
                return np.zeros((self.nbins, self.nbins))
            else:
                return None
        return np.array(value)

    @rel_cov.setter
    def rel_cov(self, value):
        if isinstance(value, np.ndarray):
            # convert to list in order to serialize
            value = value.tolist()
        self.md["errors"]["rel_cov"] = value

    @property
    def abs_cov(self):
        """Absolute covariance matrix that will be later applied to the data
        (see class documentation).

        .. math::

            \\mathrm{Cov}_{\\text{abs}}(i, j)
            = \\sum_k\\mathrm{Cov}_{\\text{abs}}^{(k)}(i, j)

        If no errors have been added, this is defined to be a zero matrix.

        Returns:
            ``self.nbins * self.nbins`` matrix
        """
        value = self.md["errors"]["abs_cov"]
        if value is None:
            if self.nbins:
                return np.zeros((self.nbins, self.nbins))
            else:
                return None
        return np.array(value)

    @abs_cov.setter
    def abs_cov(self, value):
        if isinstance(value, np.ndarray):
            # convert to list in order to serialize
            value = value.tolist()
        self.md["errors"]["abs_cov"] = value

    @property
    def poisson_errors(self) -> bool:
        """Should poisson errors be added?"""
        return self.md["errors"]["poisson"]

    @poisson_errors.setter
    def poisson_errors(self, value):
        self.md["errors"]["poisson"] = value

    @property
    def poisson_errors_scale(self) -> float:
        """Scale poisson errors. See documentation of :meth:`add_err_poisson`."""
        return self.md["errors"]["poisson_scale"]

    @poisson_errors_scale.setter
    def poisson_errors_scale(self, value):
        self.md["errors"]["poisson_scale"] = value

    # **************************************************************************
    # Internal helper functions
    # **************************************************************************

    def _interpret_input(self, inpt, what: str) -> np.ndarray:
        """Interpret user input

        Args:
            inpt: User input
            what: 'err', 'corr', 'cov'

        Returns:
            correctly shaped numpy array
        """
        inpt = np.array(inpt, float)
        if what.lower() in ["err"]:
            if inpt.ndim == 0:
                return np.tile(inpt, self.nbins)
            if inpt.ndim == 1:
                return inpt
            else:
                raise ValueError(
                    "Wrong dimension ({}) of {} array.".format(inpt.ndim, what)
                )
        elif what.lower() in ["corr", "cov"]:
            if inpt.ndim == 2:
                return inpt
            else:
                raise ValueError(
                    "Wrong dimension ({}) of {} array.".format(inpt.ndim, what)
                )
        else:
            raise ValueError("Unknown value what='{}'.".format(what))

    # **************************************************************************
    # Actual calculations
    # **************************************************************************

[docs]    def cov(self, relative=False) -> np.ndarray:
        """Return covariance matrix :math:`\\mathrm{Cov}(d^{(n)}_i, d^{(n)}_j)`

        If no errors have been added, a zero matrix is returned.

        Args:
            relative: "Relative to data", i.e.
                :math:`\\mathrm{Cov}(d^{(n)}_i, d^{(n)}_j) /
                (d^{(n)}_i \\cdot d^{(n)}_j)`

        Returns:
            ``self.n x self.nbins x self.nbins`` array
        """

        data = self.data()
        cov = np.tile(self.abs_cov, (self.n, 1, 1))
        cov += np.einsum("ij,ki,kj->kij", self.rel_cov, data, data)
        if self.poisson_errors:
            cov += corr2cov(
                np.tile(np.eye(self.nbins), (self.n, 1, 1)),
                # Normal poisson errors are sqrt(data_i). What happens if
                # data is normalized from N to N', i.e.
                #   Sum(data_normalized) = N'?
                # Let N/N' = scale
                # Then the errors should be
                #   sqrt(data) / scale = sqrt(data/scale) / sqrt(scale) =
                #   = sqrt(data_normalized) / sqrt(scale)
                # Hence:
                np.sqrt(data) / np.sqrt(self.poisson_errors_scale),
            )

        if not relative:
            return cov
        else:
            return abs2rel_cov(cov, data)

[docs]    def corr(self) -> np.ndarray:
        """Return correlation matrix. If covariance matrix is empty (because
        no errors have been added), a unit matrix is returned.

        Returns:
            ``self.n x self.nbins x self.nbins`` array
        """
        if np.sum(np.abs(self.cov())) == 0.0:
            return np.tile(np.eye(self.nbins), (self.n, 1, 1))
        return cov2corr(self.cov())

[docs]    def err(self, relative=False) -> np.ndarray:
        """Return errors per bin, i.e.
        :math:`e_i^{(n)} = \\sqrt{\\mathrm{Cov}(d^{(n)}_i, d^{(n)}_i)}`

        Args:
            relative: Relative errors, i.e. :math:`e_i^{(n)}/d_i^{(n)}`

        Returns:
            ``self.n x self.nbins`` array
        """
        if not relative:
            return cov2err(self.cov())
        else:
            return cov2err(self.cov()) / self.data()

    # **************************************************************************
    # Configuration
    # **************************************************************************

[docs]    def reset_errors(self) -> None:
        """Set all errors back to 0.

        Returns:
            None
        """
        self.rel_cov = None
        self.abs_cov = None
        self.poisson_errors = False
        self.poisson_errors_scale = 1

    # -------------------------------------------------------------------------
    # Add absolute errors
    # -------------------------------------------------------------------------

[docs]    def add_err_cov(self, cov) -> None:
        """Add error from covariance matrix.

        Args:
            cov: ``self.n x self.nbins x self.nbins`` array of covariance
                matrices or self.nbins x self.nbins covariance matrix (if equal
                for all data points)
        """
        cov = self._interpret_input(cov, "cov")
        self.abs_cov += cov

[docs]    def add_err_corr(self, err, corr) -> None:
        """Add error from errors vector and correlation matrix.

        Args:
            err: ``self.n x self.nbins`` vector of errors for each data point
                and bin or self.nbins vector of uniform errors per data point or
                float (uniform error per bin and datapoint)
            corr: ``self.n x self.nbins x self.nbins`` correlation matrices
                or ``self.nbins x self.nbins`` correlation matrix
        """
        err = self._interpret_input(err, "err")
        corr = self._interpret_input(corr, "corr")
        self.add_err_cov(corr2cov(corr, err))

[docs]    def add_err_uncorr(self, err) -> None:
        """
        Add uncorrelated error.

        Args:
            err: see argument of :py:meth:`.add_err_corr`
        """
        err = self._interpret_input(err, "err")
        corr = np.identity(self.nbins)
        self.add_err_corr(err, corr)

[docs]    def add_err_maxcorr(self, err) -> None:
        """
        Add maximally correlated error.

        Args:
            err: see argument of :py:meth:`.add_err_corr`
        """
        err = self._interpret_input(err, "err")
        corr = np.ones((self.nbins, self.nbins))
        self.add_err_corr(err, corr)

    # -------------------------------------------------------------------------
    # Add relative errors
    # -------------------------------------------------------------------------

[docs]    def add_rel_err_cov(self, cov) -> None:
        """
        Add error from "relative" covariance matrix

        Args:
            cov: see argument of :py:meth:`.add_err_cov`
        """
        cov = self._interpret_input(cov, "cov")
        self.rel_cov += cov

[docs]    def add_rel_err_corr(self, err, corr) -> None:
        """
        Add error from relative errors and correlation matrix.
        ``err=0.1`` means 10% uncertainty.

        Args:
            err: see argument of :py:meth:`.add_err_corr`
            corr: see argument of :py:meth:`.add_err_corr`
        """
        err = self._interpret_input(err, "err")
        corr = self._interpret_input(corr, "corr")
        self.add_rel_err_cov(corr2cov(corr, err))

[docs]    def add_rel_err_uncorr(self, err) -> None:
        """
        Add uncorrelated relative uncertainty. ``err=0.1`` means 10% uncertainty.

        Args:
            err: see argument of
                :py:meth:`.add_err_corr`
        """
        err = self._interpret_input(err, "err")
        corr = np.identity(self.nbins)
        self.add_rel_err_corr(err, corr)

[docs]    def add_rel_err_maxcorr(self, err) -> None:
        """
        Add maximally correlated relative error. ``err=0.1`` means 10%
        uncertainty.

        Args:
            err: see argument of :py:meth:`.add_err_corr`
        """
        err = self._interpret_input(err, "err")
        corr = np.ones((self.nbins, self.nbins))
        self.add_rel_err_corr(err, corr)

    # -------------------------------------------------------------------------
    # Other forms of errors
    # -------------------------------------------------------------------------

[docs]    def add_err_poisson(self, normalization_scale=1) -> None:
        """
        Add poisson errors/statistical errors.

        Args:
            normalization_scale: Apply poisson errors corresponding to data
                normalization scaled up by this factor. For example, if your
                data is normalized to 1 and you still want to apply Poisson
                errors that correspond to a yield of 200, you can call
                ``add_err_poisson(200)``. Your data will stay normalized, but
                the poisson errors are appropriate for a total yield of 200.

        Returns:
            None
        """
        if self.poisson_errors:
            self.log.warning("Poisson errors had already been added before.")
            if normalization_scale != self.poisson_errors_scale:
                self.log.warning(
                    "However we CHANGED (not added to) the scaling of the "
                    "Poisson errors."
                )
        self.poisson_errors = True
        self.poisson_errors_scale = normalization_scale

    # **************************************************************************
    # Quick plots
    # **************************************************************************

[docs]    def plot_dist_err(
        self,
        cluster_column="cluster",
        bpoint_column="bpoint",
        title: Optional[str] = None,
        clusters: Optional[List[int]] = None,
        bpoints=True,
        legend=True,
        hist_kwargs: Optional[Dict[str, Any]] = None,
        hist_fill_kwargs: Optional[Dict[str, Any]] = None,
        ax=None,
    ):
        """Plot distribution with errors.

        Args:
            cluster_column: Column with the cluster names (default 'cluster')
            bpoint_column: Column with bpoints (default 'bpoint')
            title: Plot title (``None``: automatic)
            clusters: List of clusters to selected or single cluster.
                If ``None`` (default), all clusters are chosen.
            bpoints: Draw benchmark points if available (default True). If
                false or not benchmark points are available, pick a random
                sample point for each cluster.
            legend: Draw legend? (default True)
            hist_kwargs: Keyword arguments to
                :meth:`~clusterking.plots.plot_histogram.plot_histogram`
            hist_fill_kwargs: Keyword arguments to
                :meth:`~clusterking.plots.plot_histogram.plot_histogram_fill`
            ax: Instance of `matplotlib.axes.Axes` to plot on. If ``None``, a
                new one is instantiated.

        Note: To customize these kind of plots further, check the
        :py:class:`~clusterking.plots.plot_bundles.BundlePlot` class and the
        :py:meth:`~clusterking.plots.plot_bundles.BundlePlot.err_plot`
        method thereof.

        Returns:
            Figure
        """
        from clusterking.plots.plot_bundles import BundlePlot

        bp = BundlePlot(self)
        bp.cluster_column = cluster_column
        bp.bpoint_column = bpoint_column
        bp.title = title
        bp.draw_legend = legend
        bp.err_plot(
            clusters=clusters,
            bpoints=bpoints,
            hist_kwargs=hist_kwargs,
            hist_fill_kwargs=hist_fill_kwargs,
            ax=ax,
        )
        return bp.fig
ClusterKinG 1.1.0 documentation

Source code for clusterking.data.dwe