#!/usr/bin/env python3
# 3rd
import numpy as np
from typing import List, Optional, Dict, Any
# ours
from clusterking.data.data import Data
from clusterking.maths.statistics import (
cov2err,
cov2corr,
abs2rel_cov,
corr2cov,
)
[docs]class DataWithErrors(Data):
""" This class extends the :py:class:`~clusterking.data.Data` class by
convenient and performant ways to add errors to the distributions.
See the description of the :py:class:`~clusterking.data.Data` class for more
information about the data structure itself.
There are three basic ways to add errors:
1. Add relative errors (with correlation) relative to the bin content of
each bin in the distribution: ``add_rel_err_...``
(:math:`\\mathrm{Cov}^{(k)}_{\\text{rel}}(i, j)`)
2. Add absolute errors (with correlation): ``add_err_...``
(:math:`\\mathrm{Cov}^{(k)}_{\\text{abs}}(i, j)`)
3. Add poisson errors:
:py:meth:`.add_err_poisson`
The covariance matrix for bin i and j of distribution n
(with contents :math:`d^{(n)}_i`) will then
be
.. math::
\\mathrm{Cov}(d^{(n)}_i, d^{(n)}_j) =
&\\sum_{k}\\mathrm{Cov}_{\\text{rel}}^{(k)}(i, j)
\\cdot d^{(n)}_i d^{(n)}_j + \\\\
+ &\\sum_k\\mathrm{Cov}_{\\text{abs}}^{(k)}(i, j) + \\\\
+ &\\delta_{ij} \\sqrt{d^{(n)}_i d^{(n)}_j} / \\sqrt{s}
.. note::
All of these methods add the errors in a consistent way for all sample
points/distributions, i.e. it is impossible to add a certain error
specifically to one sample point only!
Afterwards, you can get errors, correlation and covariance matrices for
every data point by using one of the methods such as
:meth:`.cov`, :meth:`.corr`, :meth:`err`.
.. note::
When saving your dataset, your error configuration is saved as well,
so you can reload it like any other :class:`~clusterking.data.Data`
or :class:`~clusterking.data.DFMD` object.
.. warning::
The appendix of our paper mistakenly hinted at the unit of the
relative uncertainties being in percent. This is not the case.
That means that ``d.add_rel_err_uncorr(0.1)`` adds a 10% relative
uncertainty, not 0.1%.
Args:
data: n x nbins matrix
"""
[docs] def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Initialize some values to their default
# [Defined as properties, documented below]
self.rel_cov = None
self.abs_cov = None
self.poisson_errors = False
self.poisson_errors_scale = 1.0
# **************************************************************************
# Properties
# **************************************************************************
@property
def rel_cov(self):
"""Relative covariance matrix that will be later applied to the data
(see class documentation).
.. math::
\\mathrm{Cov}_{\\text{rel}}(i, j)
= \\sum_k\\mathrm{Cov}_{\\text{rel}}^{(k)}(i, j)
If no errors have been added, this is defined to be a zero matrix.
Returns:
``self.nbins * self.nbins`` matrix
"""
value = self.md["errors"]["rel_cov"]
if value is None:
if self.nbins:
return np.zeros((self.nbins, self.nbins))
else:
return None
return np.array(value)
@rel_cov.setter
def rel_cov(self, value):
if isinstance(value, np.ndarray):
# convert to list in order to serialize
value = value.tolist()
self.md["errors"]["rel_cov"] = value
@property
def abs_cov(self):
"""Absolute covariance matrix that will be later applied to the data
(see class documentation).
.. math::
\\mathrm{Cov}_{\\text{abs}}(i, j)
= \\sum_k\\mathrm{Cov}_{\\text{abs}}^{(k)}(i, j)
If no errors have been added, this is defined to be a zero matrix.
Returns:
``self.nbins * self.nbins`` matrix
"""
value = self.md["errors"]["abs_cov"]
if value is None:
if self.nbins:
return np.zeros((self.nbins, self.nbins))
else:
return None
return np.array(value)
@abs_cov.setter
def abs_cov(self, value):
if isinstance(value, np.ndarray):
# convert to list in order to serialize
value = value.tolist()
self.md["errors"]["abs_cov"] = value
@property
def poisson_errors(self) -> bool:
"""Should poisson errors be added?"""
return self.md["errors"]["poisson"]
@poisson_errors.setter
def poisson_errors(self, value):
self.md["errors"]["poisson"] = value
@property
def poisson_errors_scale(self) -> float:
"""Scale poisson errors. See documentation of :meth:`add_err_poisson`."""
return self.md["errors"]["poisson_scale"]
@poisson_errors_scale.setter
def poisson_errors_scale(self, value):
self.md["errors"]["poisson_scale"] = value
# **************************************************************************
# Internal helper functions
# **************************************************************************
def _interpret_input(self, inpt, what: str) -> np.ndarray:
"""Interpret user input
Args:
inpt: User input
what: 'err', 'corr', 'cov'
Returns:
correctly shaped numpy array
"""
inpt = np.array(inpt, float)
if what.lower() in ["err"]:
if inpt.ndim == 0:
return np.tile(inpt, self.nbins)
if inpt.ndim == 1:
return inpt
else:
raise ValueError(
"Wrong dimension ({}) of {} array.".format(inpt.ndim, what)
)
elif what.lower() in ["corr", "cov"]:
if inpt.ndim == 2:
return inpt
else:
raise ValueError(
"Wrong dimension ({}) of {} array.".format(inpt.ndim, what)
)
else:
raise ValueError("Unknown value what='{}'.".format(what))
# **************************************************************************
# Actual calculations
# **************************************************************************
[docs] def cov(self, relative=False) -> np.ndarray:
"""Return covariance matrix :math:`\\mathrm{Cov}(d^{(n)}_i, d^{(n)}_j)`
If no errors have been added, a zero matrix is returned.
Args:
relative: "Relative to data", i.e.
:math:`\\mathrm{Cov}(d^{(n)}_i, d^{(n)}_j) /
(d^{(n)}_i \\cdot d^{(n)}_j)`
Returns:
``self.n x self.nbins x self.nbins`` array
"""
data = self.data()
cov = np.tile(self.abs_cov, (self.n, 1, 1))
cov += np.einsum("ij,ki,kj->kij", self.rel_cov, data, data)
if self.poisson_errors:
cov += corr2cov(
np.tile(np.eye(self.nbins), (self.n, 1, 1)),
# Normal poisson errors are sqrt(data_i). What happens if
# data is normalized from N to N', i.e.
# Sum(data_normalized) = N'?
# Let N/N' = scale
# Then the errors should be
# sqrt(data) / scale = sqrt(data/scale) / sqrt(scale) =
# = sqrt(data_normalized) / sqrt(scale)
# Hence:
np.sqrt(data) / np.sqrt(self.poisson_errors_scale),
)
if not relative:
return cov
else:
return abs2rel_cov(cov, data)
[docs] def corr(self) -> np.ndarray:
"""Return correlation matrix. If covariance matrix is empty (because
no errors have been added), a unit matrix is returned.
Returns:
``self.n x self.nbins x self.nbins`` array
"""
if np.sum(np.abs(self.cov())) == 0.0:
return np.tile(np.eye(self.nbins), (self.n, 1, 1))
return cov2corr(self.cov())
[docs] def err(self, relative=False) -> np.ndarray:
"""Return errors per bin, i.e.
:math:`e_i^{(n)} = \\sqrt{\\mathrm{Cov}(d^{(n)}_i, d^{(n)}_i)}`
Args:
relative: Relative errors, i.e. :math:`e_i^{(n)}/d_i^{(n)}`
Returns:
``self.n x self.nbins`` array
"""
if not relative:
return cov2err(self.cov())
else:
return cov2err(self.cov()) / self.data()
# **************************************************************************
# Configuration
# **************************************************************************
[docs] def reset_errors(self) -> None:
"""Set all errors back to 0.
Returns:
None
"""
self.rel_cov = None
self.abs_cov = None
self.poisson_errors = False
self.poisson_errors_scale = 1
# -------------------------------------------------------------------------
# Add absolute errors
# -------------------------------------------------------------------------
[docs] def add_err_cov(self, cov) -> None:
"""Add error from covariance matrix.
Args:
cov: ``self.n x self.nbins x self.nbins`` array of covariance
matrices or self.nbins x self.nbins covariance matrix (if equal
for all data points)
"""
cov = self._interpret_input(cov, "cov")
self.abs_cov += cov
[docs] def add_err_corr(self, err, corr) -> None:
"""Add error from errors vector and correlation matrix.
Args:
err: ``self.n x self.nbins`` vector of errors for each data point
and bin or self.nbins vector of uniform errors per data point or
float (uniform error per bin and datapoint)
corr: ``self.n x self.nbins x self.nbins`` correlation matrices
or ``self.nbins x self.nbins`` correlation matrix
"""
err = self._interpret_input(err, "err")
corr = self._interpret_input(corr, "corr")
self.add_err_cov(corr2cov(corr, err))
[docs] def add_err_uncorr(self, err) -> None:
"""
Add uncorrelated error.
Args:
err: see argument of :py:meth:`.add_err_corr`
"""
err = self._interpret_input(err, "err")
corr = np.identity(self.nbins)
self.add_err_corr(err, corr)
[docs] def add_err_maxcorr(self, err) -> None:
"""
Add maximally correlated error.
Args:
err: see argument of :py:meth:`.add_err_corr`
"""
err = self._interpret_input(err, "err")
corr = np.ones((self.nbins, self.nbins))
self.add_err_corr(err, corr)
# -------------------------------------------------------------------------
# Add relative errors
# -------------------------------------------------------------------------
[docs] def add_rel_err_cov(self, cov) -> None:
"""
Add error from "relative" covariance matrix
Args:
cov: see argument of :py:meth:`.add_err_cov`
"""
cov = self._interpret_input(cov, "cov")
self.rel_cov += cov
[docs] def add_rel_err_corr(self, err, corr) -> None:
"""
Add error from relative errors and correlation matrix.
``err=0.1`` means 10% uncertainty.
Args:
err: see argument of :py:meth:`.add_err_corr`
corr: see argument of :py:meth:`.add_err_corr`
"""
err = self._interpret_input(err, "err")
corr = self._interpret_input(corr, "corr")
self.add_rel_err_cov(corr2cov(corr, err))
[docs] def add_rel_err_uncorr(self, err) -> None:
"""
Add uncorrelated relative uncertainty. ``err=0.1`` means 10% uncertainty.
Args:
err: see argument of
:py:meth:`.add_err_corr`
"""
err = self._interpret_input(err, "err")
corr = np.identity(self.nbins)
self.add_rel_err_corr(err, corr)
[docs] def add_rel_err_maxcorr(self, err) -> None:
"""
Add maximally correlated relative error. ``err=0.1`` means 10%
uncertainty.
Args:
err: see argument of :py:meth:`.add_err_corr`
"""
err = self._interpret_input(err, "err")
corr = np.ones((self.nbins, self.nbins))
self.add_rel_err_corr(err, corr)
# -------------------------------------------------------------------------
# Other forms of errors
# -------------------------------------------------------------------------
[docs] def add_err_poisson(self, normalization_scale=1) -> None:
"""
Add poisson errors/statistical errors.
Args:
normalization_scale: Apply poisson errors corresponding to data
normalization scaled up by this factor. For example, if your
data is normalized to 1 and you still want to apply Poisson
errors that correspond to a yield of 200, you can call
``add_err_poisson(200)``. Your data will stay normalized, but
the poisson errors are appropriate for a total yield of 200.
Returns:
None
"""
if self.poisson_errors:
self.log.warning("Poisson errors had already been added before.")
if normalization_scale != self.poisson_errors_scale:
self.log.warning(
"However we CHANGED (not added to) the scaling of the "
"Poisson errors."
)
self.poisson_errors = True
self.poisson_errors_scale = normalization_scale
# **************************************************************************
# Quick plots
# **************************************************************************
[docs] def plot_dist_err(
self,
cluster_column="cluster",
bpoint_column="bpoint",
title: Optional[str] = None,
clusters: Optional[List[int]] = None,
bpoints=True,
legend=True,
hist_kwargs: Optional[Dict[str, Any]] = None,
hist_fill_kwargs: Optional[Dict[str, Any]] = None,
ax=None,
):
"""Plot distribution with errors.
Args:
cluster_column: Column with the cluster names (default 'cluster')
bpoint_column: Column with bpoints (default 'bpoint')
title: Plot title (``None``: automatic)
clusters: List of clusters to selected or single cluster.
If ``None`` (default), all clusters are chosen.
bpoints: Draw benchmark points if available (default True). If
false or not benchmark points are available, pick a random
sample point for each cluster.
legend: Draw legend? (default True)
hist_kwargs: Keyword arguments to
:meth:`~clusterking.plots.plot_histogram.plot_histogram`
hist_fill_kwargs: Keyword arguments to
:meth:`~clusterking.plots.plot_histogram.plot_histogram_fill`
ax: Instance of `matplotlib.axes.Axes` to plot on. If ``None``, a
new one is instantiated.
Note: To customize these kind of plots further, check the
:py:class:`~clusterking.plots.plot_bundles.BundlePlot` class and the
:py:meth:`~clusterking.plots.plot_bundles.BundlePlot.err_plot`
method thereof.
Returns:
Figure
"""
from clusterking.plots.plot_bundles import BundlePlot
bp = BundlePlot(self)
bp.cluster_column = cluster_column
bp.bpoint_column = bpoint_column
bp.title = title
bp.draw_legend = legend
bp.err_plot(
clusters=clusters,
bpoints=bpoints,
hist_kwargs=hist_kwargs,
hist_fill_kwargs=hist_fill_kwargs,
ax=ax,
)
return bp.fig