Source code for clusterking.maths.metric

#!/usr/bin/env python3

# 3rd
import numpy as np
import scipy.spatial
import functools
from typing import Callable

# ours
from clusterking.data.dwe import DataWithErrors


def condense_distance_matrix(matrix):
    """ Convert a square-form distance matrix to a vector-form distance vector

    Args:
        matrix: n x n symmetric matrix with 0 diagonal

    Returns:
        n choose 2 vector

    Raises:
        ValueError: If ``matrix`` is not two-dimensional (for example because
            it is already a condensed vector) or fails scipy's validity
            checks for a distance matrix.
    """
    # squareform() converts in *either* direction depending on the rank of its
    # input. Pinning the direction makes an already-condensed vector an error
    # instead of silently expanding it back into a square matrix.
    return scipy.spatial.distance.squareform(matrix, force="tovector")


def uncondense_distance_matrix(vector):
    """ Convert a vector-form distance vector to a square-form distance matrix

    Args:
        vector: n choose 2 vector

    Returns:
        n x n symmetric matrix with 0 diagonal

    Raises:
        ValueError: If ``vector`` is not one-dimensional (for example because
            it is already a square matrix) or its length is not a valid
            "n choose 2" number.
    """
    # squareform() converts in *either* direction depending on the rank of its
    # input. Pinning the direction makes an already-square matrix an error
    # instead of silently condensing it.
    return scipy.spatial.distance.squareform(vector, force="tomatrix")


def _call_metric_data_first(metric, extra_args, extra_kwargs, data):
    """ Call ``metric`` with ``data`` in front of the stored extra arguments. """
    return metric(data, *extra_args, **extra_kwargs)


def metric_selection(*args, **kwargs) -> Callable:
    """ Select a metric in one of the following ways:

    1. If no positional arguments are given, we choose the euclidean metric.
    2. If the first positional argument is a string, we pick one of the
       metrics that are defined in ``scipy.spatial.distance.pdist`` by that
       name (all additional arguments will be passed to this function).
    3. If the first positional argument is a function, we take this function.
       It is called with the data object as its first argument, followed by
       all additional positional and keyword arguments.

    Examples:

    * ``...()``: Euclidean metric
    * ``...("euclidean")``: Also Euclidean metric
    * ``...(lambda data: scipy.spatial.distance.pdist(data.data(),
      'euclidean'))``: Also Euclidean metric
    * ``...("minkowski", p=2)``: Minkowski distance with ``p=2``.

    See
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
    for more information.

    Args:
        *args: Optional metric (name or callable), followed by additional
            positional arguments that are forwarded to the metric.
        **kwargs: Additional keyword arguments that are forwarded to the
            metric.

    Returns:
        Function that takes Data object as only parameter and returns a
        reduced distance matrix.

    Raises:
        ValueError: If the first positional argument is neither a string nor
            a callable.
    """
    # Fall back to the euclidean metric if nothing was specified.
    metric, *extra = args or ("euclidean",)

    if isinstance(metric, str):
        # The user can specify any of the metrics from
        # scipy.spatial.distance.pdist by name and supply additional
        # values
        return lambda data: scipy.spatial.distance.pdist(
            data.data(),
            metric,
            *extra,
            **kwargs
        )

    if callable(metric):
        # The metric takes DWE or Data as its *first* argument. A plain
        # functools.partial(metric, *extra) would put the extra positional
        # arguments in front of the data object, so we bind them through a
        # module level helper that inserts the data first.
        return functools.partial(
            _call_metric_data_first, metric, tuple(extra), dict(kwargs)
        )

    raise ValueError(
        "Invalid type of first argument: {}".format(type(metric))
    )


# todo: unittest
def chi2_metric(dwe: DataWithErrors, output='condensed'):
    """
    Returns the chi2/ndf values of the pairwise comparison of all datasets.

    For every pair of datasets ``(k, l)`` and every bin ``i`` this evaluates

        (n_k * d_li - n_l * d_ki)^2 / ((n_k * e_li)^2 + (n_l * e_ki)^2)

    sums over the bins and divides by the number of bins. Here ``n`` are
    the norms, ``d`` the decorrelated data and ``e`` the errors as provided
    by ``dwe``.

    Note that the intermediate arrays have the shape n x n x nbins and that
    there is no special treatment of bins in which the denominator vanishes.

    Args:
        dwe: Object providing ``norms()``, ``data(decorrelate=True)``,
            ``err()`` and ``nbins``.
        output: 'condensed' (condensed distance matrix) or 'full' (full distance
            matrix)

    Returns:
        Condensed distance matrix (``output='condensed'``) or full n x n
        distance matrix (``output='full'``)

    Raises:
        ValueError: If ``output`` is neither 'condensed' nor 'full'.
    """
    # https://root.cern.ch/doc/master/classTH1.html#a6c281eebc0c0a848e7a0d620425090a5

    # Reject a bad output mode before doing any of the expensive work below.
    if output not in ('condensed', 'full'):
        raise ValueError("Unknown argument '{}'.".format(output))

    # n vector
    n = dwe.norms()  # todo: this stays untouched by decorrelation, right?
    # n x nbins
    d = dwe.data(decorrelate=True)
    # n x nbins
    e = dwe.err()

    # n x n x nbins
    # nom1[k, l, i] = n[k] * d[l, i]; nom2 is the same with k and l swapped.
    nom1 = np.einsum("k,li->kli", n, d)
    nom2 = np.transpose(nom1, (1, 0, 2))
    nominator = np.square(nom1 - nom2)

    # n x n x nbins
    # den1[k, l, i] = n[k] * e[l, i]; den2 is the same with k and l swapped.
    den1 = np.einsum("k,li->kli", n, e)
    den2 = np.transpose(den1, (1, 0, 2))
    denominator = np.square(den1) + np.square(den2)

    # n x n x nbins
    summand = nominator / denominator

    # n x n
    # Sum over the bin axis and normalize to the number of bins.
    chi2ndf = np.einsum("kli->kl", summand) / dwe.nbins

    if output == 'full':
        return chi2ndf
    return condense_distance_matrix(chi2ndf)