Source code for clusterking.benchmark.benchmark

#!/usr/bin/env python3

# std

# 3rd
import numpy as np
from typing import Callable

# ours
from clusterking.benchmark.abstract_benchmark import AbstractBenchmark
from clusterking.util.metadata import failsafe_serialize
from clusterking.maths.metric import uncondense_distance_matrix, \
    metric_selection


# todo: test this
[docs]class Benchmark(AbstractBenchmark): """ Selecting benchmarks based on a figure of merit that is calculated with the metric. You have to use :py:meth:`~clusterking.bpoints.Benchmark.set_metric` to specify the metric (as for the :py:class:`~clusterking.cluster.HierarchyCluster` class). The default case for the figure of merit ("sum") chooses the point as benchmark point that minimizes the sum of all distances to all other points in the same cluster (where "distance" of course is with respect to the metric). """
[docs] def __init__(self, data, cluster_column="cluster"): """ Args: data: :py:class:`~clusterking.data.data.Data` object cluster_column: Column name of the clusters """ super().__init__(data=data, cluster_column=cluster_column) self.metric = None self.fom = lambda x: np.sum(x, axis=1)
# Docstring set below
[docs] def set_metric(self, *args, **kwargs) -> None: self.md["metric"]["args"] = failsafe_serialize(args) self.md["metric"]["kwargs"] = failsafe_serialize(kwargs) self.metric = metric_selection(*args, **kwargs)
set_metric.__doc__ = metric_selection.__doc__
[docs] def set_fom(self, fct: Callable, *args, **kwargs) -> None: """ Set a figure of merit. The default case for the figure of merit ( "sum") chooses the point as benchmark point that minimizes the sum of all distances to all other points in the same cluster (where "distance" of course is with respect to the metric). In general we choose the point that minimizes ``self.fom(<metric>)``, i.e. the default case corresponds to ``self.fom = lambda x: np.sum(x, axis=1)``, which you could have also set by calling ``self.set_com(np.sum, axis=1)``. Args: fct: Function that takes the metric as first argument *args: Positional arguments that are added to the positional arguments of ``fct`` after the metric **kwargs: Keyword arguments for the function Returns: None """ self.fom = lambda metric: fct(metric, *args, **kwargs)
def _select_bpoints(self): if self.metric is None: self.log.error( "Metric not set. please run self.set_metric or set " "self.metric manually before running this method. " "Returning without doing anything." ) return result = np.full(self.data.n, False, bool) for cluster in set(self._clusters): # The indizes of all spoints that are in the current cluster indizes = np.squeeze(np.argwhere(self._clusters == cluster), axis=1) # A data object with only these spoints d_cut = type(self.data)( df=self.data.df.iloc[indizes], md=self.data.md ) m = self.fom(uncondense_distance_matrix(self.metric(d_cut))) # The index of the wpoint of the current cluster that has the lowest # sum of distances to all other elements in the same cluster index_minimal = indizes[np.argmin(m)] result[index_minimal] = True return result