#!/usr/bin/env python3
# std
from typing import Iterable, Optional
import collections
import copy
# 3rd
import tqdm.auto
import pandas as pd
# ours
from clusterking.stability.stabilitytester import (
AbstractStabilityTester,
SimpleStabilityTesterResult,
)
from clusterking.data.data import Data
from clusterking.cluster import Cluster
from clusterking.benchmark import AbstractBenchmark
class SubSampleStabilityTesterResult(SimpleStabilityTesterResult):
    """Result object returned by :meth:`SubSampleStabilityTester.run`.

    Inherits everything from :class:`SimpleStabilityTesterResult`; exists
    as a distinct type so callers can distinguish the origin of the result.
    """

    pass
class SubSampleStabilityTester(AbstractStabilityTester):
    """Test the stability of clustering algorithms by repeatedly
    clustering subsamples of data.

    Example:

    .. code-block:: python

        ssst = SubSampleStabilityTester()
        ssst.set_sampling(frac=0.99)
        ssst.set_repeat(50)

        d = ck.Data(path)

        c = ck.cluster.HierarchyCluster()
        c.set_metric("euclidean")
        c.set_max_d(0.2)
        c.run(data=d).write()

        b = Benchmark()
        b.set_metric("euclidean")
        b.run(data=d).write()

        ssstr = ssst.run(data=d, cluster=c, benchmark=b)
    """

    def __init__(self):
        super().__init__()
        #: Keyword arguments controlling the subsampling.
        #: Set using :meth:`set_sampling`.
        self._sample_kwargs = {}
        #: Number of subsamples to consider.
        #: Set using :meth:`set_repeat`.
        self._repeat = None
        #: Display a progress bar?
        #: Set using :meth:`set_progress_bar`.
        self._progress_bar = True
        # Set default values:
        self.set_repeat()
        self.set_progress_bar()

    # **************************************************************************
    # Config
    # **************************************************************************

    def set_sampling(self, **kwargs) -> None:
        """Configure the subsampling of the data. If performing
        benchmarking, it is ensured that none of the benchmark points of the
        original dataframe are removed during subsampling (to allow to
        compare the benchmarking results).

        Args:
            **kwargs: Keyword arguments to
                :meth:`clusterking.data.Data.sample_param_random`, in particular
                keyword arguments to :meth:`pandas.DataFrame.sample`.

        Returns:
            None

        Example:

        .. code-block:: python

            ssst.set_sampling(n=100)     # Sample 100 points
            ssst.set_sampling(frac=0.9)  # Sample 90% of the points
        """
        self._sample_kwargs = kwargs

    def set_repeat(self, repeat: int = 100) -> None:
        """
        Args:
            repeat: Number of subsamples to test

        Returns:
            None
        """
        self._repeat = repeat

    def set_progress_bar(self, state: bool = True) -> None:
        """Set or unset progress bar.

        Args:
            state: Bool: Display progress bar?

        Returns:
            None
        """
        self._progress_bar = state

    # **************************************************************************
    # Run
    # **************************************************************************

    def run(
        self,
        data: Data,
        cluster: Cluster,
        benchmark: Optional[AbstractBenchmark] = None,
    ) -> SubSampleStabilityTesterResult:
        """Run test.

        Args:
            data: :class:`~clusterking.data.Data` object
            cluster: Pre-configured :class:`~clusterking.cluster.Cluster`
                object
            benchmark: Optional:
                :class:`~clusterking.benchmark.AbstractBenchmark` object

        Returns:
            :class:`SubSampleStabilityTesterResult` object
        """
        if not self._sample_kwargs:
            msg = (
                "You need to configure sampling with set_sampling before "
                "you can run this method."
            )
            raise ValueError(msg)
        # Cluster the full data set once; every subsample clustering is
        # compared against this reference via the configured FOMs.
        original_data = data.copy(deep=True)
        cluster.run(original_data).write()
        if self._progress_bar:
            iterator = tqdm.auto.tqdm(range(self._repeat))
        else:
            iterator = range(self._repeat)
        fom_results = collections.defaultdict(list)
        # Deep-copy so that injecting 'bpoints' below does not mutate the
        # user-supplied configuration.
        sample_kwargs = copy.deepcopy(self._sample_kwargs)
        if benchmark is not None and "bpoints" not in self._sample_kwargs:
            # Keep benchmark points in every subsample so benchmark results
            # remain comparable to the original data set.
            sample_kwargs["bpoints"] = True
        for _ in iterator:
            this_data = data.sample_param_random(**sample_kwargs)
            cluster.run(this_data).write()
            if benchmark is not None:
                benchmark.run(this_data).write()
            for fom_name, fom in self._foms.items():
                try:
                    fom_value = fom.run(original_data, this_data).fom
                except ValueError:
                    # FOM could not be evaluated for this subsample; record a
                    # sentinel value instead of aborting the whole scan.
                    fom_value = -1
                fom_results[fom_name].append(fom_value)
        df = pd.DataFrame(fom_results)
        return SubSampleStabilityTesterResult(df=df)
class SubSampleStabilityVsFractionResult(SimpleStabilityTesterResult):
    """Result object returned by :meth:`SubSampleStabilityVsFraction.run`.

    Behaves exactly like :class:`SimpleStabilityTesterResult`; the subclass
    only marks which scan produced the result.
    """

    pass
class SubSampleStabilityVsFraction(object):
    """Repeatedly run :class:`SubSampleStabilityTester` for different
    fractions.
    """

    def __init__(self):
        pass

    def run(
        self,
        data: Data,
        cluster: Cluster,
        ssst: SubSampleStabilityTester,
        fractions: Iterable[float],
    ) -> SubSampleStabilityVsFractionResult:
        """Run the subsample stability test for each sampling fraction and
        collect the mean figure-of-merit values per fraction.

        Args:
            data: :class:`~clusterking.data.Data` object
            cluster: Pre-configured :class:`~clusterking.cluster.Cluster`
                object
            ssst: Pre-configured :class:`SubSampleStabilityTester` object
                (its sampling fraction is overwritten for each scan point)
            fractions: Iterable of sampling fractions to test

        Returns:
            :class:`SubSampleStabilityVsFractionResult` object with one row
            per fraction and one column per FOM (plus a ``fraction`` column).
        """
        results = collections.defaultdict(list)
        # Disable the tester's per-run progress bar (we display one bar over
        # the fractions instead) and restore the caller's setting afterwards.
        previous_progress_bar = ssst._progress_bar
        ssst.set_progress_bar(False)
        try:
            for fraction in tqdm.auto.tqdm(fractions):
                ssst.set_sampling(frac=fraction)
                result = ssst.run(data, cluster)
                # Average each FOM over all subsample repetitions.
                mean_foms = result.df.mean().to_dict()
                results["fraction"].append(fraction)
                for fom_name, fom_value in mean_foms.items():
                    results[fom_name].append(fom_value)
        finally:
            ssst.set_progress_bar(previous_progress_bar)
        df = pd.DataFrame(results)
        return SubSampleStabilityVsFractionResult(df=df)