Source code for clusterking.stability.subsamplestability

#!/usr/bin/env python3

# std
from typing import Iterable, Optional
import collections
import copy

# 3rd
import tqdm.auto
import pandas as pd

# ours
from clusterking.stability.stabilitytester import (
    AbstractStabilityTester,
    SimpleStabilityTesterResult,
)
from clusterking.data.data import Data
from clusterking.cluster import Cluster
from clusterking.benchmark import AbstractBenchmark


class SubSampleStabilityTesterResult(SimpleStabilityTesterResult):
    pass


class SubSampleStabilityTester(AbstractStabilityTester):
    """Test the stability of clustering algorithms by repeatedly clustering
    subsamples of the data.

    Example:

    .. code-block:: python

        ssst = SubSampleStabilityTester()
        ssst.set_sampling(frac=0.99)
        ssst.set_repeat(50)

        d = ck.Data(path)

        c = ck.cluster.HierarchyCluster()
        c.set_metric("euclidean")
        c.set_max_d(0.2)
        c.run(data=d).write()

        b = Benchmark()
        b.set_metric("euclidean")
        b.run(data=d).write()

        ssstr = ssst.run(data=d, cluster=c, benchmark=b)
    """

    def __init__(self):
        super().__init__()
        #: Keyword arguments for the subsampling, e.g. the fraction of sample
        #: points to be contained in the subsamples.
        #: Set using :meth:`set_sampling`.
        self._sample_kwargs = {}
        #: Number of subsamples to consider.
        #: Set using :meth:`set_repeat`.
        self._repeat = None
        #: Display a progress bar?
        self._progress_bar = True
        # Set default values:
        self.set_repeat()
        self.set_progress_bar()

    # **************************************************************************
    # Config
    # **************************************************************************

    def set_sampling(self, **kwargs) -> None:
        """Configure the subsampling of the data.

        If benchmarking is performed, it is ensured that none of the
        benchmark points of the original dataframe are removed during
        subsampling (so that the benchmarking results remain comparable).

        Args:
            **kwargs: Keyword arguments to
                :meth:`clusterking.data.Data.sample_param_random`, in
                particular keyword arguments to
                :meth:`pandas.DataFrame.sample`.

        Returns:
            None

        Example:

        .. code-block:: python

            ssst.set_sampling(n=100)     # Sample 100 points
            ssst.set_sampling(frac=0.9)  # Sample 90% of the points
        """
        self._sample_kwargs = kwargs

    def set_repeat(self, repeat=100) -> None:
        """Set the number of subsamples to test.

        Args:
            repeat: Number of subsamples to test

        Returns:
            None
        """
        self._repeat = repeat

    def set_progress_bar(self, state=True) -> None:
        """Set or unset progress bar.

        Args:
            state: Display progress bar?

        Returns:
            None
        """
        self._progress_bar = state

    # **************************************************************************
    # Run
    # **************************************************************************

    def run(
        self,
        data: Data,
        cluster: Cluster,
        benchmark: Optional[AbstractBenchmark] = None,
    ) -> SubSampleStabilityTesterResult:
        """Run the stability test.

        Args:
            data: :class:`~clusterking.data.Data` object
            cluster: Pre-configured :class:`~clusterking.cluster.Cluster`
                object
            benchmark: Optional
                :class:`~clusterking.benchmark.AbstractBenchmark` object

        Returns:
            :class:`SubSampleStabilityTesterResult` object
        """
        if not self._sample_kwargs:
            msg = (
                "You need to configure sampling with set_sampling before "
                "you can run this method."
            )
            raise ValueError(msg)
        # Cluster the full dataset once; it serves as the reference for the
        # figures of merit.
        original_data = data.copy(deep=True)
        cluster.run(original_data).write()
        if self._progress_bar:
            iterator = tqdm.auto.tqdm(range(self._repeat))
        else:
            iterator = range(self._repeat)
        fom_results = collections.defaultdict(list)
        sample_kwargs = copy.deepcopy(self._sample_kwargs)
        if benchmark is not None and "bpoints" not in self._sample_kwargs:
            # Keep the benchmark points in every subsample so that the
            # benchmarking results stay comparable.
            sample_kwargs["bpoints"] = True
        for _ in iterator:
            this_data = data.sample_param_random(**sample_kwargs)
            cluster.run(this_data).write()
            if benchmark is not None:
                benchmark.run(this_data).write()
            for fom_name, fom in self._foms.items():
                try:
                    fom = fom.run(original_data, this_data).fom
                except ValueError:
                    fom = -1
                fom_results[fom_name].append(fom)
        df = pd.DataFrame(fom_results)
        return SubSampleStabilityTesterResult(df=df)

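# Usage illustration only (not part of the original module): it assumes that
# figures of merit have already been registered on the tester (they are read
# from ``self._foms``) and reuses ``d``, ``c`` and ``b`` from the class
# docstring example above. The result stores one row per subsample and one
# column per figure of merit in its ``df`` attribute.
#
#     ssst = SubSampleStabilityTester()
#     ssst.set_sampling(frac=0.95)
#     ssst.set_repeat(20)
#     ssstr = ssst.run(data=d, cluster=c, benchmark=b)
#     print(ssstr.df.describe())  # spread of each FOM over the 20 subsamples

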
class SubSampleStabilityVsFractionResult(SimpleStabilityTesterResult):
    pass


class SubSampleStabilityVsFraction(object):
    """Repeatedly run :class:`SubSampleStabilityTester` for different
    sampling fractions.
    """

    def __init__(self):
        pass

    def run(
        self,
        data: Data,
        cluster: Cluster,
        ssst: SubSampleStabilityTester,
        fractions: Iterable[float],
    ):
        results = collections.defaultdict(list)
        ssst.set_progress_bar(False)
        for fract in tqdm.auto.tqdm(fractions):
            ssst.set_sampling(frac=fract)
            r = ssst.run(data, cluster)
            # Average each figure of merit over all subsamples taken with
            # this fraction.
            foms = r.df.mean().to_dict()
            results["fraction"].append(fract)
            for key, value in foms.items():
                results[key].append(value)
        df = pd.DataFrame(results)
        return SubSampleStabilityVsFractionResult(df=df)
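

# Usage illustration only (not part of the original module), assuming ``d``,
# ``c`` and ``ssst`` have been set up as in the SubSampleStabilityTester
# docstring; the plotting call is an assumption. The result dataframe holds
# one row per fraction, with a "fraction" column and one column per FOM mean.
#
#     ssvf = SubSampleStabilityVsFraction()
#     r = ssvf.run(d, c, ssst, fractions=[0.7, 0.8, 0.9, 0.95])
#     r.df.plot(x="fraction")  # how each FOM average changes with the fraction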