Source code for clusterking.cluster.cluster

#!/usr/bin/env python3

"""Read the results from scan.py and get_clusters them.
"""

# std
import time

# us
from clusterking.util.metadata import git_info, nested_dict
from clusterking.util.log import get_logger
from clusterking.data.data import Data


[docs]class Cluster(object):
    """ Abstract baseclass of the Cluster classes. This class is subclassed to
    implement specific clustering algorithms and defines common functions.
    """
[docs]    def __init__(self, data: Data):
        self.log = get_logger("Scanner")

        self.data = data
        self.clusters = None
        # self.bpoints = None

        #: Metadata
        self.md = nested_dict()

        self.md["git"] = git_info(self.log)
        self.md["time"] = time.strftime(
            "%a %_d %b %Y %H:%M", time.gmtime()
        )

[docs]    def cluster(self, **kwargs):
        """ Performs the clustering.
        This method is a wrapper around the _cluster implementation in the
        subclasses. See there for additional arguments.
        """
        self.log.info("Performing clustering.")

        self.md["cluster_args"] = kwargs

        self.clusters = self._cluster(**kwargs)

        n_clusters = len(set(self.clusters))
        self.log.info(
            "Clustering resulted in {} get_clusters.".format(n_clusters)
        )
        self.md["n_clusters"] = n_clusters

        self.log.info("Done")

    def _cluster(self, **kwargs):
        """ Implementation of the clustering. Should return an array-like object
        with the cluster number.
        """
        raise NotImplementedError

    # todo: overwrite argument?
[docs]    def write(self, cluster_column="cluster"):
        """ Write results back in the :py:class:`~clusterking.data.data.Data`
        object. """
        self.data.df[cluster_column] = self.clusters
        self.data.md["cluster"][cluster_column] = self.md
        self.data.rename_clusters(column=cluster_column)