# Source code for clusterking.cluster.hierarchy_cluster

#!/usr/bin/env python3

# std
import pathlib
from typing import Union, Callable

# 3rd
import scipy.cluster
import matplotlib.pyplot as plt
import scipy.spatial

# ours
from clusterking.cluster.cluster import Cluster
from clusterking.util.metadata import failsafe_serialize
from clusterking.maths.metric import metric_selection


# todo: Function to save/load hierarchy?

# todo: document
class HierarchyCluster(Cluster):
    """Hierarchical clustering based on ``scipy.cluster.hierarchy``.

    The steps implemented by this class are:

    1. :meth:`set_metric` selects the function that turns the data into a
       condensed distance matrix.
    2. :meth:`build_hierarchy` feeds that distance matrix to
       ``scipy.cluster.hierarchy.linkage`` and stores the result in
       ``self.hierarchy``.
    3. :meth:`_cluster` cuts the hierarchy into flat clusters with
       ``scipy.cluster.hierarchy.fcluster`` (presumably invoked through the
       public clustering entry point of the ``Cluster`` base class, which
       is not defined in this module).

    :meth:`dendrogram` can be used to visualize ``self.hierarchy``.
    """

    def __init__(self, data):
        """Initialize the worker without a metric and without a hierarchy.

        Args:
            data: Object that is passed on to the ``Cluster`` base class and
                later handed to ``self.metric`` when building the hierarchy.
        """
        super().__init__(data)

        #: Result of ``scipy.cluster.hierarchy.linkage``; ``None`` until
        #: :meth:`build_hierarchy` has been run (or it is set manually).
        self.hierarchy = None

        #: Function that, applied to Data or DWE object returns the metric as
        #: a condensed distance matrix. ``None`` until :meth:`set_metric` has
        #: been run (or it is set manually).
        self.metric = None  # type: Union[None, Callable]

    # Docstring set below
    def set_metric(self, *args, **kwargs) -> None:
        # Record how the metric was chosen in the metadata before resolving
        # it, using a serializable representation of the arguments.
        self.md["metric"]["args"] = failsafe_serialize(args)
        self.md["metric"]["kwargs"] = failsafe_serialize(kwargs)
        self.metric = metric_selection(*args, **kwargs)

    # All arguments are forwarded verbatim to metric_selection, so its
    # documentation applies to this method as well.
    set_metric.__doc__ = metric_selection.__doc__

    def build_hierarchy(
        self, method="complete", optimal_ordering=False
    ) -> None:
        """Build the hierarchy object and store it in ``self.hierarchy``.

        Args:
            method: See reference on scipy.cluster.hierarchy.linkage
            optimal_ordering: See reference on scipy.cluster.hierarchy.linkage

        Returns:
            None

        Raises:
            ValueError: If ``self.metric`` has not been set.
        """
        if self.metric is None:
            msg = (
                "Metric not set. Please run self.set_metric or set "
                "self.metric manually before running this method."
            )
            self.log.critical(msg)
            raise ValueError(msg)

        self.log.debug("Building hierarchy.")

        # Keep track of the linkage options in the metadata.
        md = self.md["hierarchy"]
        md["method"] = method
        md["optimal_ordering"] = optimal_ordering

        self.hierarchy = scipy.cluster.hierarchy.linkage(
            self.metric(self.data),
            method=method,
            optimal_ordering=optimal_ordering,
        )

        self.log.debug("Done")

    def _cluster(self, max_d=0.2, **kwargs):
        """Performs the actual clustering by cutting the hierarchy.

        Args:
            max_d: Threshold that is passed as second positional argument
                to ``scipy.cluster.hierarchy.fcluster``. With the default
                ``criterion="distance"`` set below, see the reference on
                scipy.cluster.hierarchy.fcluster for its exact meaning.
            **kwargs: Additional keyword options to
                scipy.cluster.hierarchy.fcluster. These take precedence
                over the defaults set in this method.

        Returns:
            The return value of ``scipy.cluster.hierarchy.fcluster``.

        Raises:
            ValueError: If ``self.hierarchy`` has not been set.
        """
        if self.hierarchy is None:
            msg = (
                "Please run build_hierarchy first to set self.hierarchy or "
                "manually set HierarchyCluster.hierarchy."
            )
            self.log.critical(msg)
            raise ValueError(msg)

        # set up defaults for clustering here
        # (this way we can overwrite them with additional arguments)
        fcluster_config = {"criterion": "distance"}
        fcluster_config.update(kwargs)

        # noinspection PyTypeChecker
        clusters = scipy.cluster.hierarchy.fcluster(
            self.hierarchy, max_d, **fcluster_config
        )
        return clusters

    def dendrogram(
        self,
        output: Union[None, str, pathlib.Path] = None,
        ax=None,
        show=False,
        **kwargs
    ) -> Union[plt.Axes, None]:
        """Creates dendrogram

        Args:
            output: If supplied, we save the dendrogram there. Missing
                parent directories are created.
            ax: An axes object if you want to add the dendrogram to an
                existing axes rather than creating a new one
            show: If true, the dendrogram is shown in a viewer.
            **kwargs: Additional keyword options to
                scipy.cluster.hierarchy.dendrogram. These take precedence
                over the defaults set in this method.

        Returns:
            The matplotlib.pyplot.Axes object, or None if
            ``self.hierarchy`` has not been set up yet (in this case only
            an error is logged; no exception is raised).
        """
        self.log.debug("Plotting dendrogram.")

        if self.hierarchy is None:
            self.log.error(
                "Hierarchy not yet set up. Returning without "
                "doing anything."
            )
            return None

        # do we add to a plot or generate a whole new figure?
        if ax is not None:
            fig = ax.get_figure()
        else:
            fig, ax = plt.subplots()

        labelsize = 20
        ax.set_title("Hierarchical Clustering Dendrogram", fontsize=labelsize)
        ax.set_xlabel("ID", fontsize=labelsize)
        ax.set_ylabel("Distance", fontsize=labelsize)

        # set defaults for dendrogram plotting options here
        # (this way we can overwrite them with additional arguments)
        den_config = {
            "color_threshold": "default",
            "leaf_rotation": 90.0,  # rotates the x axis labels
            "leaf_font_size": 8,  # font size for the x axis labels
        }
        den_config.update(kwargs)

        scipy.cluster.hierarchy.dendrogram(
            self.hierarchy, ax=ax, **den_config
        )

        if show:
            fig.show()

        if output:
            output = pathlib.Path(output)
            if not output.parent.is_dir():
                self.log.debug("Creating dir '{}'.".format(output.parent))
                # exist_ok: do not fail if the directory was created by
                # someone else between the check above and this call.
                output.parent.mkdir(parents=True, exist_ok=True)
            fig.savefig(output, bbox_inches="tight")
            self.log.info("Wrote dendrogram to '{}'.".format(output))

        return ax