Source code for clusterking.plots.plot_bundles

#!/usr/bin/env python3

# std
import logging
import random
from typing import List, Iterable, Union

# 3rd party
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd

import matplotlib.animation as animation

# ours
from clusterking.util.log import get_logger
from clusterking.plots.plot_histogram import plot_histogram
from clusterking.plots.colors import ColorScheme


def get_random_indizes(start: int, stop: int, n: int) -> List[int]:
    """ Generate a sorted list of n distinct (!) random integers.

    The integers are drawn without replacement from ``range(start, stop)``.
    If that range holds fewer than ``n`` values, a message is printed and
    all available values are returned instead (the function never blocks
    and never raises because of an exhausted range).

    Args:
        start: Minimum of index (start <= index)
        stop: Maximum of index (index < stop)
        n: Number of distinct random indizes to be generated

    Returns:
        Sorted list of ``n`` (or fewer, see above) different random indizes
    """
    # An empty or inverted range offers no candidates at all.
    available = max(stop - start, 0)
    if n > available:
        print(
            "Did not manage to generate enough different random "
            "integers (only {} of {}).".format(available, n)
        )
        n = available
    if n <= 0:
        return []
    # Sampling without replacement guarantees distinct values in one step,
    # in contrast to repeatedly drawing and discarding duplicates.
    return sorted(random.sample(range(start, stop), n))


class BundlePlot(object):
    """ Plotting class to plot distributions by cluster in order to analyse
    which distributions get assigned to which cluster.

    All public plotting methods draw on :attr:`ax`. If no axes object is
    handed to them, a new figure and axes are created.
    """

    def __init__(self, data):
        """

        Args:
            data: :py:class:`~clusterking.data.data.Data` object
        """

        #: logging.Logger object
        self.log = get_logger("BundlePlot", sh_level=logging.WARNING)

        #: Data object; this class uses its ``df`` (dataframe), ``bin_cols``
        #:  and ``clusters()`` members.
        self.data = data

        #: Name of the column holding the cluster number
        self.cluster_column = "cluster"

        #: Name of the column flagging benchmark points
        self.bpoint_column = "bpoint"

        #: Color scheme
        #  (needs self.cluster_column to be set, because of self._clusters)
        self.color_scheme = ColorScheme(self._clusters)

        #: Draw legend?
        self.draw_legend = True

        #: Override default titles with this title. If None, the default title
        #:  is used.
        self.title = None

        #: Instance of matplotlib.axes.Axes
        self.ax = None

    @property
    def fig(self):
        """ Instance of matplotlib.pyplot.figure (None if there is no axes
        object yet). """
        if self.ax is None:
            return None
        return self.ax.get_figure()

    # **************************************************************************
    # Internal helpers
    # **************************************************************************

    @property
    def _has_bpoints(self):
        """ Do we have benchmark points? """
        return self.bpoint_column in self.data.df.columns

    @property
    def _clusters(self):
        """ Return array of all distinct clusters. """
        return self.data.clusters(cluster_column=self.cluster_column)

    def _filter_clusters(self, clusters: Iterable[int]) -> List[int]:
        """ Return list of existing clusters only.

        Duplicates are dropped. If some of the requested clusters do not
        exist in the data, a warning is logged.

        Args:
            clusters: Requested clusters

        Returns:
            List of those requested clusters that exist in the data
        """
        # Query the data object only once instead of once per membership test
        existing = self._clusters
        selection = []
        removed = []
        for cluster in set(clusters):
            if cluster in existing:
                selection.append(cluster)
            else:
                removed.append(cluster)
        if removed:
            self.log.warning(
                "The cluster(s) {} does not exist in data, "
                "so I removed them.".format(
                    ", ".join(map(str, sorted(removed)))
                )
            )
        return selection

    def _interpret_cluster_input(self, clusters=None) -> List[int]:
        """ Flexible handling of user specifications for clusters.

        Args:
            clusters: Either None (all clusters), a single integer (just that
                cluster; numpy integers are accepted as well) or an iterable
                of clusters. An empty iterable selects all clusters.

        Returns:
            list of selected clusters
        """
        if clusters is None:
            clusters = self._clusters
        elif isinstance(clusters, (int, np.integer)):
            clusters = [clusters]
        else:
            # Materialize first: the truth value of e.g. a numpy array is
            # ambiguous and that of a generator is always True.
            clusters = list(clusters)
            if not clusters:
                clusters = self._clusters
        return self._filter_clusters(clusters)

    def _get_df_cluster(self, cluster: int, bpoint=None) -> pd.DataFrame:
        """ Return only the rows corresponding to one cluster in the
        dataframe and only the columns that correspond to the bins.

        Args:
            cluster: Name of the cluster
            bpoint: If True, return benchmark point, if False, return all non-
                benchmark points, if None, return everything.

        Returns:
            pandas.DataFrame as described above. If ``bpoint`` is True but
            the data has no benchmark point column, an empty dataframe is
            returned.

        Raises:
            ValueError: If ``bpoint`` is neither None, True nor False
        """
        bin_cols = self.data.bin_cols
        all_rows = self.data.df
        df = all_rows[all_rows[self.cluster_column] == cluster]
        if bpoint is None:
            return df[bin_cols]
        # Identity checks on purpose: values like 0 or 1 are not accepted.
        if bpoint is not True and bpoint is not False:
            raise ValueError("Invalid argument bpoint=={}".format(bpoint))
        if not self._has_bpoints:
            # Without benchmark column, every row is a non-benchmark point
            return pd.DataFrame() if bpoint else df[bin_cols]
        # Elementwise comparison of the column against the boolean
        return df[df[self.bpoint_column] == bpoint][bin_cols]

    def _set_ax(self, ax, title):
        """ Set up axes.

        Args:
            ax: Instance of matplotlib.axes.Axes to plot on or None, in which
                case a new figure and axes are created.
            title: Default title (overridden by self.title if that is set)

        Returns:
            None
        """
        if self.title is not None:
            title = self.title
        if ax is None:
            _, ax = plt.subplots()
        # Always remember the axes: all plotting methods draw on self.ax,
        # also if the axes object was supplied by the user.
        self.ax = ax
        ax.set_title(title)

    # **************************************************************************
    # Plots
    # **************************************************************************

    # --------------------------------------------------------------------------
    # Legend
    # --------------------------------------------------------------------------

    def _draw_legend(self, clusters=None):
        """ Add a legend with one colored patch per cluster to self.ax.

        Nothing is drawn if self.draw_legend is False or if at most one
        cluster is selected.

        Args:
            clusters: See self._interpret_cluster_input

        Returns:
            None
        """
        if not self.draw_legend:
            return
        clusters = self._interpret_cluster_input(clusters)
        if len(clusters) <= 1:
            return
        legend_elements = []
        for cluster in clusters:
            color = self.color_scheme.get_cluster_color(cluster)
            # pycharm can't seem to find patches:
            # noinspection PyUnresolvedReferences
            legend_elements.append(
                matplotlib.patches.Patch(
                    facecolor=color,
                    edgecolor=color,
                    label=cluster,
                )
            )
        self.ax.legend(
            handles=legend_elements,
            loc='best',
            title="Clusters",
            frameon=False
        )

    # --------------------------------------------------------------------------
    # Benchmark points + more lines
    # --------------------------------------------------------------------------

    def _plot_bundles(self, cluster: int, nlines=0, benchmark=True) -> None:
        """ Main implementation of self.plot_bundles (private method).
        This method will be called for each cluster in self.plot_bundles.

        Args:
            cluster: Number of cluster to be plotted
            nlines: Number of example distributions of the cluster to be
                plotted. If the cluster has fewer non-benchmark rows, a
                warning is logged and all of them are plotted.
            benchmark: Also plot the benchmark point(s) of the cluster (only
                has an effect if the data has a benchmark point column)

        Returns:
            None
        """

        df_cluster_no_bp = self._get_df_cluster(cluster, bpoint=False)
        n_available = len(df_cluster_no_bp)
        if n_available < nlines:
            self.log.warning(
                "Not enough rows for cluster {} "
                "Only plotting {} lines.".format(cluster, n_available)
            )
            nlines = n_available

        indizes = get_random_indizes(0, n_available, nlines)
        color = self.color_scheme.get_cluster_color(cluster)
        colors = self.color_scheme.get_cluster_colors_faded(cluster, nlines)
        if nlines == 1 and not benchmark:
            # Do not use faded out color if we just plot one line
            colors = [color]
        for i, index in enumerate(indizes):
            data = np.squeeze(df_cluster_no_bp.iloc[[index]].values)
            plot_histogram(
                self.ax,
                None,
                data,
                color=colors[i],
                linestyle="-"
            )
        if self._has_bpoints and benchmark:
            # Only look up the benchmark rows if they are actually drawn
            df_cluster_bp = self._get_df_cluster(cluster, bpoint=True)
            plot_histogram(
                self.ax,
                None,
                df_cluster_bp.values,
                color=color,
            )

    def plot_bundles(self, clusters: Union[int, Iterable[int]] = None, nlines=0,
                     ax=None, bpoints=True) -> None:
        """ Plot several examples of distributions for each cluster specified

        Args:
            clusters: List of clusters to selected or single cluster.
                If None (default), all clusters are chosen.
            nlines: Number of example distributions of each cluster to be
                plotted
            ax: Instance of matplotlib.axes.Axes to be plotted on. If None
                (default), a new axes object and figure is initialized and
                saved as self.ax and self.fig.
            bpoints: Draw benchmark curve

        Returns:
            None
        """
        clusters = self._interpret_cluster_input(clusters)

        title = ""
        if self._has_bpoints:
            title = "benchmark point(s) "
        # NOTE(review): this condition tests the cluster selection, not
        #  nlines, so "+ 0 sample point(s)" shows up for nlines == 0.
        #  Possibly `if nlines:` was intended -- confirm before changing.
        if clusters:
            title += "+ {} sample point(s) ".format(nlines)
        title += "for cluster(s) {}".format(
            ", ".join(map(str, sorted(clusters)))
        )
        self._set_ax(ax, title)

        # pycharm might be confused about the type of `clusters`:
        # noinspection PyTypeChecker
        for cluster in clusters:
            self._plot_bundles(cluster, nlines=nlines, benchmark=bpoints)

        self._draw_legend(clusters)

    def animate_bundle(self, cluster, n, benchmark=True):
        """ Animation that shows randomly picked distributions of one cluster
        one after another.

        A new figure is created and its axes are saved as self.ax.

        Args:
            cluster: Cluster to be animated
            n: Number of frames. For every frame one non-benchmark
                distribution of the cluster is picked at random (the same
                distribution can be picked several times).
            benchmark: Draw the benchmark point(s) as well. In this case the
                sample distributions are drawn with dashed lines.

        Returns:
            matplotlib.animation.ArtistAnimation object
        """
        # There seems to be some underlying magic here with fig
        fig = plt.figure()
        ax = fig.gca()
        self.ax = ax
        linestyle = "-"
        if benchmark:
            self._plot_bundles(cluster, 0, benchmark=True)
            linestyle = "--"
        ims = []
        df_cluster_no_bp = self._get_df_cluster(cluster, bpoint=False)
        color = self.color_scheme.get_cluster_color(cluster)
        for _ in range(n):
            index = random.randrange(0, len(df_cluster_no_bp))
            contents = np.squeeze(df_cluster_no_bp.iloc[[index]].values)
            # Repeat the last value, so that the step plot (where="post")
            # also draws the last bin with its full width.
            contents = np.append(contents, contents[-1])
            edges = np.arange(len(contents))

            ims.append(plt.step(
                edges,
                contents,
                where="post",
                color=color,
                linestyle=linestyle
            ))

        # self._set_ax(None, "Animated sample points")
        anim = animation.ArtistAnimation(
            fig,
            ims,
            interval=500,
            repeat_delay=3000,
            blit=True
        )
        # In order to display this in the notebook, use
        # from IPython.display import HTML
        # HTML(anim.to_html5_video())
        return anim

    # --------------------------------------------------------------------------
    # Minima/Maxima of bin content for each cluster
    # --------------------------------------------------------------------------

    def _plot_minmax(self, cluster: int, bpoints=True) -> None:
        """ Main implementation of self.plot_minmax.
        This method will be called for each cluster in self.plot_minmax.

        Args:
            cluster: Name of cluster to be plotted
            bpoints: Plot reference


        Returns:
            None
        """
        df_cluster = self._get_df_cluster(cluster)
        maxima = list(df_cluster.max().values)
        minima = list(df_cluster.min().values)

        # Bin i is drawn between x == i and x == i + 1
        bin_numbers = np.arange(len(self.data.bin_cols) + 1)

        color = self.color_scheme.get_cluster_color(cluster)
        for i, (minimum, maximum) in enumerate(zip(minima, maxima)):
            self.ax.fill_between(
                bin_numbers[i:i+2],
                [minimum, minimum],
                [maximum, maximum],
                facecolor=color,
                interpolate=False,
                alpha=0.3,
                hatch="////",
                color=color
            )
        if bpoints:
            self._plot_bundles(cluster, nlines=0)

    def plot_minmax(self, clusters: Union[int, Iterable[int]] = None,
                    ax=None, bpoints=True) -> None:
        """ Plot the minimum and maximum of each bin for the specified
        clusters.

        Args:
            clusters:  List of clusters to selected or single cluster.
                If None (default), all clusters are chosen.
            ax: Instance of ``matplotlib.axes.Axes`` to plot on. If None, a new
                one is instantiated.
            bpoints: Plot reference

        Returns:
            None
        """
        clusters = self._interpret_cluster_input(clusters)

        title = "Minima and maxima of the bin contents for cluster(s)" \
                " {}".format(', '.join(map(str, sorted(clusters))))
        self._set_ax(ax, title)

        # pycharm might be confused about the type of `clusters`:
        # noinspection PyTypeChecker
        for cluster in clusters:
            self._plot_minmax(cluster, bpoints=bpoints)

        self._draw_legend(clusters)

    # --------------------------------------------------------------------------
    # Box plots
    # --------------------------------------------------------------------------

    def _box_plot(self, cluster, whiskers=1.5, bpoints=True) -> None:
        """ Main implementation of self.box_plot.
        Gets called for every cluster specified in self.box_plot.

        Args:
            cluster: Name of cluster to be plotted
            whiskers: Length of the whiskers of the box plot.
                See self.box_plot for more information.
                Default: 1.5 (matplotlib default)
            bpoints: Draw benchmarks?

        Returns:
            None
        """
        df_cluster = self._get_df_cluster(cluster)
        data = df_cluster.values

        color = self.color_scheme.get_cluster_color(cluster)

        # One box per column of data, centered in the respective bin
        positions = np.arange(len(data.T)) + 0.5

        self.ax.boxplot(
            data,
            notch=False,
            positions=positions,
            vert=True,
            patch_artist=True,
            boxprops=dict(facecolor=color, color=color, alpha=0.3),
            capprops=dict(color=color),
            whiskerprops=dict(color=color),
            flierprops=dict(color=color, markeredgecolor=color),
            medianprops=dict(color=color),
            whis=whiskers  # extend the range of the whiskers
        )
        if bpoints:
            self._plot_bundles(cluster, nlines=0)

    def box_plot(self, clusters: Union[int, Iterable[int]] = None, ax=None,
                 whiskers=2.5, bpoints=True) -> None:
        """ Box plot of the bin contents of the distributions corresponding
        to selected clusters.

        Args:
            clusters:  List of clusters to selected or single cluster.
                If None (default), all clusters are chosen.
            ax: Instance of matplotlib.axes.Axes to plot on. If None, a new
                one is instantiated.
            whiskers: Length of the whiskers of the box plot in units of IQR
                (interquartile range, containing 50% of all values). Default
                2.5.
            bpoints: Draw benchmarks?

        Returns:
            None
        """
        clusters = self._interpret_cluster_input(clusters)
        title = "Box plot of the bin contents for cluster(s) {}\n" \
                "Whisker length set to {}*IQR".format(
                    ", ".join(map(str, sorted(clusters))),
                    whiskers
                )
        self._set_ax(ax, title)
        # pycharm might be confused about the type of `clusters`:
        # noinspection PyTypeChecker
        for cluster in clusters:
            self._box_plot(cluster, whiskers=whiskers, bpoints=bpoints)

        self._draw_legend(clusters)