
#!/usr/bin/env python3

""" Scans the NP parameter space in a grid and also q2, producing the
normalized q2 distribution. """

# std
import functools
import multiprocessing
import os
import time
from typing import Callable, Sized, Dict, Iterable, Optional, List
import itertools

# 3rd party
import numpy as np
import pandas as pd
import tqdm.auto

# ours
from clusterking.worker import DataWorker
from clusterking.data.data import Data
import clusterking.maths.binning
from clusterking.util.metadata import (
    version_info,
    failsafe_serialize,
    nested_dict,
)
from clusterking.util.log import get_logger
from clusterking.result import DataResult


class SpointCalculator(object):
    """A class that holds the function with which we calculate each
    point in sample space. Note that this has to be a separate class from
    Scanner to avoid problems related to multiprocessing's use of the pickle
    library, which are described here:
    https://stackoverflow.com/questions/1412787/
    """

    def __init__(self):
        # All of these have to be set!
        #: Function to run
        self.func = None
        #: If the function is to be integrated over bins, this should be an
        #: array of the bin edges; if the function is to be sampled, an array
        #: of the sample points.
        self.binning = None
        #: Binning mode: either 'integrate' or 'sample'
        self.binning_mode = "integrate"
        #: Normalize the resulting distribution if a binning is specified
        self.normalize = False
        #: Additional keyword arguments passed on to the function
        self.kwargs = {}

    # noinspection PyMethodMayBeStatic
    def _prepare_spoint(self, spoint):
        """Hook that allows subclasses to transform a sample point before the
        function is evaluated. The default implementation returns the point
        unchanged."""
        return spoint

    def calc(self, spoint) -> np.ndarray:
        """Calculates the function at one point in wilson space.

        Args:
            spoint: Wilson coefficients

        Returns:
            np.ndarray of the integration or sampling results
        """

        spoint = self._prepare_spoint(spoint)
        if self.binning is not None:
            if self.binning_mode == "integrate":
                return clusterking.maths.binning.bin_function(
                    functools.partial(self.func, spoint, **self.kwargs),
                    self.binning,
                    normalize=self.normalize,
                )
            elif self.binning_mode == "sample":
                func = functools.partial(self.func, spoint, **self.kwargs)
                res = np.array(list(map(func, self.binning)))
                if self.normalize:
                    res /= sum(res)
                print("results", res)
                return res
        else:
            return self.func(spoint, **self.kwargs)

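# A minimal usage sketch for SpointCalculator (illustrative only; in normal
# use, Scanner configures this object through set_dfunction, and the function
# below is made up for the example):
#
#     calc = SpointCalculator()
#     calc.func = lambda spoint, x: sum(spoint) * x
#     calc.binning = np.linspace(0, 1, 5)  # sample points
#     calc.binning_mode = "sample"
#     calc.normalize = True
#     calc.calc([0.5, 1.0])  # normalized np.ndarray of length 5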

# todo: also allow to disable multiprocessing if there are problems.
class Scanner(DataWorker):
    """This class is set up with a function (specified in
    :meth:`.set_dfunction`) that depends on points in parameter space and a
    set of sample points in this parameter space (specified via one of the
    ``set_spoints_...`` methods). The function is then run for every sample
    point (in the :meth:`.run` method) and the results are written to a
    :class:`~clusterking.data.Data`-like object.

    Usage example:

    .. code-block:: python

        import clusterking as ck
        import numpy as np

        def myfunction(parameters, x):
            return sum(parameters) * x

        # Initialize Scanner class
        s = ck.scan.Scanner()

        # Set the function and sample it at 10 points in x
        s.set_dfunction(myfunction, sampling=np.linspace(0, 1, 10))

        # Set the sample points
        s.set_spoints_equidist({
            "a": (-1, 1, 10),
            "b": (-1, 1, 10)
        })

        # Initialize a Data class to write to:
        d = ck.data.Data()

        # Run it
        r = s.run(d)

        # Write back results to data
        r.write()
    """

    # **************************************************************************
    # Constructor
    # **************************************************************************
    def __init__(self):
        """Initializes the :class:`clusterking.scan.Scanner` class."""
        super().__init__()
        # todo: move
        self.log = get_logger("Scanner")

        #: Points in wilson space
        #: Use self.spoints to access this
        self._spoints = None  # type: Optional[np.ndarray]

        #: Instance of SpointCalculator to perform the calculations of
        #: the wilson space points.
        self._spoint_calculator = SpointCalculator()

        # todo: move
        self.md = nested_dict()
        self.md["git"] = version_info(self.log)
        self.md["time"] = time.strftime("%a %d %b %Y %H:%M", time.gmtime())

        # todo: shouldn't that be in metadata?
        #: Names of the parameters
        self._coeffs = []  # type: List[str]

        self._no_workers = None  # type: Optional[int]

        self._progress_bar = True
        self._tqdm_kwargs = {}

        self.set_imaginary_prefix("im_")
    # **************************************************************************
    # Convenience properties
    # **************************************************************************

    @property
    def imaginary_prefix(self) -> str:
        """Prefix for the name of imaginary parts of coefficients. Also see
        e.g. :meth:`.set_spoints_equidist`. Read only.
        """
        return self.md["imaginary_prefix"]

    @property
    def spoints(self):
        """Points in parameter space that are sampled (read-only)."""
        return self._spoints

    @property
    def coeffs(self):
        """The names of the parameters/coefficients/dimensions of the spoints
        (read only). Set after the spoints are set. Does **not** include the
        names of the columns of the imaginary parts.
        """
        return self._coeffs.copy()

    # **************************************************************************
    # Settings
    # **************************************************************************
    def set_progress_bar(self, show: bool, **kwargs) -> None:
        """Settings for the progress bar.

        Args:
            show: Show progress bar?
            **kwargs: Keyword arguments for the tqdm progress bar

        Returns:
            None
        """
        self._progress_bar = show
        self._tqdm_kwargs = kwargs
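    # Example (sketch): disable the bar, or restyle it with tqdm keywords:
    #
    #     s.set_progress_bar(False)
    #     s.set_progress_bar(True, desc="My scan: ")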
    def set_dfunction(
        self,
        func: Callable,
        binning: Optional[Sized] = None,
        sampling: Optional[Sized] = None,
        normalize=False,
        xvar="xvar",
        yvar="yvar",
        **kwargs
    ):
        """Set the function that generates the distributions that are later
        clustered (e.g. a differential cross section).

        Args:
            func: A function that takes the point in parameter space as the
                first argument (**Note**: The parameters are given in
                alphabetical order with respect to the parameter name!).
                It should either return a ``float`` or a ``np.ndarray``.
                If the ``binning`` or ``sampling`` options are specified, only
                ``float`` s as return values are allowed.
            binning: If this parameter is set to an array-like object, we will
                integrate the function over the specified bins for every point
                in parameter space.
            sampling: If this parameter is set to an array-like object, we
                will apply the function to these points for every point in
                parameter space.
            normalize: If a binning is specified, normalize the resulting
                distribution.
            xvar: Name of variable on x-axis
            yvar: Name of variable on y-axis
            **kwargs: All other keyword arguments are passed to the function.

        Returns:
            None
        """
        if normalize and binning is None and sampling is None:
            raise ValueError(
                "The setting normalize=True only makes sense if a binning or "
                "sampling is specified."
            )
        if binning is not None and sampling is not None:
            raise ValueError("Please specify EITHER sampling OR binning.")

        # The block below just wants to put some information about the
        # function in the metadata. Can be ignored if you're only interested
        # in what's happening.
        md = self.md["dfunction"]
        try:
            md["name"] = func.__name__
            md["doc"] = func.__doc__
        except AttributeError:
            try:
                # For functools.partial objects
                # noinspection PyUnresolvedReferences
                md["name"] = "functools.partial({})".format(func.func.__name__)
                # noinspection PyUnresolvedReferences
                md["doc"] = func.func.__doc__
            except AttributeError:
                pass
        md["kwargs"] = failsafe_serialize(kwargs)
        md["binning"] = binning

        # This is the important part: We set all required attributes of the
        # spoint calculator.
        self._spoint_calculator.func = func
        if binning is not None:
            self._spoint_calculator.binning = binning
            self._spoint_calculator.binning_mode = "integrate"
            md["binning"] = list(binning)
            md["binning_mode"] = "integrate"
            md["nbins"] = len(binning) - 1
        elif sampling is not None:
            self._spoint_calculator.binning = sampling
            self._spoint_calculator.binning_mode = "sample"
            md["binning"] = list(sampling)
            md["binning_mode"] = "sample"
            md["nbins"] = len(sampling)
        md["xvar"] = xvar
        md["yvar"] = yvar
        self._spoint_calculator.normalize = normalize
        self._spoint_calculator.kwargs = kwargs
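    # Illustration of the two modes (sketch; ``dsigma`` is made up):
    #
    #     def dsigma(spoint, x):
    #         return sum(spoint) * x ** 2
    #
    #     s = Scanner()
    #     # Integrate dsigma over 9 bins with edges 0, 1, ..., 9:
    #     s.set_dfunction(dsigma, binning=range(10), normalize=True)
    #     # ... or evaluate dsigma at 10 sample points instead:
    #     # s.set_dfunction(dsigma, sampling=np.linspace(0, 9, 10))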
    def set_spoints_grid(self, values: Dict[str, Iterable[float]]) -> None:
        """Set a grid of points in sampling space.

        Args:
            values: A dictionary of the following form:

                .. code-block:: python

                    {
                        <coeff name>: [
                            value_1,
                            ...,
                            value_n
                        ]
                    }

                where ``value_1``, ..., ``value_n`` can be complex numbers in
                general.
        """
        # IMPORTANT to keep this order!
        self._coeffs = sorted(list(values.keys()))
        # Now we collect all lists of values.
        values_lists = [values[coeff] for coeff in self._coeffs]
        # Now we build the cartesian product, i.e.
        # [a1, a2, ...] x [b1, b2, ...] x ... x [z1, z2, ...] =
        # [(a1, b1, ..., z1), (a1, b1, ..., z2), ...]
        self._spoints = np.array(list(itertools.product(*values_lists)))
        self.md["spoints"]["grid"] = failsafe_serialize(values)
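    # Example (sketch): the grid below yields the 4 points
    # (-1, 0.5), (-1, 1.0), (1, 0.5), (1, 1.0); the coefficients are ordered
    # alphabetically by name (a before b), regardless of dictionary order:
    #
    #     s.set_spoints_grid({"b": [0.5, 1.0], "a": [-1, 1]})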
    def set_spoints_equidist(self, ranges: Dict[str, tuple]) -> None:
        """Set a list of 'equidistant' points in sampling space.

        Args:
            ranges: A dictionary of the following form:

                .. code-block:: python

                    {
                        <coeff name>: (
                            <Minimum of coeff>,
                            <Maximum of coeff>,
                            <Number of sample points between min and max>,
                        )
                    }

        .. note::
            In order to add imaginary parts to your coefficients, prepend
            their name with ``im_`` (you can customize this prefix by setting
            the :attr:`.imaginary_prefix` attribute to a custom value.)

            Example:

            .. code-block:: python

                s = Scanner()
                s.set_spoints_equidist(
                    {
                        "a": (-2, 2, 4),
                        "im_a": (-1, 1, 10),
                    },
                    ...
                )

            Will sample the real part of ``a`` in 4 points between -2 and 2
            and the imaginary part of ``a`` in 10 points between -1 and 1.

        Returns:
            None
        """
        # Because of our hack with the imaginary prefix, let's first see
        # which coefficients we really have

        def is_imaginary(name: str) -> bool:
            return name.startswith(self.imaginary_prefix)

        def real_part(name: str) -> str:
            if is_imaginary(name):
                return name.replace(self.imaginary_prefix, "", 1)
            else:
                return name

        def imaginary_part(name: str) -> str:
            if not is_imaginary(name):
                return self.imaginary_prefix + name
            else:
                return name

        coeffs = list(set(map(real_part, ranges.keys())))

        grid_config = {}
        for coeff in coeffs:
            # Now let's always collect the values of the real part and of
            # the imaginary part
            res = [0.0]
            ims = [0.0]
            is_complex = False
            if real_part(coeff) in ranges:
                res = list(np.linspace(*ranges[real_part(coeff)]))
            if imaginary_part(coeff) in ranges:
                ims = list(np.linspace(*ranges[imaginary_part(coeff)]))
                is_complex = True
            # And basically take their cartesian product, i.e. initialize
            # the complex numbers.
            if is_complex:
                grid_config[coeff] = [complex(x, y) for x in res for y in ims]
            else:
                grid_config[coeff] = res

        self.set_spoints_grid(grid_config)
        # Make sure to do this after set_spoints_grid, so we overwrite
        # the relevant parts.
        md = self.md["spoints"]
        md["sampling"] = "equidistant"
        md["ranges"] = ranges
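    # Example (sketch): combining real and imaginary ranges for ``a``:
    #
    #     s.set_spoints_equidist({"a": (-2, 2, 4), "im_a": (-1, 1, 10)})
    #     # internally becomes set_spoints_grid({"a": [complex(x, y)
    #     # for x in np.linspace(-2, 2, 4) for y in np.linspace(-1, 1, 10)]}),
    #     # i.e. 4 * 10 = 40 complex grid values for ``a``.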
# todo: Apply to only one dimension?
    def add_spoints_noise(self, generator="gauss", **kwargs) -> None:
        """Add noise to existing sample points.

        Args:
            generator: Random number generator. Default is ``gauss``.
                Currently supported: ``gauss``.
            **kwargs: Additional keywords to configure the generator. These
                keywords are as follows (value assignments are the default
                values):

                * ``gauss``: ``mean = 0``, ``sigma = 1``
        """
        if self.spoints is None:
            raise ValueError(
                "This method can only be applied after spoints"
                " have been set."
            )
        if generator == "gauss":
            gauss_kwargs = {"mean": 0.0, "sigma": 1.0}
            gauss_kwargs.update(kwargs)
            rand = np.random.normal(
                loc=gauss_kwargs["mean"],
                scale=gauss_kwargs["sigma"],
                size=self.spoints.shape,
            )
        else:
            raise ValueError("Unknown generator {}.".format(generator))
        if "noise" not in self.md:
            self.md["noise"] = []
        self.md["noise"].append({"generator": generator, "kwargs": kwargs})
        self._spoints += rand
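    # Example (sketch): smear an equidistant grid with small Gaussian noise:
    #
    #     s.set_spoints_equidist({"a": (-1, 1, 10)})
    #     s.add_spoints_noise("gauss", mean=0.0, sigma=0.01)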
    def set_no_workers(self, no_workers: int) -> None:
        """Set the number of worker processes to be used. This will usually
        translate to the number of CPUs being used.

        Args:
            no_workers: Number of worker processes

        Returns:
            None
        """
        self._no_workers = no_workers
    def set_imaginary_prefix(self, value: str) -> None:
        """Set the prefix to be used for imaginary parameters in
        :meth:`set_spoints_grid` and :meth:`set_spoints_equidist`.

        Args:
            value: Prefix string

        Returns:
            None
        """
        self.md["imaginary_prefix"] = value
    # **************************************************************************
    # Run
    # **************************************************************************
    def run(self, data: Data) -> Optional["ScannerResult"]:
        """Calculate all sample points and write the results to a dataframe.

        Args:
            data: Data object.

        Returns:
            :class:`ScannerResult` or None

        .. warning::
            The function set in :meth:`set_dfunction` has to be a globally
            defined function in order to do multiprocessing, else you will
            probably run into the error ``Can't pickle local object ...``
            that is issued by the python multiprocessing module. If you run
            into any problems like this, you can always run in single core
            mode by specifying ``no_workers=1``.
        """
        # todo: rather raise exceptions?
        if self._spoints is None or not self._spoints.any():
            self.log.error(
                "No sample points specified. Returning without doing "
                "anything."
            )
            return
        if self._spoint_calculator.func is None:
            self.log.error(
                "No function specified. Please set it "
                "using ``Scanner.set_dfunction``. Returning without doing "
                "anything."
            )
            return

        no_workers = self._no_workers
        if not self._no_workers:
            no_workers = os.cpu_count()
        if not no_workers:
            # os.cpu_count() didn't work
            self.log.warning(
                "os.cpu_count() could not determine the number of cores. "
                "Falling back to single core mode."
            )
            no_workers = 1

        start_time = time.time()
        if no_workers >= 2:
            rows = self._run_multicore(no_workers)
        else:
            rows = self._run_singlecore()
        end_time = time.time()
        run_time = end_time - start_time
        self.md["run_time"] = run_time

        return ScannerResult(
            data=data,
            rows=rows,
            spoints=self._spoints,
            md=self.md,
            coeffs=self._coeffs,
        )
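    # Example (sketch): force single core mode to sidestep pickling issues
    # with locally defined functions:
    #
    #     s.set_no_workers(1)
    #     r = s.run(d)  # d is a clusterking.data.Data instance
    #     r.write()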
    # todo: shouldn't this rather return numpy arrays than lists?
    def _run_multicore(self, no_workers: int) -> List[List[float]]:
        """Calculate spoints in parallel processing mode.

        Args:
            no_workers: Number of workers.

        Returns:
            Rows of the dataframe.
        """
        # Pool of worker nodes
        pool = multiprocessing.Pool(processes=no_workers)
        # This is the worker function.
        worker = self._spoint_calculator.calc
        results = pool.imap(worker, self._spoints)
        # Close the queue for new jobs
        pool.close()
        self.log.info(
            "Started queue with {} job(s) distributed over up to {} "
            "core(s)/worker(s).".format(len(self._spoints), no_workers)
        )
        rows = []
        if self._progress_bar:
            tqdm_kwargs = dict(
                desc="Scanning: ", unit=" spoint", total=len(self._spoints)
            )
            tqdm_kwargs.update(self._tqdm_kwargs)
            iterator = tqdm.auto.tqdm(enumerate(results), **tqdm_kwargs)
        else:
            iterator = enumerate(results)
        for index, result in iterator:
            md = self.md["dfunction"]
            if not isinstance(result, Iterable):
                result = [result]
            if "nbins" not in md:
                md["nbins"] = len(result)
            rows.append([*self._spoints[index], *result])
        # Wait for completion of all jobs here
        pool.join()
        return rows

    # todo: shouldn't this rather return numpy arrays than lists?
    def _run_singlecore(self) -> List[List[float]]:
        """Calculate spoints in single core processing mode. This is
        sometimes useful because multiprocessing has its quirks.

        Returns:
            Rows of the dataframe.
        """
        self.log.info(
            "Started queue with {} job(s) in single core mode.".format(
                len(self._spoints)
            )
        )
        rows = []
        for index, spoint in tqdm.auto.tqdm(
            enumerate(self._spoints),
            desc="Scanning: ",
            unit=" spoint",
            total=len(self._spoints),
        ):
            result = self._spoint_calculator.calc(spoint)
            md = self.md["dfunction"]
            if not isinstance(result, Iterable):
                result = [result]
            if "nbins" not in md:
                md["nbins"] = len(result)
            rows.append([*self._spoints[index], *result])
        return rows
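# The multiprocessing pattern used in _run_multicore, in isolation (sketch;
# the function and numbers are made up for illustration):
#
#     import multiprocessing
#
#     def square(x):  # must be defined at module level to be picklable
#         return x * x
#
#     if __name__ == "__main__":
#         with multiprocessing.Pool(processes=2) as pool:
#             results = list(pool.imap(square, range(10)))
#         # imap preserves the input order, so results[i] == i * i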
class ScannerResult(DataResult):
    def __init__(
        self, data: Data, rows: List[List[float]], spoints, md, coeffs
    ):
        super().__init__(data=data)
        self._rows = rows
        self._spoints = spoints
        self.md = md  # type: nested_dict
        self._coeffs = coeffs
    # **************************************************************************
    # Convenience properties
    # **************************************************************************

    @property
    def imaginary_prefix(self) -> str:
        """Prefix for the name of imaginary parts of coefficients. Also see
        e.g. :meth:`.set_spoints_equidist`. Read only.
        """
        return self.md["imaginary_prefix"]

    @property
    def spoints(self):
        """Points in parameter space that are sampled (read-only)."""
        return self._spoints

    @property
    def coeffs(self):
        """The names of the parameters/coefficients/dimensions of the spoints
        (read only). Set after the spoints are set. Does **not** include the
        names of the columns of the imaginary parts.
        """
        return self._coeffs.copy()

    # **************************************************************************
    # Write
    # **************************************************************************
    def write(self) -> None:
        self.log.debug("Converting data to pandas dataframe.")
        cols = self.coeffs
        cols.extend(
            [
                "bin{}".format(no_bin)
                for no_bin in range(self.md["dfunction"]["nbins"])
            ]
        )
        # Now we finally write everything to data
        self._data.df = pd.DataFrame(data=self._rows, columns=cols)
        # todo: Shouldn't we do that above already? This sounds not so
        #   great performance wise...
        # Special handling for complex numbers
        coeffs_with_im = []
        for coeff in self.coeffs:
            coeffs_with_im.append(coeff)
            if not list(self._data.df[coeff].apply(np.imag).unique()) == [
                0.0
            ]:
                values = self._data.df[coeff]
                self._data.df[coeff] = values.apply(np.real)
                loc = list(self._data.df.columns).index(coeff)
                self._data.df.insert(
                    loc + 1,
                    self.imaginary_prefix + coeff,
                    values.apply(np.imag),
                )
                coeffs_with_im.append(self.imaginary_prefix + coeff)
            else:
                self._data.df[coeff] = self._data.df[coeff].apply(np.real)
        self._data.df.index.name = "index"
        # fixme: Should already be set in worker class
        self.md["spoints"]["coeffs"] = coeffs_with_im
        self._data.md["scan"] = self.md
        self.log.info("Integration done.")
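# Stand-alone sketch of the complex-column splitting performed in write()
# (illustrative data, not part of the library):
#
#     df = pd.DataFrame({"a": [1 + 2j, 3 + 0j]})
#     values = df["a"]
#     df["a"] = values.apply(np.real)
#     df.insert(1, "im_a", values.apply(np.imag))
#     # df now has columns "a" = [1.0, 3.0] and "im_a" = [2.0, 0.0]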