diff --git a/hyppopy/Solver/GridsearchSolver.py b/hyppopy/Solver/GridsearchSolver.py index d80d2d7..a14a5f7 100644 --- a/hyppopy/Solver/GridsearchSolver.py +++ b/hyppopy/Solver/GridsearchSolver.py @@ -1,220 +1,223 @@ # DKFZ # # # Copyright (c) German Cancer Research Center, # Division of Medical and Biological Informatics. # All rights reserved. # # This software is distributed WITHOUT ANY WARRANTY; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR # A PARTICULAR PURPOSE. # # See LICENSE.txt or http://www.mitk.org for details. # # Author: Sven Wanner (s.wanner@dkfz.de) import os import copy import logging import datetime import numpy as np +from pprint import pformat from hyperopt import Trials from scipy.stats import norm from itertools import product from hyppopy.globals import DEBUGLEVEL from .HyppopySolver import HyppopySolver from ..BlackboxFunction import BlackboxFunction LOG = logging.getLogger(os.path.basename(__file__)) LOG.setLevel(DEBUGLEVEL) def get_uniform_axis_sample(a, b, N, dtype): """ returns a uniform sample x(n) in the range [a,b] sampled at N pojnts :param a: left value range bound :param b: right value range bound :param N: discretization of intervall [a,b] :param dtype: data type :return: [list] axis range """ assert a < b, "condition a < b violated!" assert isinstance(N, int), "condition N of type int violated!" assert isinstance(dtype, str), "condition type of type str violated!" if dtype == "int": return list(np.linspace(a, b, N).astype(int)) elif dtype == "float" or dtype == "double": return list(np.linspace(a, b, N)) else: raise AssertionError("dtype {} not supported for uniform sampling!".format(dtype)) def get_norm_cdf(N): """ returns a normed gaussian cdf (range [0,1]) with N sampling points :param N: sampling points :return: [ndarray] gaussian cdf function values """ assert isinstance(N, int), "condition N of type int violated!" even = True if N % 2 != 0: N -= 1 even = False N = int(N/2) sigma = 1/3 x = np.linspace(0, 1, N) y1 = norm.cdf(x, loc=0, scale=sigma)-0.5 if not even: y1 = np.append(y1, [0.5]) y2 = 1-(norm.cdf(x, loc=0, scale=sigma)-0.5) y2 = np.flip(y2, axis=0) y = np.concatenate((y1, y2), axis=0) return y def get_gaussian_axis_sample(a, b, N, dtype): """ returns a function value f(n) where f is a gaussian cdf in range [a, b] and N sampling points :param a: left value range bound :param b: right value range bound :param N: discretization of intervall [a,b] :param dtype: data type :return: [list] axis range """ assert a < b, "condition a < b violated!" assert isinstance(N, int), "condition N of type int violated!" assert isinstance(dtype, str), "condition type of type str violated!" data = [] for n in range(N): x = a + get_norm_cdf(N)[n]*(b-a) if dtype == "int": data.append(int(x)) elif dtype == "float" or dtype == "double": data.append(x) else: raise AssertionError("dtype {} not supported for uniform sampling!".format(dtype)) return data def get_logarithmic_axis_sample(a, b, N, dtype): """ returns a function value f(n) where f is logarithmic function e^x sampling the exponent range [log(a), log(b)] linear at N sampling points. The function values returned are in the range [a, b]. :param a: left value range bound :param b: right value range bound :param N: discretization of intervall [a,b] :param dtype: data type :return: [list] axis range """ assert a < b, "condition a < b violated!" assert a > 0, "condition a > 0 violated!" assert isinstance(N, int), "condition N of type int violated!" assert isinstance(dtype, str), "condition type of type str violated!" # convert input range into exponent range lexp = np.log(a) rexp = np.log(b) exp_range = np.linspace(lexp, rexp, N) data = [] for n in range(exp_range.shape[0]): x = np.exp(exp_range[n]) if dtype == "int": data.append(int(x)) elif dtype == "float" or dtype == "double": data.append(x) else: raise AssertionError("dtype {} not supported for uniform sampling!".format(dtype)) return data class GridsearchSolver(HyppopySolver): """ The GridsearchSolver class implements a gridsearch optimization. The gridsearch supports categorical, uniform, normal and loguniform sampling. To use the GridsearchSolver, besides a range, one must specifiy the number of samples in the domain, e.g. 'data': [0, 1, 100] """ def __init__(self, project=None): HyppopySolver.__init__(self, project) self._tid = None + self._has_maxiteration_field = False def loss_function(self, params): loss = None vals = {} idx = {} for key, value in params.items(): vals[key] = [value] idx[key] = [self._tid] trial = {'tid': self._tid, 'result': {'loss': None, 'status': 'ok'}, 'misc': { 'tid': self._tid, 'idxs': idx, 'vals': vals }, 'book_time': datetime.datetime.now(), 'refresh_time': None } try: loss = self.blackbox(**params) if loss is None: trial['result']['loss'] = np.nan trial['result']['status'] = 'failed' else: trial['result']['loss'] = loss except Exception as e: LOG.error("execution of self.blackbox(**params) failed due to:\n {}".format(e)) trial['result']['loss'] = np.nan trial['result']['status'] = 'failed' trial['refresh_time'] = datetime.datetime.now() self._trials.trials.append(trial) if isinstance(self.blackbox, BlackboxFunction) and self.blackbox.callback_func is not None: cbd = copy.deepcopy(params) cbd['iterations'] = self._tid + 1 cbd['loss'] = loss cbd['status'] = trial['result']['status'] self.blackbox.callback_func(**cbd) return def execute_solver(self, searchspace): self._tid = 0 self._trials = Trials() for x in product(*searchspace[1]): params = {} for name, value in zip(searchspace[0], x): params[name] = value try: self.loss_function(params) self._tid += 1 except Exception as e: msg = "internal error in randomsearch execute_solver occured. {}".format(e) LOG.error(msg) raise BrokenPipeError(msg) self.best = self._trials.argmin def convert_searchspace(self, hyperparameter): """ the function converts the standard parameter input into a range list depending on the domain. These rangelists are later used with itertools product to create a paramater space sample of each combination. :param hyperparameter: [dict] hyperparameter space :return: [list] name and range for each parameter space axis """ + LOG.debug("convert input parameter\n\n\t{}\n".format(pformat(hyperparameter))) searchspace = [[], []] for name, param in hyperparameter.items(): if param["domain"] == "categorical": searchspace[0].append(name) searchspace[1].append(param["data"]) elif param["domain"] == "uniform": searchspace[0].append(name) searchspace[1].append(get_uniform_axis_sample(param["data"][0], param["data"][1], param["data"][2], param["type"])) elif param["domain"] == "normal": searchspace[0].append(name) searchspace[1].append(get_gaussian_axis_sample(param["data"][0], param["data"][1], param["data"][2], param["type"])) elif param["domain"] == "loguniform": searchspace[0].append(name) searchspace[1].append(get_logarithmic_axis_sample(param["data"][0], param["data"][1], param["data"][2], param["type"])) return searchspace diff --git a/hyppopy/Solver/HyppopySolver.py b/hyppopy/Solver/HyppopySolver.py index fb027dd..1c35b41 100644 --- a/hyppopy/Solver/HyppopySolver.py +++ b/hyppopy/Solver/HyppopySolver.py @@ -1,221 +1,223 @@ # DKFZ # # # Copyright (c) German Cancer Research Center, # Division of Medical and Biological Informatics. # All rights reserved. # # This software is distributed WITHOUT ANY WARRANTY; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR # A PARTICULAR PURPOSE. # # See LICENSE.txt or http://www.mitk.org for details. # # Author: Sven Wanner (s.wanner@dkfz.de) import abc import os import types import logging import datetime import numpy as np import pandas as pd from ..globals import DEBUGLEVEL from ..HyppopyProject import HyppopyProject from ..BlackboxFunction import BlackboxFunction from ..VirtualFunction import VirtualFunction from hyppopy.globals import DEBUGLEVEL, DEFAULTITERATIONS LOG = logging.getLogger(os.path.basename(__file__)) LOG.setLevel(DEBUGLEVEL) class HyppopySolver(object): def __init__(self, project=None): self._best = None self._trials = None self._blackbox = None self._max_iterations = None self._project = project self._total_duration = None self._solver_overhead = None self._time_per_iteration = None self._accumulated_blackbox_time = None + self._has_maxiteration_field = True @abc.abstractmethod def execute_solver(self, searchspace): raise NotImplementedError('users must define execute_solver to use this class') @abc.abstractmethod def convert_searchspace(self, hyperparameter): raise NotImplementedError('users must define convert_searchspace to use this class') def run(self, print_stats=True): - if 'solver_max_iterations' not in self.project.__dict__: - msg = "Missing max_iteration entry in project, use default {}!".format(DEFAULTITERATIONS) - LOG.warning(msg) - print("WARNING: {}".format(msg)) - setattr(self.project, 'solver_max_iterations', DEFAULTITERATIONS) - self._max_iterations = self.project.solver_max_iterations + if self._has_maxiteration_field: + if 'solver_max_iterations' not in self.project.__dict__: + msg = "Missing max_iteration entry in project, use default {}!".format(DEFAULTITERATIONS) + LOG.warning(msg) + print("WARNING: {}".format(msg)) + setattr(self.project, 'solver_max_iterations', DEFAULTITERATIONS) + self._max_iterations = self.project.solver_max_iterations start_time = datetime.datetime.now() try: self.execute_solver(self.convert_searchspace(self.project.hyperparameter)) except Exception as e: raise e end_time = datetime.datetime.now() dt = end_time - start_time days = divmod(dt.total_seconds(), 86400) hours = divmod(days[1], 3600) minutes = divmod(hours[1], 60) seconds = divmod(minutes[1], 1) milliseconds = divmod(seconds[1], 0.001) self._total_duration = [int(days[0]), int(hours[0]), int(minutes[0]), int(seconds[0]), int(milliseconds[0])] if print_stats: self.print_best() self.print_timestats() def get_results(self): results = {'duration': [], 'losses': []} pset = self.trials.trials[0]['misc']['vals'] for p in pset.keys(): results[p] = [] for n, trial in enumerate(self.trials.trials): t1 = trial['book_time'] t2 = trial['refresh_time'] results['duration'].append((t2 - t1).microseconds / 1000.0) results['losses'].append(trial['result']['loss']) losses = np.array(results['losses']) results['losses'] = list(losses) pset = trial['misc']['vals'] for p in pset.items(): results[p[0]].append(p[1][0]) return pd.DataFrame.from_dict(results), self.best def print_best(self): print("\n") print("#" * 40) print("### Best Parameter Choice ###") print("#" * 40) for name, value in self.best.items(): print(" - {}\t:\t{}".format(name, value)) print("\n - number of iterations\t:\t{}".format(self.trials.trials[-1]['tid']+1)) print(" - total time\t:\t{}d:{}h:{}m:{}s:{}ms".format(self._total_duration[0], self._total_duration[1], self._total_duration[2], self._total_duration[3], self._total_duration[4])) print("#" * 40) def compute_time_statistics(self): dts = [] for trial in self._trials.trials: if 'book_time' in trial.keys() and 'refresh_time' in trial.keys(): dt = trial['refresh_time'] - trial['book_time'] dts.append(dt.total_seconds()) self._time_per_iteration = np.mean(dts) * 1e3 self._accumulated_blackbox_time = np.sum(dts) * 1e3 tmp = self.total_duration - self._accumulated_blackbox_time self._solver_overhead = int(np.round(100.0 / self.total_duration * tmp)) def print_timestats(self): print("\n") print("#" * 40) print("### Timing Statistics ###") print("#" * 40) print(" - per iteration: {}ms".format(int(self.time_per_iteration*1e4)/10000)) print(" - total time: {}d:{}h:{}m:{}s:{}ms".format(self._total_duration[0], self._total_duration[1], self._total_duration[2], self._total_duration[3], self._total_duration[4])) print(" - solver overhead: {}%".format(self.solver_overhead)) print("#" * 40) @property def project(self): return self._project @project.setter def project(self, value): if not isinstance(value, HyppopyProject): msg = "Input error, project_manager of type: {} not allowed!".format(type(value)) LOG.error(msg) raise IOError(msg) self._project = value @property def blackbox(self): return self._blackbox @blackbox.setter def blackbox(self, value): if isinstance(value, types.FunctionType) or isinstance(value, BlackboxFunction) or isinstance(value, VirtualFunction): self._blackbox = value else: self._blackbox = None msg = "Input error, blackbox of type: {} not allowed!".format(type(value)) LOG.error(msg) raise IOError(msg) @property def best(self): return self._best @best.setter def best(self, value): if not isinstance(value, dict): msg = "Input error, best of type: {} not allowed!".format(type(value)) LOG.error(msg) raise IOError(msg) self._best = value @property def trials(self): return self._trials @trials.setter def trials(self, value): self._trials = value @property def max_iterations(self): return self._max_iterations @max_iterations.setter def max_iterations(self, value): if not isinstance(value, int): msg = "Input error, max_iterations of type: {} not allowed!".format(type(value)) LOG.error(msg) raise IOError(msg) if value < 1: msg = "Precondition violation, max_iterations < 1!" LOG.error(msg) raise IOError(msg) self._max_iterations = value @property def total_duration(self): return (self._total_duration[0] * 86400 + self._total_duration[1] * 3600 + self._total_duration[2] * 60 + self._total_duration[3]) * 1000 + self._total_duration[4] @property def solver_overhead(self): if self._solver_overhead is None: self.compute_time_statistics() return self._solver_overhead @property def time_per_iteration(self): if self._time_per_iteration is None: self.compute_time_statistics() return self._time_per_iteration @property def accumulated_blackbox_time(self): if self._accumulated_blackbox_time is None: self.compute_time_statistics() return self._accumulated_blackbox_time diff --git a/hyppopy/Solver/RandomsearchSolver.py b/hyppopy/Solver/RandomsearchSolver.py index 1d6a5f3..5cbde78 100644 --- a/hyppopy/Solver/RandomsearchSolver.py +++ b/hyppopy/Solver/RandomsearchSolver.py @@ -1,153 +1,198 @@ # DKFZ # # # Copyright (c) German Cancer Research Center, # Division of Medical and Biological Informatics. # All rights reserved. # # This software is distributed WITHOUT ANY WARRANTY; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR # A PARTICULAR PURPOSE. # # See LICENSE.txt or http://www.mitk.org for details. # # Author: Sven Wanner (s.wanner@dkfz.de) import os import copy import random import logging import datetime import numpy as np from pprint import pformat from hyperopt import Trials from hyppopy.globals import DEBUGLEVEL from .HyppopySolver import HyppopySolver from ..BlackboxFunction import BlackboxFunction LOG = logging.getLogger(os.path.basename(__file__)) LOG.setLevel(DEBUGLEVEL) def draw_uniform_sample(param): - assert param['type'] != 'str', "Cannot sample a string list uniformly!" - assert param['data'][0] < param['data'][1], "Precondition violation: data[0] > data[1]!" + """ + function draws a random sample from a uniform range + :param param: [dict] input hyperparameter discription + :return: random sample value of type data['type'] + """ + assert param['type'] != 'str', "cannot sample a string list!" + assert param['data'][0] < param['data'][1], "precondition violation: data[0] > data[1]!" s = random.random() s *= np.abs(param['data'][1] - param['data'][0]) s += param['data'][0] if param['type'] == 'int': s = int(np.round(s)) if s < param['data'][0]: s = int(param['data'][0]) if s > param['data'][1]: s = int(param['data'][1]) return s def draw_normal_sample(param): + """ + function draws a random sample from a normal distributed range + :param param: [dict] input hyperparameter discription + :return: random sample value of type data['type'] + """ + assert param['type'] != 'str', "cannot sample a string list!" + assert param['data'][0] < param['data'][1], "precondition violation: data[0] > data[1]!" mu = (param['data'][1] - param['data'][0]) / 2 sigma = mu / 3 s = np.random.normal(loc=param['data'][0] + mu, scale=sigma) if s > param['data'][1]: s = param['data'][1] if s < param['data'][0]: s = param['data'][0] + s = float(s) + if param["type"] == "int": + s = int(np.round(s)) return s def draw_loguniform_sample(param): + """ + function draws a random sample from a logarithmic distributed range + :param param: [dict] input hyperparameter discription + :return: random sample value of type data['type'] + """ + assert param['type'] != 'str', "cannot sample a string list!" + assert param['data'][0] < param['data'][1], "precondition violation: data[0] > data[1]!" p = copy.deepcopy(param) p['data'][0] = np.log(param['data'][0]) p['data'][1] = np.log(param['data'][1]) assert p['data'][0] is not np.nan, "Precondition violation, left bound input error, results in nan!" assert p['data'][1] is not np.nan, "Precondition violation, right bound input error, results in nan!" x = draw_uniform_sample(p) s = np.exp(x) if s > param['data'][1]: s = param['data'][1] if s < param['data'][0]: s = param['data'][0] return s def draw_categorical_sample(param): + """ + function draws a random sample from a categorical list + :param param: [dict] input hyperparameter discription + :return: random sample value of type data['type'] + """ return random.sample(param['data'], 1)[0] def draw_sample(param): + """ + function draws a sample from the input hyperparameter descriptor depending on it's domain + :param param: [dict] input hyperparameter discription + :return: random sample value of type data['type'] + """ + assert isinstance(param, dict), "input error, hyperparam descriptors of type {} not allowed!".format(type(param)) + assert 'domain' in param.keys(), "input error, hyperparam descriptors need a domain key!" + assert 'data' in param.keys(), "input error, hyperparam descriptors need a data key!" + assert 'type' in param.keys(), "input error, hyperparam descriptors need a type key!" if param['domain'] == "uniform": return draw_uniform_sample(param) elif param['domain'] == "normal": return draw_normal_sample(param) elif param['domain'] == "loguniform": return draw_loguniform_sample(param) elif param['domain'] == "categorical": return draw_categorical_sample(param) else: raise LookupError("Unknown domain {}".format(param['domain'])) class RandomsearchSolver(HyppopySolver): - + """ + The RandomsearchSolver class implements a randomsearch optimization. The randomsearch supports + categorical, uniform, normal and loguniform sampling. The solver draws an independent sample + from the parameter space each iteration.""" def __init__(self, project=None): HyppopySolver.__init__(self, project) self._tid = None def loss_function(self, params): loss = None vals = {} idx = {} for key, value in params.items(): vals[key] = [value] idx[key] = [self._tid] trial = {'tid': self._tid, 'result': {'loss': None, 'status': 'ok'}, 'misc': { 'tid': self._tid, 'idxs': idx, 'vals': vals }, 'book_time': datetime.datetime.now(), 'refresh_time': None } try: loss = self.blackbox(**params) if loss is None: trial['result']['loss'] = np.nan trial['result']['status'] = 'failed' else: trial['result']['loss'] = loss except Exception as e: LOG.error("execution of self.blackbox(**params) failed due to:\n {}".format(e)) trial['result']['loss'] = np.nan trial['result']['status'] = 'failed' trial['refresh_time'] = datetime.datetime.now() self._trials.trials.append(trial) if isinstance(self.blackbox, BlackboxFunction) and self.blackbox.callback_func is not None: cbd = copy.deepcopy(params) cbd['iterations'] = self._tid + 1 cbd['loss'] = loss cbd['status'] = trial['result']['status'] self.blackbox.callback_func(**cbd) return def execute_solver(self, searchspace): self._tid = 0 self._trials = Trials() N = self.max_iterations try: for n in range(N): params = {} for name, p in searchspace.items(): params[name] = draw_sample(p) self.loss_function(params) self._tid += 1 except Exception as e: msg = "internal error in randomsearch execute_solver occured. {}".format(e) LOG.error(msg) raise BrokenPipeError(msg) self.best = self._trials.argmin def convert_searchspace(self, hyperparameter): + """ + this function simply pipes the input parameter through, the sample + drawing functions are responsible for interpreting the parameter. + :param hyperparameter: [dict] hyperparameter space + :return: [dict] hyperparameter space + """ LOG.debug("convert input parameter\n\n\t{}\n".format(pformat(hyperparameter))) return hyperparameter diff --git a/hyppopy/tests/test_randomsearchsolver.py b/hyppopy/tests/test_randomsearchsolver.py new file mode 100644 index 0000000..8ef60f8 --- /dev/null +++ b/hyppopy/tests/test_randomsearchsolver.py @@ -0,0 +1,134 @@ +# DKFZ +# +# +# Copyright (c) German Cancer Research Center, +# Division of Medical and Biological Informatics. +# All rights reserved. +# +# This software is distributed WITHOUT ANY WARRANTY; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR +# A PARTICULAR PURPOSE. +# +# See LICENSE.txt or http://www.mitk.org for details. +# +# Author: Sven Wanner (s.wanner@dkfz.de) + +import unittest +import matplotlib.pylab as plt + +from ..solver.RandomsearchSolver import * +from ..VirtualFunction import VirtualFunction +from hyppopy.HyppopyProject import HyppopyProject + + +class RandomsearchTestSuite(unittest.TestCase): + + def setUp(self): + pass + + def test_draw_uniform_sample(self): + param = {"data": [0, 1, 10], + "type": "float"} + values = [] + for i in range(10000): + values.append(draw_uniform_sample(param)) + self.assertTrue(0 <= values[-1] <= 1) + self.assertTrue(isinstance(values[-1], float)) + hist = plt.hist(values, bins=10, normed=True) + std = np.std(hist[0]) + mean = np.mean(hist[0]) + self.assertTrue(std < 0.05) + self.assertTrue(0.9 < mean < 1.1) + + param = {"data": [0, 10, 11], + "type": "int"} + values = [] + for i in range(10000): + values.append(draw_uniform_sample(param)) + self.assertTrue(0 <= values[-1] <= 10) + self.assertTrue(isinstance(values[-1], int)) + hist = plt.hist(values, bins=11, normed=True) + std = np.std(hist[0]) + mean = np.mean(hist[0]) + self.assertTrue(std < 0.05) + self.assertTrue(0.09 < mean < 0.11) + + def test_draw_normal_sample(self): + param = {"data": [0, 10, 11], + "type": "int"} + values = [] + for i in range(10000): + values.append(draw_normal_sample(param)) + self.assertTrue(0 <= values[-1] <= 10) + self.assertTrue(isinstance(values[-1], int)) + hist = plt.hist(values, bins=11, normed=True) + for i in range(1, 5): + self.assertTrue(hist[0][i-1]-hist[0][i] < 0) + for i in range(5, 10): + self.assertTrue(hist[0][i] - hist[0][i+1] > 0) + + def test_draw_loguniform_sample(self): + param = {"data": [1, 1000, 11], + "type": "float"} + values = [] + for i in range(10000): + values.append(draw_loguniform_sample(param)) + self.assertTrue(1 <= values[-1] <= 1000) + self.assertTrue(isinstance(values[-1], float)) + hist = plt.hist(values, bins=11, normed=True) + for i in range(10): + self.assertTrue(hist[0][i] > hist[0][i+1]) + + def test_draw_categorical_sample(self): + param = {"data": [1, 2, 3], + "type": int} + values = [] + for i in range(10000): + values.append(draw_categorical_sample(param)) + self.assertTrue(values[-1] == 1 or values[-1] == 2 or values[-1] == 3) + self.assertTrue(isinstance(values[-1], int)) + hist = plt.hist(values, bins=3, normed=True) + for i in range(3): + self.assertTrue(0.45 < hist[0][i] < 0.55) + + def test_solver_complete(self): + config = { + "hyperparameter": { + "axis_00": { + "domain": "normal", + "data": [300, 800, 11], + "type": "float" + }, + "axis_01": { + "domain": "normal", + "data": [-1, 1, 11], + "type": "float" + }, + "axis_02": { + "domain": "uniform", + "data": [0, 10, 11], + "type": "float" + } + }, + "settings": { + "solver": {"max_iterations": 5000}, + "custom": {} + }} + + project = HyppopyProject(config) + solver = RandomsearchSolver(project) + vfunc = VirtualFunction() + vfunc.load_default() + solver.blackbox = vfunc + solver.run(print_stats=False) + df, best = solver.get_results() + print("best['axis_00']={}".format(best['axis_00'])) + print("best['axis_01']={}".format(best['axis_01'])) + print("best['axis_02']={}".format(best['axis_02'])) + self.assertTrue(570 < best['axis_00'] < 590) + self.assertTrue(0.1 < best['axis_01'] < 0.8) + self.assertTrue(4.5 < best['axis_02'] < 6) + + +if __name__ == '__main__': + unittest.main()