diff --git a/examples/performance_test.py b/examples/performance_test.py deleted file mode 100644 index 2f5c523..0000000 --- a/examples/performance_test.py +++ /dev/null @@ -1,583 +0,0 @@ -# DKFZ -# -# -# Copyright (c) German Cancer Research Center, -# Division of Medical and Biological Informatics. -# All rights reserved. -# -# This software is distributed WITHOUT ANY WARRANTY; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR -# A PARTICULAR PURPOSE. -# -# See LICENSE.txt or http://www.mitk.org for details. -# -# Author: Sven Wanner (s.wanner@dkfz.de) - -import os -import sys -import shutil -import argparse -import tempfile -import numpy as np -import pandas as pd -import seaborn as sns -from sklearn.svm import SVC -import matplotlib.pyplot as plt -from sklearn.metrics import accuracy_score -from sklearn.ensemble import AdaBoostClassifier -from sklearn.datasets import load_breast_cancer -from sklearn.neighbors import KNeighborsClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import train_test_split -from sklearn.ensemble import GradientBoostingClassifier - -from hyppopy.projectmanager import ProjectManager -from hyppopy.workflows.svc_usecase.svc_usecase import svc_usecase -from hyppopy.workflows.knc_usecase.knc_usecase import knc_usecase -from hyppopy.workflows.adaboost_usecase.adaboost_usecase import adaboost_usecase -from hyppopy.workflows.randomforest_usecase.randomforest_usecase import randomforest_usecase -from hyppopy.workflows.gradientboost_usecase.gradientboost_usecase import gradientboost_usecase - -sns.set(style="ticks") -sns.set(style="darkgrid") - - -class PerformanceTest(object): - - def __init__(self, root=None, size='small', plugin='hyperopt'): - if root is None: - self.root = os.path.join(tempfile.gettempdir(), 'test_data') - else: - self.root = os.path.join(root, 'test_data') - if not os.path.isdir(self.root): - os.makedirs(self.root) - self.test = None - self.train = None - 
self.config = None - self.size = size - self.plugin = plugin - if plugin == 'gridsearch': - print("Gridsearch is not available for this performance test!") - sys.exit() - if size == 'small': - self.iter_sequence = [25] - elif size == 'medium': - self.iter_sequence = [50, 150, 500] - elif size == 'big': - self.iter_sequence = [5, 10, 25, 50, 100, 150, 300, 500, 800, 1200] - - - def run(self, usecase='all'): - self.set_up() - print("") - print("#" * 40) - print("# Hyppopy performance test\n# usecase={}\n# size={}\n# plugin={}".format(usecase, self.size, self.plugin)) - print("#" * 40) - # if usecase == 'svc' or usecase == 'all': - # self.run_svc_usecase() - if usecase == 'knc' or usecase == 'all': - self.run_knc_usecase() - if usecase == 'randomforest' or usecase == 'all': - self.run_randomforest_usecase() - if usecase == 'gradientboost' or usecase == 'all': - self.run_gradientboost_usecase() - if usecase == 'adaboost' or usecase == 'all': - self.run_adaboost_usecase() - - def set_hyperparameter(self, params): - self.config["hyperparameter"] = params - - def set_output_dir(self, dirname): - self.config["settings"]["solver_plugin"]["output_dir"] = os.path.join(self.root, dirname) - - def set_iterations(self, value): - self.config["settings"]["solver_plugin"]["max_iterations"] = value - - def find_loss_and_time(self, solver_output, results): - min_idx = solver_output["losses"].idxmin() - results["losses"].append(solver_output["losses"][min_idx]) - results["duration"].append(solver_output["duration"][min_idx]) - - def plot_matrix(self, df, name=""): - sns_plot = sns.pairplot(df, height=1.8, aspect=1.8) - - fig = sns_plot.fig - fig.subplots_adjust(top=0.93, wspace=0.3) - t = fig.suptitle(name, fontsize=14) - plt.show() - return sns_plot - - def plot(self, df, x, y, name="", save=None, show=True): - fig, axs = plt.subplots(nrows=len(y), ncols=len(x), figsize=(12.0, len(y)*3)) - fig.subplots_adjust(left=0.08, right=0.98, wspace=0.3) - - argmin = df["losses"].idxmin() - - 
for nx, _x in enumerate(x): - for ny, _y in enumerate(y): - ax = axs[ny, nx] - ax.plot(df[_x].values, df[_y].values, 'o') - ax.plot(df[_x].values[argmin], df[_y].values[argmin], 'ro') - ax.grid(True) - if nx == 0: - ax.set_ylabel(_y) - if ny == len(y)-1: - ax.set_xlabel(_x) - fig.suptitle(name, fontsize=16) - if save is not None: - if not os.path.isdir(os.path.dirname(save)): - os.makedirs(os.path.dirname(save)) - plt.savefig(save) - if show: - plt.show() - - - def set_up(self): - breast_cancer_data = load_breast_cancer() - x = breast_cancer_data.data - y = breast_cancer_data.target - x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=23) - - x_train_fname = os.path.join(self.root, 'x_train.npy') - y_train_fname = os.path.join(self.root, 'y_train.npy') - if not os.path.isfile(x_train_fname): - np.save(x_train_fname, x_train) - if not os.path.isfile(y_train_fname): - np.save(y_train_fname, y_train) - - self.train = [x_train, y_train] - self.test = [x_test, y_test] - self.config = { - "hyperparameter": {}, - "settings": { - "solver_plugin": { - "max_iterations": 1, - "use_plugin": self.plugin, - "output_dir": os.path.join(self.root, 'test_results') - }, - "custom": { - "data_path": self.root, - "data_name": "x_train.npy", - "labels_name": "y_train.npy" - } - }} - - def run_svc_usecase(self): - print("\n") - print("*" * 30) - print("SVC Classifier") - print("*" * 30) - print("\n") - hp = { - "C": { - "domain": "uniform", - "data": [0.0001, 300.0], - "type": "float" - }, - "kernel": { - "domain": "categorical", - "data": ["linear", "poly", "rbf"], - "type": "str" - } - } - - self.set_hyperparameter(hp) - self.set_output_dir("svc_usecase") - - results = {"iterations": [], "C": [], "kernel": [], "accuracy": [], "losses": [], "duration": []} - status = True - for n in self.iter_sequence: - try: - self.set_iterations(n) - ProjectManager.set_config(self.config) - uc = svc_usecase() - uc.run(save=True) - res, best = uc.get_results() - clf 
= SVC(C=best['n_estimators'], - kernel=hp['kernel']['data'][best['kernel']]) - clf.fit(self.train[0], self.train[1]) - train_predictions = clf.predict(self.test[0]) - acc = accuracy_score(self.test[1], train_predictions) - - results['accuracy'].append(acc) - results['iterations'].append(n) - results['kernel'].append(best['kernel']) - results['C'].append(best['C']) - - self.find_loss_and_time(res, results) - - print("=" * 30) - print("Number of iterations: {}".format(n)) - print("Classifier: {}".format(clf.__class__.__name__)) - print("=" * 30) - print("=" * 30) - for p in best.items(): - print(p[0], ":", p[1]) - print("=" * 30) - print("Accuracy: {:.4%}".format(acc)) - print("=" * 30) - print("\n") - - except Exception as e: - print("Failed at iteration step {}, reason: {}".format(n, e)) - status = False - - if status: - df = pd.DataFrame.from_dict(results) - df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf.__class__.__name__))) - self.plot(df, x=["iterations", "losses"], - y=['accuracy', 'duration'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final1_{}.png".format(clf.__class__.__name__)), - show=False) - self.plot(df, x=["iterations", "losses"], - y=['n_estimators', 'max_depth', 'max_features'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final2_{}.png".format(clf.__class__.__name__)), - show=False) - - def run_randomforest_usecase(self): - print("\n") - print("*" * 30) - print("RandomForest Classifier") - print("*" * 30) - print("\n") - hp = { - "n_estimators": { - "domain": "uniform", - "data": [3, 500], - "type": "int" - }, - "max_depth": { - "domain": "uniform", - "data": [1, 50], - "type": "int" - }, - "max_features": { - "domain": "categorical", - "data": ["auto", "sqrt", "log2"], - "type": "str" - } - } - - self.set_hyperparameter(hp) - self.set_output_dir("randomforest_usecase") - - results = {"iterations": [], "n_estimators": [], "max_depth": [], 
"max_features": [], "accuracy": [], "losses": [], "duration": []} - status = True - for n in self.iter_sequence: - try: - self.set_iterations(n) - ProjectManager.set_config(self.config) - uc = randomforest_usecase() - uc.run(save=True) - res, best = uc.get_results() - clf = RandomForestClassifier(n_estimators=best['n_estimators'], - max_depth=best['max_depth'], - max_features=best['max_features']) - clf.fit(self.train[0], self.train[1]) - train_predictions = clf.predict(self.test[0]) - acc = accuracy_score(self.test[1], train_predictions) - - results['accuracy'].append(acc) - results['iterations'].append(n) - results['n_estimators'].append(best['n_estimators']) - results['max_depth'].append(best['max_depth']) - results['max_features'].append(best['max_features']) - - self.find_loss_and_time(res, results) - - print("=" * 30) - print("Number of iterations: {}".format(n)) - print("Classifier: {}".format(clf.__class__.__name__)) - print("=" * 30) - print("=" * 30) - for p in best.items(): - print(p[0], ":", p[1]) - print("=" * 30) - print("Accuracy: {:.4%}".format(acc)) - print("=" * 30) - print("\n") - except Exception as e: - print("Failed at iteration step {}, reason: {}".format(n, e)) - status = False - - if status: - df = pd.DataFrame.from_dict(results) - df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf.__class__.__name__))) - self.plot(df, x=["iterations", "losses"], - y=['accuracy', 'duration'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final1_{}.png".format(clf.__class__.__name__)), - show=False) - self.plot(df, x=["iterations", "losses"], - y=['n_estimators', 'max_depth', 'max_features'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final2_{}.png".format(clf.__class__.__name__)), - show=False) - - def run_adaboost_usecase(self): - print("\n") - print("*"*30) - print("AdaBoost Classifier") - print("*"*30) - print("\n") - hp = { - "n_estimators": { - "domain": 
"uniform", - "data": [1, 500], - "type": "int" - }, - "learning_rate": { - "domain": "loguniform", - "data": [-10, 2], - "type": "float" - } - } - - self.set_hyperparameter(hp) - self.set_output_dir("adaboost_usecase") - - results = {"iterations": [], "n_estimators": [], "learning_rate": [], "accuracy": [], "losses": [], "duration": []} - status = True - for n in self.iter_sequence: - try: - self.set_iterations(n) - ProjectManager.set_config(self.config) - uc = adaboost_usecase() - uc.run(save=True) - res, best = uc.get_results() - clf = AdaBoostClassifier(n_estimators=best['n_estimators'], - learning_rate=best['learning_rate']) - clf.fit(self.train[0], self.train[1]) - train_predictions = clf.predict(self.test[0]) - acc = accuracy_score(self.test[1], train_predictions) - - results['accuracy'].append(acc) - results['iterations'].append(n) - results['n_estimators'].append(best['n_estimators']) - results['learning_rate'].append(best['learning_rate']) - - self.find_loss_and_time(res, results) - - print("=" * 30) - print("Number of iterations: {}".format(n)) - print("Classifier: {}".format(clf.__class__.__name__)) - print("=" * 30) - print("=" * 30) - for p in best.items(): - print(p[0], ":", p[1]) - print("=" * 30) - print("Accuracy: {:.4%}".format(acc)) - print("=" * 30) - print("\n") - except Exception as e: - print("Failed at iteration step {}, reason: {}".format(n, e)) - status = False - - if status: - df = pd.DataFrame.from_dict(results) - df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf.__class__.__name__))) - self.plot(df, x=["iterations", "losses"], - y=['accuracy', 'duration'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final1_{}.png".format(clf.__class__.__name__)), - show=False) - self.plot(df, x=["iterations", "losses"], - y=['n_estimators', 'learning_rate'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final2_{}.png".format(clf.__class__.__name__)), - 
show=False) - - def run_knc_usecase(self): - print("\n") - print("*" * 30) - print("KN Classifier") - print("*" * 30) - print("\n") - hp = { - "n_neighbors": { - "domain": "uniform", - "data": [1, 100], - "type": "int" - }, - "weights": { - "domain": "categorical", - "data": ["uniform", "distance"], - "type": "str" - }, - "algorithm": { - "domain": "categorical", - "data": ["auto", "ball_tree", "kd_tree", "brute"], - "type": "str" - } - } - - self.set_hyperparameter(hp) - self.set_output_dir("knc_usecase") - - results = {"iterations": [], "n_neighbors": [], "weights": [], "algorithm": [], "accuracy": [], "losses": [], "duration": []} - status = True - for n in self.iter_sequence: - try: - self.set_iterations(n) - ProjectManager.set_config(self.config) - uc = knc_usecase() - uc.run(save=True) - res, best = uc.get_results() - clf = KNeighborsClassifier(n_neighbors=best['n_neighbors'], - weights=best['weights'], - algorithm=best['algorithm']) - clf.fit(self.train[0], self.train[1]) - train_predictions = clf.predict(self.test[0]) - acc = accuracy_score(self.test[1], train_predictions) - - results['accuracy'].append(acc) - results['iterations'].append(n) - results['n_neighbors'].append(best['n_neighbors']) - results['weights'].append(best['weights']) - results['algorithm'].append(best['algorithm']) - - self.find_loss_and_time(res, results) - - print("=" * 30) - print("Number of iterations: {}".format(n)) - print("Classifier: {}".format(clf.__class__.__name__)) - print("=" * 30) - print("=" * 30) - for p in best.items(): - print(p[0], ":", p[1]) - print("=" * 30) - print("Accuracy: {:.4%}".format(acc)) - print("=" * 30) - print("\n") - except Exception as e: - print("Failed at iteration step {}, reason: {}".format(n, e)) - status = False - - if status: - df = pd.DataFrame.from_dict(results) - df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf.__class__.__name__))) - self.plot(df, x=["iterations", "losses"], - y=['accuracy', 'duration'], - name="Classifier: 
{}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final1_{}.png".format(clf.__class__.__name__)), - show=False) - self.plot(df, x=["iterations", "losses"], - y=['n_neighbors', 'weights', 'algorithm'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final2_{}.png".format(clf.__class__.__name__)), - show=False) - - def run_gradientboost_usecase(self): - print("\n") - print("*" * 30) - print("GradientBoost Classifier") - print("*" * 30) - print("\n") - hp = { - "n_estimators": { - "domain": "uniform", - "data": [3, 500], - "type": "int" - }, - "learning_rate": { - "domain": "loguniform", - "data": [-10, 2], - "type": "float" - }, - "min_samples_split": { - "domain": "uniform", - "data": [0.0, 1.0], - "type": "float" - }, - "min_samples_leaf": { - "domain": "uniform", - "data": [0.00001, 0.5], - "type": "float" - }, - "max_depth": { - "domain": "uniform", - "data": [1, 50], - "type": "int" - } - } - - self.set_hyperparameter(hp) - self.set_output_dir("gradientboost_usecase") - - results = {"iterations": [], "n_estimators": [], "max_depth": [], - "learning_rate": [], "min_samples_split": [], "min_samples_leaf": [], - "accuracy": [], "losses": [], "duration": []} - status = True - for n in self.iter_sequence: - try: - self.set_iterations(n) - ProjectManager.set_config(self.config) - uc = gradientboost_usecase() - uc.run(save=True) - res, best = uc.get_results() - clf = GradientBoostingClassifier(n_estimators=best['n_estimators'], - max_depth=best['max_depth'], - learning_rate=best['learning_rate'], - min_samples_split=best['min_samples_split'], - min_samples_leaf=best['min_samples_leaf']) - clf.fit(self.train[0], self.train[1]) - train_predictions = clf.predict(self.test[0]) - acc = accuracy_score(self.test[1], train_predictions) - - results['accuracy'].append(acc) - results['iterations'].append(n) - results['n_estimators'].append(best['n_estimators']) - results['max_depth'].append(best['max_depth']) - 
results['learning_rate'].append(best['learning_rate']) - results['min_samples_split'].append(best['min_samples_split']) - results['min_samples_leaf'].append(best['min_samples_leaf']) - - self.find_loss_and_time(res, results) - - print("=" * 30) - print("Number of iterations: {}".format(n)) - print("Classifier: {}".format(clf.__class__.__name__)) - print("=" * 30) - print("=" * 30) - for p in best.items(): - print(p[0], ":", p[1]) - print("=" * 30) - print("Accuracy: {:.4%}".format(acc)) - print("=" * 30) - print("\n") - except Exception as e: - print("Failed at iteration step {}, reason: {}".format(n, e)) - status = False - - if status: - df = pd.DataFrame.from_dict(results) - df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf.__class__.__name__))) - self.plot(df, x=["iterations", "losses"], - y=['accuracy', 'duration'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final1_{}.png".format(clf.__class__.__name__)), - show=False) - self.plot(df, x=["iterations", "losses"], - y=['n_estimators', 'max_depth', 'learning_rate', 'min_samples_split', 'min_samples_leaf'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final2_{}.png".format(clf.__class__.__name__)), - show=False) - - - def clean_up(self): - if os.path.isdir(self.root): - shutil.rmtree(self.root) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Hyppopy performance test.') - parser.add_argument('-u', '--usecase', type=str, help='usecase to be executed', default='all') - parser.add_argument('-s', '--size', type=str, help='size of the test [small, medium, big]', default='small') - parser.add_argument('-p', '--plugin', type=str, help='plugin to be used', default='hyperopt') - parser.add_argument('-o', '--output', type=str, default=None, help='output path to store result, default is temp on you system') - args = parser.parse_args() - - performance_test = PerformanceTest(root=args.output, 
size=args.size, plugin=args.plugin) - performance_test.run(usecase=args.usecase) diff --git a/examples/quality_tests.py b/examples/quality_tests.py deleted file mode 100644 index 584cdba..0000000 --- a/examples/quality_tests.py +++ /dev/null @@ -1,345 +0,0 @@ -# DKFZ -# -# -# Copyright (c) German Cancer Research Center, -# Division of Medical and Biological Informatics. -# All rights reserved. -# -# This software is distributed WITHOUT ANY WARRANTY; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR -# A PARTICULAR PURPOSE. -# -# See LICENSE.txt or http://www.mitk.org for details. -# -# Author: Sven Wanner (s.wanner@dkfz.de) - - - -import os -import sys -import time -import argparse -import tempfile -import numpy as np -import pandas as pd - - -try: - import hyppopy as hp - from hyppopy.globals import ROOT - from hyppopy.VirtualFunction import VirtualFunction -except Exception as e: - sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) - import hyppopy as hp - from hyppopy.globals import ROOT - from hyppopy.VirtualFunction import VirtualFunction - -TEMP = tempfile.gettempdir() -DATADIR = os.path.join(os.path.join(ROOT, os.path.join('hyppopy', 'virtualparameterspace')), "6D") - -vfunc = VirtualFunction() -vfunc.load_images(DATADIR) -minima = vfunc.minima() -# for i in range(6): -# mini = minima[i] -# vfunc.plot(i, title="axis_{} min_x={} min_loss={}".format(str(i).zfill(2), np.mean(mini[0]), mini[1])) - - -def blackboxfunction(data, params): - return sum(vfunc(*params.values())) - - -def getConfig(*args, **kwargs): - if 'output_dir' in kwargs.keys() and kwargs['output_dir'] is not None: - output_dir = kwargs['output_dir'] - else: - output_dir = TEMP - if 'plugin' in kwargs.keys(): - plugin = kwargs['plugin'] - else: - plugin = 'hyperopt' - - max_iterations = 0 - if 'max_iterations' in kwargs.keys(): - max_iterations = kwargs['max_iterations'] - - if len(args) < 6: - print("Missing hyperparameter abortion!") - 
sys.exit() - - config = { - "hyperparameter": { - "axis_0": { - "domain": "uniform", - "data": args[0], - "type": "float" - }, - "axis_1": { - "domain": "uniform", - "data": args[1], - "type": "float" - }, - "axis_2": { - "domain": "uniform", - "data": args[2], - "type": "float" - }, - "axis_3": { - "domain": "uniform", - "data": args[3], - "type": "float" - }, - "axis_4": { - "domain": "uniform", - "data": args[4], - "type": "float" - }, - "axis_5": { - "domain": "uniform", - "data": args[5], - "type": "float" - } - }, - "settings": { - "solver_plugin": { - "max_iterations": max_iterations, - "use_plugin": plugin, - "output_dir": output_dir - } - } - } - return config - - -def test_randomsearch(output_dir): - print("#" * 30) - print("# RANDOMSEARCH") - print("# output_dir={}".format(output_dir)) - print("#" * 30) - - ranges = [[0, 1], - [0, 800], - [-1, 1], - [0, 5], - [0, 10000], - [0, 10]] - args = {'plugin': 'randomsearch', 'output_dir': output_dir} - config = getConfig(*ranges, **args) - return config - - -def test_hyperopt(output_dir): - print("#" * 30) - print("# HYPEROPT") - print("# output_dir={}".format(output_dir)) - print("#" * 30) - - ranges = [[0, 1], - [0, 800], - [-1, 1], - [0, 5], - [0, 10000], - [0, 10]] - args = {'plugin': 'hyperopt', 'output_dir': output_dir} - config = getConfig(*ranges, **args) - return config - - -def test_optunity(output_dir): - print("#" * 30) - print("# OPTUNITY") - print("# output_dir={}".format(output_dir)) - print("#" * 30) - - ranges = [[0, 1], - [0, 800], - [-1, 1], - [0, 5], - [0, 10000], - [0, 10]] - args = {'plugin': 'optunity', 'output_dir': output_dir} - config = getConfig(*ranges, **args) - return config - - -def analyse_iteration_characteristics(configs): - N = 50 - num_of_iterations = [5, 10, 25, 50, 100, 250, 500, 750, 1000, 1500, 2000] - results = {'iteration': [], - 'time_overhead': [], - 'time_overhead_std': [], - 'accuracy': [], - 'accuracy_std': [], - 'plugin': []} - - accuracies = {} - time_overheads = 
{} - for plugin in configs.keys(): - accuracies[plugin] = [] - time_overheads[plugin] = [] - - for it in num_of_iterations: - for plugin, config in configs.items(): - print("\riteration loop: {} for plugin {}".format(it, plugin)) - for p, v in accuracies.items(): - v.clear() - for p, v in time_overheads.items(): - v.clear() - for n in range(N): - print("\rrepeat loop: {}".format(n), end="") - config["settings"]["solver_plugin"]["max_iterations"] = it - if not hp.ProjectManager.set_config(config): - print("Invalid config dict!") - sys.exit() - - solver = hp.SolverFactory.get_solver() - solver.set_loss_function(blackboxfunction) - solver.set_data(None) - - start = time.process_time() - solver.run() - end = time.process_time() - time_overheads[plugin].append(end-start) - res, best = solver.get_results() - best_loss = 0 - for i, p in enumerate(best.items()): - best_loss += minima[i][1] - reached_loss = np.min(res["losses"].values) - accuracies[plugin].append(100.0/best_loss*reached_loss) - - print("\r") - results['iteration'].append(it) - results['time_overhead'].append(np.mean(time_overheads[plugin])) - results['accuracy'].append(np.mean(accuracies[plugin])) - results['time_overhead_std'].append(np.std(time_overheads[plugin])) - results['accuracy_std'].append(np.std(accuracies[plugin])) - results['plugin'].append(plugin) - - return results - - -def analyse_random_normal_search(output_dir): - config = { - "hyperparameter": { - "axis_0": { - "domain": "normal", - "data": [0.0, 0.2], - "type": "float" - }, - "axis_1": { - "domain": "normal", - "data": [500, 700], - "type": "float" - }, - "axis_2": { - "domain": "normal", - "data": [-0.2, 0.9], - "type": "float" - }, - "axis_3": { - "domain": "normal", - "data": [0.0, 3.0], - "type": "float" - }, - "axis_4": { - "domain": "normal", - "data": [6000, 10000], - "type": "float" - }, - "axis_5": { - "domain": "normal", - "data": [3, 7], - "type": "float" - } - }, - "settings": { - "solver_plugin": { - "max_iterations": 0, - 
"use_plugin": 'randomsearch', - "output_dir": output_dir - } - } - } - - N = 50 - num_of_iterations = [5, 10, 25, 50, 100, 250, 500, 750, 1000, 1500, 2000] - - results = {'iteration': [], - 'time_overhead': [], - 'time_overhead_std': [], - 'accuracy': [], - 'accuracy_std': []} - - accuracies = [] - time_overheads = [] - for it in num_of_iterations: - config["settings"]["solver_plugin"]["max_iterations"] = it - print("\riteration loop: {}".format(it)) - accuracies.clear() - time_overheads.clear() - for n in range(N): - print("\rrepeat loop: {}".format(n), end="") - if not hp.ProjectManager.set_config(config): - print("Invalid config dict!") - sys.exit() - - solver = hp.SolverFactory.get_solver() - solver.set_loss_function(blackboxfunction) - solver.set_data(None) - - start = time.process_time() - solver.run() - end = time.process_time() - time_overheads.append(end - start) - res, best = solver.get_results() - best_loss = 0 - for i, p in enumerate(best.items()): - best_loss += minima[i][1] - reached_loss = np.min(res["losses"].values) - accuracies.append(100.0 / best_loss * reached_loss) - - print("\r") - results['iteration'].append(it) - results['time_overhead'].append(np.mean(time_overheads)) - results['accuracy'].append(np.mean(accuracies)) - results['time_overhead_std'].append(np.std(time_overheads)) - results['accuracy_std'].append(np.std(accuracies)) - - return results - - -if __name__ == "__main__": - print("") - parser = argparse.ArgumentParser(description='Hyppopy Quality Test Executable') - parser.add_argument('-o', '--output', type=str, default=None, help='output path to store result') - parser.add_argument('-p', '--plugin', type=str, default=None, help='if set analysis is only executed on this plugin') - args = parser.parse_args() - - do_analyse_iteration_characteristics = True - do_analyse_random_normal_search = False - - funcs = [x for x in locals().keys() if x.startswith("test_")] - configs = {} - for f in funcs: - if args.plugin is not None: - if not 
f.endswith(args.plugin): - continue - configs[f.split("_")[1]] = locals()[f](args.output) - - if do_analyse_iteration_characteristics: - start = time.process_time() - data = analyse_iteration_characteristics(configs) - end = time.process_time() - print("Total duration analyse_iteration_characteristics: {} min".format((end-start)/60)) - df = pd.DataFrame.from_dict(data) - fname = os.path.join(args.output, "analyse_iteration_characteristics.csv") - df.to_csv(fname, index=False) - - if do_analyse_random_normal_search: - start = time.process_time() - data = analyse_random_normal_search(args.output) - end = time.process_time() - print("Total duration analyse_random_normal_search: {} min".format((end - start) / 60)) - df = pd.DataFrame.from_dict(data) - fname = os.path.join(args.output, "analyse_random_normal_search.csv") - df.to_csv(fname, index=False) diff --git a/examples/solver_comparison.py b/examples/solver_comparison.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/tutorial_custom_visualization.py b/examples/tutorial_custom_visualization.py index c4137b3..9237f13 100644 --- a/examples/tutorial_custom_visualization.py +++ b/examples/tutorial_custom_visualization.py @@ -1,101 +1,92 @@ import matplotlib.pylab as plt +from hyppopy.SolverPool import SolverPool from hyppopy.HyppopyProject import HyppopyProject from hyppopy.VirtualFunction import VirtualFunction from hyppopy.BlackboxFunction import BlackboxFunction -from hyppopy.solver.HyperoptSolver import HyperoptSolver -from hyppopy.solver.OptunitySolver import OptunitySolver -from hyppopy.solver.RandomsearchSolver import RandomsearchSolver project = HyppopyProject() project.add_hyperparameter(name="axis_00", domain="uniform", data=[0, 1], dtype="float") project.add_hyperparameter(name="axis_01", domain="uniform", data=[0, 800], dtype="float") project.add_hyperparameter(name="axis_02", domain="uniform", data=[0, 5], dtype="float") project.add_hyperparameter(name="axis_03", domain="uniform", 
data=[1, 10000], dtype="float") project.add_hyperparameter(name="axis_04", domain="uniform", data=[0, 10], dtype="float") project.add_settings(section="solver", name="max_iterations", value=500) project.add_settings(section="custom", name="use_solver", value="randomsearch") plt.ion() fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 8), sharey=True) plot_data = {"iterations": [], "loss": [], "axis_00": [], "axis_01": [], "axis_02": [], "axis_03": [], "axis_04": []} def my_visualization_function(**kwargs): print("\r{}".format(kwargs), end="") plot_data["iterations"].append(kwargs['iterations']) plot_data["loss"].append(kwargs['loss']) plot_data["axis_00"].append(kwargs['axis_00']) plot_data["axis_01"].append(kwargs['axis_01']) plot_data["axis_02"].append(kwargs['axis_02']) plot_data["axis_03"].append(kwargs['axis_03']) plot_data["axis_04"].append(kwargs['axis_04']) axes[0, 0].clear() axes[0, 0].scatter(plot_data["axis_00"], plot_data["loss"], c=plot_data["loss"], cmap="jet", marker='.') axes[0, 0].set_ylabel("loss") axes[0, 0].set_xlabel("axis_00") axes[0, 1].clear() axes[0, 1].scatter(plot_data["axis_01"], plot_data["loss"], c=plot_data["loss"], cmap="jet", marker='.') axes[0, 1].set_xlabel("axis_01") axes[0, 2].clear() axes[0, 2].scatter(plot_data["axis_02"], plot_data["loss"], c=plot_data["loss"], cmap="jet", marker='.') axes[0, 2].set_xlabel("axis_02") axes[1, 0].clear() axes[1, 0].scatter(plot_data["axis_03"], plot_data["loss"], c=plot_data["loss"], cmap="jet", marker='.') axes[1, 0].set_ylabel("loss") axes[1, 0].set_xlabel("axis_03") axes[1, 1].clear() axes[1, 1].scatter(plot_data["axis_04"], plot_data["loss"], c=plot_data["loss"], cmap="jet", marker='.') axes[1, 1].set_xlabel("axis_04") axes[1, 2].clear() axes[1, 2].plot(plot_data["iterations"], plot_data["loss"], "--", c=(0.8, 0.8, 0.8, 0.5)) axes[1, 2].scatter(plot_data["iterations"], plot_data["loss"], marker='.', c=(0.2, 0.2, 0.2)) axes[1, 2].set_xlabel("iterations") plt.draw() plt.tight_layout() 
plt.pause(0.001) def my_loss_function(data, params): vfunc = VirtualFunction() vfunc.load_default(5) return vfunc(**params) blackbox = BlackboxFunction(data=[], blackbox_func=my_loss_function, callback_func=my_visualization_function) -if project.custom_use_solver == "hyperopt": - solver = HyperoptSolver(project) -elif project.custom_use_solver == "optunity": - solver = OptunitySolver(project) -elif project.custom_use_solver == "randomsearch": - solver = RandomsearchSolver(project) - -if solver is not None: - solver.blackbox = blackbox +solver = SolverPool.get(project=project) +solver.blackbox = blackbox solver.run() df, best = solver.get_results() print("\n") print("*" * 100) print("Best Parameter Set:\n{}".format(best)) print("*" * 100) print("") save_plot = input("Save Plot? [y/n] ") if save_plot == "y": plt.savefig('plot_{}.png'.format(project.custom_use_solver)) diff --git a/examples/tutorial_multisolver.py b/examples/tutorial_multisolver.py index 2cf6196..5483ccf 100644 --- a/examples/tutorial_multisolver.py +++ b/examples/tutorial_multisolver.py @@ -1,182 +1,179 @@ # In this tutorial we solve an optimization problem using the Hyperopt Solver (http://hyperopt.github.io/hyperopt/). # Hyperopt uses a Baysian - Tree Parzen Estimator - Optimization approach, which means that each iteration computes a # new function value of the blackbox, interpolates a guess for the whole energy function and predicts a point to # compute the next function value at. This next point is not necessarily a "better" value, it's only the value with # the highest uncertainty for the function interpolation. # # See a visual explanation e.g. 
here (http://philipperemy.github.io/visualization/) # import the HyppopyProject class keeping track of inputs from hyppopy.HyppopyProject import HyppopyProject -# import the HyoppopySolver classes -from hyppopy.solver.HyperoptSolver import HyperoptSolver -from hyppopy.solver.OptunitySolver import OptunitySolver -from hyppopy.solver.RandomsearchSolver import RandomsearchSolver +# import the SolverPool singleton class +from hyppopy.SolverPool import SolverPool # import the Blackboxfunction class wrapping your problem for Hyppopy from hyppopy.BlackboxFunction import BlackboxFunction # Next step is defining the problem space and all settings Hyppopy needs to optimize your problem. # The config is a simple nested dictionary with two obligatory main sections, hyperparameter and settings. # The hyperparameter section defines your searchspace. Each hyperparameter is again a dictionary with: # # - a domain ['categorical', 'uniform', 'normal', 'loguniform'] # - the domain data [left bound, right bound] and # - a type of your domain ['str', 'int', 'float'] # # The settings section has two subcategories, solver and custom. The first contains settings for the solver, # here 'max_iterations' - is the maximum number of iteration. # # The custom section allows defining custom parameter. An entry here is transformed to a member variable of the # HyppopyProject class. These can be useful when implementing new solver classes or for control your hyppopy script. # Here we use it as a solver switch to control the usage of our solver via the config. This means with the script # below your can try out every solver by changing use_solver to 'optunity', 'randomsearch', 'gridsearch',... # It can be used like so: project.custom_use_plugin (see below) If using the gridsearch solver, max_iterations is # ignored, instead each hyperparameter must specifiy a number of samples additionally to the range like so: # 'data': [0, 1, 100] which means sampling the space from 0 to 1 in 100 intervals. 
config = { "hyperparameter": { "C": { "domain": "uniform", "data": [0.0001, 20], "type": "float" }, "gamma": { "domain": "uniform", "data": [0.0001, 20.0], "type": "float" }, "kernel": { "domain": "categorical", "data": ["linear", "sigmoid", "poly", "rbf"], "type": "str" }, "decision_function_shape": { "domain": "categorical", "data": ["ovo", "ovr"], "type": "str" } }, "settings": { "solver": { "max_iterations": 300 }, "custom": { "use_solver": "hyperopt" } }} # When creating a HyppopyProject instance we # pass the config dictionary to the constructor. project = HyppopyProject(config=config) # demonstration of the custom parameter access print("-"*30) print("max_iterations:\t{}".format(project.solver_max_iterations)) print("solver chosen -> {}".format(project.custom_use_solver)) print("-"*30) # Hyppopy offers a class called BlackboxFunction to wrap your problem for Hyppopy. # The function signature is as follows: # BlackboxFunction(blackbox_func=None, # dataloader_func=None, # preprocess_func=None, # callback_func=None, # data=None, # **kwargs) # # Means we can set a couple of function pointers, a data object and an arbitrary number of custom parameter via kwargs. # # - blackbox_func: a function pointer to the actual, user defined, blackbox function that is computing our loss # - dataloader_func: a function pointer to a function handling the dataloading # - preprocess_func: a function pointer to a function automatically executed before starting the optimization process # - callback_func: a function pointer to a function that is called after each iteration with the trail object as input # - data: setting data can be done via dataloader_func or directly # - kwargs are passed to all functions above and thus can be used for parameter sharing between the functions # # (more details see in the documentation) # # Below we demonstrate the usage of all the above by defining a my_dataloader_function which in fact only grabs the # iris dataset from sklearn and returns it. 
A my_preprocess_function which also does nothing useful here but # demonstrating that a custom parameter can be set via kwargs and used in all of our functions when called within # Hyppopy. The my_callback_function gets as input the dictionary containing the state of the iteration and thus can be # used to access the current state of each solver iteration. Finally we define the actual loss_function # my_loss_function, which gets as input a data object and params. Both parameter are fixed, the first is defined by # the user depending on what is dataloader returns or the data object set in the constructor, the second is a dictionary # with a sample of your hyperparameter space which content is in the choice of the solver. from sklearn.svm import SVC from sklearn.datasets import load_iris from sklearn.model_selection import cross_val_score def my_dataloader_function(**kwargs): print("Dataloading...") # kwargs['params'] allows accessing additional parameter passed, see below my_preproc_param, my_dataloader_input. print("my loading argument: {}".format(kwargs['params']['my_dataloader_input'])) iris_data = load_iris() return [iris_data.data, iris_data.target] def my_preprocess_function(**kwargs): print("Preprocessing...") # kwargs['data'] allows accessing the input data print("data:", kwargs['data'][0].shape, kwargs['data'][1].shape) # kwargs['params'] allows accessing additional parameter passed, see below my_preproc_param, my_dataloader_input. print("kwargs['params']['my_preproc_param']={}".format(kwargs['params']['my_preproc_param']), "\n") # if the preprocessing function returns something, # the input data will be replaced with the data returned by this function. 
x = kwargs['data'][0] y = kwargs['data'][1] for i in range(x.shape[0]): x[i, :] += kwargs['params']['my_preproc_param'] return [x, y] def my_callback_function(**kwargs): print("\r{}".format(kwargs), end="") def my_loss_function(data, params): clf = SVC(**params) return -cross_val_score(estimator=clf, X=data[0], y=data[1], cv=3).mean() # We now create the BlackboxFunction object and pass all function pointers defined above, # as well as 2 dummy parameter (my_preproc_param, my_dataloader_input) for demonstration purposes. blackbox = BlackboxFunction(blackbox_func=my_loss_function, dataloader_func=my_dataloader_function, preprocess_func=my_preprocess_function, callback_func=my_callback_function, my_preproc_param=1, my_dataloader_input='could/be/a/path') -# Last step, is we use our use_solver config parameter defined in the custom section to decide which solver -# should be used, create the solver instance respectively, give it the blackbox and run it. After execution -# we can get the result via get_result() which returns a pandas dataframe containing the complete history and -# a dict best containing the best parameter set. - -if project.custom_use_solver == "hyperopt": - solver = HyperoptSolver(project) -elif project.custom_use_solver == "optunity": - solver = OptunitySolver(project) -elif project.custom_use_solver == "randomsearch": - solver = RandomsearchSolver(project) +# As the last step, we use our SolverPool, which automatically returns the correct solver. +# There are multiple ways to get the desired solver from the solver pool. +# 1. solver = SolverPool.get('hyperopt') +# 2. solver.project = project +# 3. solver = SolverPool.get('hyperopt', project) +# 4. The SolverPool looks for the field 'use_solver' in the project instance; if it is +# set, it is used to select the solver, so it is enough to pass the project. +solver = SolverPool.get(project=project) -if solver is not None: - solver.blackbox = blackbox +# Give the solver your blackbox and run it.
After execution we can get the result +# via get_results(), which returns a pandas dataframe containing the complete history and +# a dict best containing the best parameter set. +solver.blackbox = blackbox solver.run() df, best = solver.get_results() print("\n") print("*"*100) print("Best Parameter Set:\n{}".format(best)) print("*"*100) diff --git a/hyppopy/SolverPool.py b/hyppopy/SolverPool.py new file mode 100644 index 0000000..e10862c --- /dev/null +++ b/hyppopy/SolverPool.py @@ -0,0 +1,62 @@ +# DKFZ +# +# +# Copyright (c) German Cancer Research Center, +# Division of Medical and Biological Informatics. +# All rights reserved. +# +# This software is distributed WITHOUT ANY WARRANTY; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR +# A PARTICULAR PURPOSE. +# +# See LICENSE.txt or http://www.mitk.org for details. +# +# Author: Sven Wanner (s.wanner@dkfz.de) + +from .Singleton import * + +import os +import logging +from hyppopy.HyppopyProject import HyppopyProject +from hyppopy.solver.HyperoptSolver import HyperoptSolver +from hyppopy.solver.OptunitySolver import OptunitySolver +from hyppopy.solver.GridsearchSolver import GridsearchSolver +from hyppopy.solver.RandomsearchSolver import RandomsearchSolver +from hyppopy.globals import DEBUGLEVEL + +LOG = logging.getLogger(os.path.basename(__file__)) +LOG.setLevel(DEBUGLEVEL) + + +@singleton_object +class SolverPool(metaclass=Singleton): + + def __init__(self): + pass + + def get(self, solver_name=None, project=None): + if solver_name is not None: + assert isinstance(solver_name, str), "precondition violation, solver_name type str expected, got {} instead!".format(type(solver_name)) + if project is not None: + assert isinstance(project, HyppopyProject), "precondition violation, project type HyppopyProject expected, got {} instead!".format(type(project)) + if "custom_use_solver" in project.__dict__: + solver_name = project.custom_use_solver + + if solver_name == "hyperopt": + if project is not
None: + return HyperoptSolver(project) + return HyperoptSolver() + elif solver_name == "optunity": + if project is not None: + return OptunitySolver(project) + return OptunitySolver() + elif solver_name == "gridsearch": + if project is not None: + return GridsearchSolver(project) + return GridsearchSolver() + elif solver_name == "randomsearch": + if project is not None: + return RandomsearchSolver(project) + return RandomsearchSolver() + else: + raise AssertionError("Solver named [{}] not implemented!".format(solver_name))