#!/usr/bin/env python
#
# DKFZ
#
#
# Copyright (c) German Cancer Research Center,
# Division of Medical and Biological Informatics.
# All rights reserved.
#
# This software is distributed WITHOUT ANY WARRANTY; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.
#
# See LICENSE.txt or http://www.mitk.org for details.
#
# Author: Sven Wanner (s.wanner@dkfz.de)
"""Command line entry point: run one hyppopy workflow on a config file."""

import argparse
import os
import sys
import time

# Make the package importable when this file is executed from a checkout.
# This has to happen before the hyppopy imports below.
ROOT = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
sys.path.append(ROOT)

from hyppopy.projectmanager import ProjectManager
from hyppopy.workflows.svc_usecase.svc_usecase import svc_usecase
from hyppopy.workflows.knc_usecase.knc_usecase import knc_usecase
# The former lda_usecase package was renamed to adaboost_usecase (module and
# class alike); importing it through the old package path fails.
from hyppopy.workflows.adaboost_usecase.adaboost_usecase import adaboost_usecase
from hyppopy.workflows.unet_usecase.unet_usecase import unet_usecase
from hyppopy.workflows.randomforest_usecase.randomforest_usecase import randomforest_usecase
from hyppopy.workflows.imageregistration_usecase.imageregistration_usecase import imageregistration_usecase

# Maps the value of --workflow to the class implementing that workflow.
WORKFLOWS = {
    "svc_usecase": svc_usecase,
    "randomforest_usecase": randomforest_usecase,
    "knc_usecase": knc_usecase,
    "adaboost_usecase": adaboost_usecase,
    "unet_usecase": unet_usecase,
    "imageregistration_usecase": imageregistration_usecase,
}


def print_warning(msg):
    """Print *msg* under a warning banner and terminate the process.

    The process exits with status 1 so that calling scripts can detect the
    failure.
    """
    print("\n!!!!! WARNING !!!!!")
    print(msg)
    sys.exit(1)


def args_check(args):
    """Validate the parsed command line arguments.

    Terminates the process via :func:`print_warning` when the workflow or
    the config parameter is missing, or when the config file does not exist.
    """
    if not args.workflow:
        print_warning("No workflow specified, check --help")
    if not args.config:
        print_warning("Missing config parameter, check --help")
    if not os.path.isfile(args.config):
        print_warning(f"Couldn't find configfile ({args.config}), please check your input --config")


def main():
    """Parse the command line, run the requested workflow and print the result."""
    parser = argparse.ArgumentParser(description='UNet Hyppopy UseCase Example Optimization.')
    parser.add_argument('-w', '--workflow', type=str, help='workflow to be executed')
    parser.add_argument('-o', '--output', type=str, default=None, help='output path to store result')
    # Adjacent string literals are concatenated verbatim, so each fragment
    # carries its own trailing space.
    parser.add_argument('-c', '--config', type=str,
                        help='config filename, .xml or .json formats are supported. '
                             'pass a full path filename or the filename only if the '
                             'configfile is in the data folder')

    args = parser.parse_args()
    args_check(args)

    ProjectManager.read_config(args.config)
    if args.output is not None:
        ProjectManager.register_member("output_dir", args.output)

    workflow_cls = WORKFLOWS.get(args.workflow)
    if workflow_cls is None:
        print("No workflow called {} found!".format(args.workflow))
        print("Available workflows: {}".format(", ".join(sorted(WORKFLOWS))))
        sys.exit(1)
    uc = workflow_cls()

    print("\nStart optimization...")
    start = time.process_time()
    uc.run(save=True)
    end = time.process_time()

    print("Finished optimization!\n")
    print("Total Time: {}s\n".format(end - start))
    res, best = uc.get_results()
    print("---- Optimal Parameter -----\n")
    for name, value in best.items():
        print(" - {}\t:\t{}".format(name, value))


if __name__ == "__main__":
    main()
class PerformanceTest(object):
    """Benchmark hyppopy use cases on the scikit-learn breast cancer data.

    For every iteration budget in ``iter_sequence`` a use case is optimized,
    an estimator is trained with the best parameters found and its accuracy
    on a held-out split is recorded. For each classifier a CSV file and a
    pair plot are written to ``self.root``.
    """

    def __init__(self):
        self.root = os.path.join(tempfile.gettempdir(), 'test_data')
        os.makedirs(self.root, exist_ok=True)
        self.test = None    # [x_test, y_test], filled by set_up()
        self.train = None   # [x_train, y_train], filled by set_up()
        self.config = None  # hyppopy config dict, filled by set_up()
        self.iter_sequence = [5, 10, 25, 50, 100, 150, 300, 500]

    def run(self):
        """Prepare the data and execute the enabled benchmarks."""
        self.set_up()
        # NOTE: run_svc_usecase() and clean_up() were disabled in the original
        # script and stay disabled here; call them explicitly if needed.
        self.run_randomforest_usecase()
        self.run_adaboost_usecase()
        self.run_knc_usecase()

    def set_hyperparameter(self, params):
        """Install *params* as the hyperparameter section of the config."""
        self.config["hyperparameter"] = params

    def set_iterations(self, value):
        """Set the solver's ``max_iterations`` to *value*."""
        self.config["settings"]["solver_plugin"]["max_iterations"] = value

    def plot(self, df, name=""):
        """Show a seaborn pair plot of *df* titled *name* and return the grid."""
        sns_plot = sns.pairplot(df, height=1.8, aspect=1.8)
        fig = sns_plot.fig
        fig.subplots_adjust(top=0.93, wspace=0.3)
        fig.suptitle(name, fontsize=14)
        plt.show()
        return sns_plot

    def set_up(self):
        """Split the breast cancer data, store the training part and build the config."""
        breast_cancer_data = load_breast_cancer()
        x = breast_cancer_data.data
        y = breast_cancer_data.target
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=23)

        # The use cases load their data from disk; existing files are reused.
        x_train_fname = os.path.join(self.root, 'x_train.npy')
        y_train_fname = os.path.join(self.root, 'y_train.npy')
        if not os.path.isfile(x_train_fname):
            np.save(x_train_fname, x_train)
        if not os.path.isfile(y_train_fname):
            np.save(y_train_fname, y_train)

        self.train = [x_train, y_train]
        self.test = [x_test, y_test]
        self.config = {
            "hyperparameter": {},
            "settings": {
                "solver_plugin": {
                    "max_iterations": 50,
                    "use_plugin": "hyperopt",
                    "output_dir": os.path.join(self.root, 'test_results')
                },
                "custom": {
                    "data_path": self.root,
                    "data_name": "x_train.npy",
                    "labels_name": "y_train.npy"
                }
            }}

    @staticmethod
    def _estimator_kwargs(best, hp):
        """Translate the solver result *best* into estimator keyword arguments.

        For categorical hyperparameters the reported value is used as an
        index into the ``data`` list of the search space *hp*; every other
        value is passed through unchanged.
        """
        kwargs = {}
        for name, spec in hp.items():
            value = best[name]
            if spec["domain"] == "categorical":
                value = spec["data"][value]
            kwargs[name] = value
        return kwargs

    def _run_usecase(self, title, usecase_cls, classifier_cls, hp):
        """Benchmark one use case over all iteration budgets.

        :param title: banner text printed before the benchmark starts
        :param usecase_cls: hyppopy workflow class performing the optimization
        :param classifier_cls: estimator class trained with the best parameters
        :param hp: hyperparameter search space handed to the workflow
        """
        print("\n")
        print("*" * 30)
        print(title)
        print("*" * 30)
        print("\n")

        self.set_hyperparameter(hp)
        clf_name = classifier_cls.__name__

        # Column order of the CSV: iterations, the hyperparameters, accuracy.
        results = {"iterations": []}
        for name in hp:
            results[name] = []
        results["accuracy"] = []

        for n in self.iter_sequence:
            self.set_iterations(n)
            ProjectManager.set_config(self.config)
            uc = usecase_cls()
            uc.run(save=False)
            res, best = uc.get_results()

            clf = classifier_cls(**self._estimator_kwargs(best, hp))
            clf.fit(self.train[0], self.train[1])
            test_predictions = clf.predict(self.test[0])
            acc = accuracy_score(self.test[1], test_predictions)

            results['iterations'].append(n)
            for name in hp:
                # The raw solver value is stored, as reported in `best`.
                results[name].append(best[name])
            results['accuracy'].append(acc)

            print("=" * 30)
            print("Number of iterations: {}".format(n))
            print("Classifier: {}".format(clf_name))
            print("=" * 30)
            print("=" * 30)
            for key, value in best.items():
                print(key, ":", value)
            print("=" * 30)
            print("Accuracy: {:.4%}".format(acc))
            print("=" * 30)
            print("\n")

        df = pd.DataFrame.from_dict(results)
        df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf_name)))
        sns_plot = self.plot(df, name="Classifier: {}".format(clf_name))
        sns_plot.savefig(os.path.join(self.root, "final_{}.png".format(clf_name)))

    def run_svc_usecase(self):
        """Benchmark the SVC use case (parameters ``C`` and ``kernel``)."""
        hp = {
            "C": {
                "domain": "uniform",
                "data": [0.0001, 300.0],
                "type": "float"
            },
            "kernel": {
                "domain": "categorical",
                "data": ["linear", "poly", "rbf"],
                "type": "str"
            }
        }
        self._run_usecase("SVC Classifier", svc_usecase, SVC, hp)

    def run_randomforest_usecase(self):
        """Benchmark the random forest use case."""
        hp = {
            "n_estimators": {
                "domain": "uniform",
                "data": [3, 500],
                "type": "int"
            },
            "max_depth": {
                "domain": "uniform",
                "data": [1, 50],
                "type": "int"
            },
            "max_features": {
                "domain": "categorical",
                "data": ["auto", "sqrt", "log2"],
                "type": "str"
            }
        }
        self._run_usecase("RandomForest Classifier", randomforest_usecase, RandomForestClassifier, hp)

    def run_adaboost_usecase(self):
        """Benchmark the AdaBoost use case."""
        hp = {
            "n_estimators": {
                "domain": "uniform",
                "data": [1, 500],
                "type": "int"
            },
            "learning_rate": {
                "domain": "uniform",
                "data": [0.001, 10],
                "type": "float"
            }
        }
        self._run_usecase("AdaBoost Classifier", adaboost_usecase, AdaBoostClassifier, hp)

    def run_knc_usecase(self):
        """Benchmark the k-nearest-neighbours use case."""
        hp = {
            "n_neighbors": {
                "domain": "uniform",
                "data": [1, 100],
                "type": "int"
            },
            "weights": {
                "domain": "categorical",
                "data": ["uniform", "distance"],
                "type": "str"
            },
            "algorithm": {
                "domain": "categorical",
                "data": ["auto", "ball_tree", "kd_tree", "brute"],
                "type": "str"
            }
        }
        self._run_usecase("KN Classifier", knc_usecase, KNeighborsClassifier, hp)

    def clean_up(self):
        """Remove the working directory together with everything written to it."""
        if os.path.isdir(self.root):
            shutil.rmtree(self.root)
__name__ == "__main__": + performance_test = PerformanceTest() + performance_test.run() diff --git a/hyppopy/plugins/optunity_solver_plugin.py b/hyppopy/plugins/optunity_solver_plugin.py index 1039874..82f4215 100644 --- a/hyppopy/plugins/optunity_solver_plugin.py +++ b/hyppopy/plugins/optunity_solver_plugin.py @@ -1,67 +1,68 @@ # DKFZ # # # Copyright (c) German Cancer Research Center, # Division of Medical and Biological Informatics. # All rights reserved. # # This software is distributed WITHOUT ANY WARRANTY; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR # A PARTICULAR PURPOSE. # # See LICENSE.txt or http://www.mitk.org for details. # # Author: Sven Wanner (s.wanner@dkfz.de) import os import logging from hyppopy.globals import DEBUGLEVEL LOG = logging.getLogger(os.path.basename(__file__)) LOG.setLevel(DEBUGLEVEL) from pprint import pformat import optunity from yapsy.IPlugin import IPlugin from hyppopy.projectmanager import ProjectManager from hyppopy.solverpluginbase import SolverPluginBase class optunity_Solver(SolverPluginBase, IPlugin): + solver_info = None trials = None best = None status = None def __init__(self): SolverPluginBase.__init__(self) LOG.debug("initialized") def blackbox_function(self, **params): try: for key in params.keys(): if self.settings.get_type_of(key) == 'int': params[key] = int(round(params[key])) loss = self.blackbox_function_template(self.data, params) self.status.append('ok') return loss except Exception as e: LOG.error("computing loss failed due to:\n {}".format(e)) self.status.append('fail') return 1e9 def execute_solver(self, parameter): LOG.debug("execute_solver using solution space:\n\n\t{}\n".format(pformat(parameter))) self.status = [] try: self.best, self.trials, self.solver_info = optunity.minimize_structured(f=self.blackbox_function, num_evals=ProjectManager.max_iterations, search_space=parameter) except Exception as e: LOG.error("internal error in optunity.minimize_structured occured. 
class ResultViewer(object):
    """Load an optimization result CSV and visualize it with seaborn.

    :param fname: optional result file that is read immediately
    :param save_only: when True, plots are only written to disk and not shown
    """

    def __init__(self, fname=None, save_only=False):
        self.df = None               # complete result table
        self.has_duration = False    # True if the table has a "duration" column
        self.hyperparameter = None   # table restricted to hyperparameter columns
        self.save_only = save_only
        self.path = None             # directory of the result file
        self.appendix = None         # suffix reused in the plot file names
        if fname is not None:
            self.read(fname)

    def close_all(self):
        """Close all open matplotlib figures."""
        plt.close('all')

    def read(self, fname):
        """Read the result file *fname*.

        The part of the file name after the last underscore, without its
        extension, is kept as ``appendix`` and reused for plot file names.
        Every column other than ``duration`` and ``losses`` is treated as a
        hyperparameter.
        """
        self.path = os.path.dirname(fname)
        last_token = os.path.basename(fname).split("_")[-1]
        # splitext removes the real extension, whatever its length.
        self.appendix = os.path.splitext(last_token)[0]
        self.df = pd.read_csv(fname, index_col=0)
        const_data = ("duration", "losses")
        hyperparameter_columns = [item for item in self.df.columns if item not in const_data]
        self.hyperparameter = self.df[hyperparameter_columns].copy()
        self.has_duration = "duration" in self.df.columns

    def _present(self, sns_plot, prefix, save):
        """Show *sns_plot* unless in save-only mode, then optionally save it.

        The image is written to ``<path>/<prefix><appendix>.png``.

        :raises IOError: if the image could not be written
        """
        if not self.save_only:
            plt.show()
        if not save:
            return
        save_name = os.path.join(self.path, prefix + self.appendix + ".png")
        try:
            sns_plot.savefig(save_name)
        except Exception as e:
            msg = "failed to save file {}, reason {}".format(save_name, e)
            LOG.error(msg)
            raise IOError(msg) from e

    def show(self, save=True):
        """Plot duration vs. loss (if available) and the pairwise matrix view.

        :param save: also write the plots as PNG next to the result file
        :raises ValueError: if no result file has been read yet
        :raises IOError: if a plot could not be saved
        """
        if self.df is None:
            raise ValueError("no results loaded, call read() first")

        if self.has_duration:
            sns_plot = sns.jointplot(y="duration", x="losses", data=self.df, kind="kde")
            self._present(sns_plot, "t_vs_loss_", save)

        sns_plot = sns.pairplot(self.df, height=1.8, aspect=1.8,
                                plot_kws=dict(edgecolor="k", linewidth=0.5),
                                diag_kind="kde", diag_kws=dict(shade=True))
        fig = sns_plot.fig
        fig.subplots_adjust(top=0.93, wspace=0.3)
        fig.suptitle('Pairwise Plots', fontsize=14)
        self._present(sns_plot, "matrixview_", save)
a/hyppopy/tests/data/Iris/rf_config.xml +++ b/hyppopy/tests/data/Iris/rf_config.xml @@ -1,46 +1,31 @@ uniform [3,200] int categorical [gini,entropy] str uniform [3, 50] int - - uniform - [0.0001,1] - float - - - uniform - [0.0001,0.5] - float - - - categorical - [auto,sqrt,log2] - str - 3 optunity D:/Projects/Python/hyppopy/hyppopy/tests/data/Iris D:/Projects/Python/hyppopy/hyppopy/tests/data/Iris train_data.npy train_labels.npy \ No newline at end of file diff --git a/hyppopy/tests/data/Iris/svc_config.json b/hyppopy/tests/data/Iris/svc_config.json index 02c4fd4..1a19aab 100644 --- a/hyppopy/tests/data/Iris/svc_config.json +++ b/hyppopy/tests/data/Iris/svc_config.json @@ -1,34 +1,29 @@ {"hyperparameter": { "C": { "domain": "uniform", "data": "[0,20]", "type": "float" }, "gamma": { "domain": "uniform", "data": "[0.0001,20.0]", "type": "float" }, "kernel": { "domain": "categorical", - "data": "[linear, sigmoid, poly, rbf]", - "type": "str" - }, - "decision_function_shape": { - "domain": "categorical", - "data": "[ovo,ovr]", + "data": "[linear, poly, rbf]", "type": "str" } }, "settings": { "solver_plugin": { "max_iterations": "3", "use_plugin" : "optunity", "output_dir": "D:/Projects/Python/hyppopy/hyppopy/tests/data/Iris" }, "custom": { "data_path": "D:/Projects/Python/hyppopy/hyppopy/tests/data/Iris", "data_name": "train_data.npy", "labels_name": "train_labels.npy" } }} \ No newline at end of file diff --git a/hyppopy/tests/data/Iris/svc_config.xml b/hyppopy/tests/data/Iris/svc_config.xml index cc3bbca..0f018e6 100644 --- a/hyppopy/tests/data/Iris/svc_config.xml +++ b/hyppopy/tests/data/Iris/svc_config.xml @@ -1,36 +1,31 @@ uniform [0,20] float uniform [0.0001,20.0] float categorical - [linear,sigmoid,poly,rbf] + [linear,poly,rbf] str - - categorical - [ovo,ovr] - str - 3 hyperopt D:/Projects/Python/hyppopy/hyppopy/tests/data/Iris D:/Projects/Python/hyppopy/hyppopy/tests/data/Iris train_data.npy train_labels.npy \ No newline at end of file diff --git 
a/hyppopy/tests/data/Titanic/svc_config.json b/hyppopy/tests/data/Titanic/svc_config.json index 947ed37..67d9eac 100644 --- a/hyppopy/tests/data/Titanic/svc_config.json +++ b/hyppopy/tests/data/Titanic/svc_config.json @@ -1,34 +1,29 @@ {"hyperparameter": { "C": { "domain": "uniform", "data": "[0,20]", "type": "float" }, "gamma": { "domain": "uniform", "data": "[0.0001,20.0]", "type": "float" }, "kernel": { "domain": "categorical", - "data": "[linear, sigmoid, poly, rbf]", - "type": "str" - }, - "decision_function_shape": { - "domain": "categorical", - "data": "[ovo,ovr]", + "data": "[linear, poly, rbf]", "type": "str" } }, "settings": { "solver_plugin": { "max_iterations": "3", "use_plugin" : "hyperopt", "output_dir": "D:/Projects/Python/hyppopy/hyppopy/tests/data/Titanic" }, "custom": { "data_path": "D:/Projects/Python/hyppopy/hyppopy/tests/data/Titanic", "data_name": "train_cleaned.csv", "labels_name": "Survived" } }} \ No newline at end of file diff --git a/hyppopy/tests/data/Titanic/svc_config.xml b/hyppopy/tests/data/Titanic/svc_config.xml index 094fcd1..1d491a3 100644 --- a/hyppopy/tests/data/Titanic/svc_config.xml +++ b/hyppopy/tests/data/Titanic/svc_config.xml @@ -1,36 +1,31 @@ uniform [0,20] float uniform [0.0001,20.0] float categorical - [linear,sigmoid,poly,rbf] + [linear,poly,rbf] str - - categorical - [ovo,ovr] - str - 3 optunity D:/Projects/Python/hyppopy/hyppopy/tests/data/Titanic D:/Projects/Python/hyppopy/hyppopy/tests/data/Titanic train_cleaned.csv Survived \ No newline at end of file diff --git a/hyppopy/tests/test_usecases.py b/hyppopy/tests/test_usecases.py index 0a80e6b..4846a4b 100644 --- a/hyppopy/tests/test_usecases.py +++ b/hyppopy/tests/test_usecases.py @@ -1,166 +1,181 @@ # DKFZ # # # Copyright (c) German Cancer Research Center, # Division of Medical and Biological Informatics. # All rights reserved. 
import os
import shutil
import unittest
import tempfile
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from hyppopy.projectmanager import ProjectManager
from hyppopy.workflows.svc_usecase.svc_usecase import svc_usecase
from hyppopy.workflows.knc_usecase.knc_usecase import knc_usecase
# The lda_usecase package was renamed to adaboost_usecase (module and class).
from hyppopy.workflows.adaboost_usecase.adaboost_usecase import adaboost_usecase
from hyppopy.workflows.randomforest_usecase.randomforest_usecase import randomforest_usecase

DATA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


class ProjectManagerTestSuite(unittest.TestCase):
    """Run every use case once on the scikit-learn breast cancer data."""

    def setUp(self):
        """Write the training split to a temp directory and build the config."""
        breast_cancer_data = load_breast_cancer()
        x = breast_cancer_data.data
        y = breast_cancer_data.target
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=23)

        self.root = os.path.join(tempfile.gettempdir(), 'test_data')
        if not os.path.isdir(self.root):
            os.makedirs(self.root)
        x_train_fname = os.path.join(self.root, 'x_train.npy')
        y_train_fname = os.path.join(self.root, 'y_train.npy')
        np.save(x_train_fname, x_train)
        np.save(y_train_fname, y_train)

        # The training split is kept so a test can fit an estimator itself.
        self.train = [x_train, y_train]
        self.test = [x_test, y_test]
        self.config = {
            "hyperparameter": {},
            "settings": {
                "solver_plugin": {
                    "max_iterations": 50,
                    "use_plugin": "hyperopt",
                    "output_dir": os.path.join(self.root, 'test_results')
                },
                "custom": {
                    "data_path": self.root,
                    "data_name": "x_train.npy",
                    "labels_name": "y_train.npy"
                }
            }}

    def _optimize(self, usecase_cls, hyperparameter):
        """Run *usecase_cls* on *hyperparameter* and return the best parameters."""
        self.config["hyperparameter"] = hyperparameter
        ProjectManager.set_config(self.config)
        uc = usecase_cls()
        uc.run(save=True)
        res, best = uc.get_results()
        return best

    def test_svc_usecase(self):
        hyperparameter = {
            "C": {
                "domain": "uniform",
                "data": [0.0001, 300.0],
                "type": "float"
            },
            "kernel": {
                "domain": "categorical",
                "data": ["linear", "poly", "rbf"],
                "type": "str"
            }
        }

        best = self._optimize(svc_usecase, hyperparameter)
        print("=" * 30)
        print(best)
        print("=" * 30)

        # NOTE(review): the solver may report a categorical value as an index
        # into the search-space list rather than as the name itself - confirm
        # against the solver plugin. Both forms are accepted here.
        kernel = best['kernel']
        if not isinstance(kernel, str):
            kernel = hyperparameter['kernel']['data'][kernel]

        clf = SVC(C=best['C'], kernel=kernel)
        # An estimator has to be fitted before predict() can be called.
        clf.fit(self.train[0], self.train[1])
        test_predictions = clf.predict(self.test[0])
        acc = accuracy_score(self.test[1], test_predictions)
        print("Accuracy: {:.4%}".format(acc))
        print("=" * 30)

    def test_randomforest_usecase(self):
        hyperparameter = {
            "n_estimators": {
                "domain": "uniform",
                "data": [1, 500],
                "type": "int"
            },
            "criterion": {
                "domain": "categorical",
                "data": ["gini", "entropy"],
                "type": "str"
            },
            "max_depth": {
                "domain": "uniform",
                "data": [1, 50],
                "type": "int"
            },
            "max_features": {
                "domain": "categorical",
                "data": ["auto", "sqrt", "log2"],
                "type": "str"
            }
        }

        print(self._optimize(randomforest_usecase, hyperparameter))

    def test_adaboost_usecase(self):
        # Replaces the former LDA test: the workflow now wraps
        # AdaBoostClassifier, which reads n_estimators and learning_rate.
        hyperparameter = {
            "n_estimators": {
                "domain": "uniform",
                "data": [1, 500],
                "type": "int"
            },
            "learning_rate": {
                "domain": "uniform",
                "data": [0.001, 10],
                "type": "float"
            }
        }

        print(self._optimize(adaboost_usecase, hyperparameter))

    def test_knc_usecase(self):
        hyperparameter = {
            "n_neighbors": {
                "domain": "uniform",
                "data": [1, 100],
                "type": "int"
            },
            "weights": {
                "domain": "categorical",
                "data": ["uniform", "distance"],
                "type": "str"
            },
            "algorithm": {
                "domain": "categorical",
                "data": ["auto", "ball_tree", "kd_tree", "brute"],
                "type": "str"
            }
        }

        print(self._optimize(knc_usecase, hyperparameter))

    def tearDown(self):
        """Remove the temp directory so test runs do not leave data behind."""
        if os.path.isdir(self.root):
            shutil.rmtree(self.root)


if __name__ == '__main__':
    unittest.main()
class WorkflowTestSuite(unittest.TestCase):
    """Run the SVC and random forest workflows from XML and JSON configs."""

    # Hyperparameters each config is expected to optimize.
    SVC_PARAMS = ('C', 'gamma', 'kernel')
    RF_PARAMS = ('n_estimators', 'criterion', 'max_depth')

    def setUp(self):
        self.results = []

    def _check_workflow(self, data_dir, config_name, usecase_cls, expected_params):
        """Run *usecase_cls* on a config file and verify the reported parameters.

        :param data_dir: directory containing the config file
        :param config_name: file name of the config inside *data_dir*
        :param usecase_cls: workflow class to execute
        :param expected_params: names that must appear as result columns
        """
        ProjectManager.read_config(os.path.join(data_dir, config_name))
        uc = usecase_cls()
        uc.run(False)
        res, best = uc.get_results()
        for name in expected_params:
            # assertIn reports the missing name and the columns on failure.
            self.assertIn(name, res.columns)
        self.assertEqual(len(best.keys()), len(expected_params))
        # Collected so tearDown has something to print.
        self.results.append(best)

    def test_workflow_svc_on_iris_from_xml(self):
        self._check_workflow(IRIS_DATA, 'svc_config.xml', svc_usecase, self.SVC_PARAMS)

    def test_workflow_svc_on_iris_from_json(self):
        self._check_workflow(IRIS_DATA, 'svc_config.json', svc_usecase, self.SVC_PARAMS)

    def test_workflow_rf_on_iris_from_xml(self):
        self._check_workflow(IRIS_DATA, 'rf_config.xml', randomforest_usecase, self.RF_PARAMS)

    def test_workflow_rf_on_iris_from_json(self):
        self._check_workflow(IRIS_DATA, 'rf_config.json', randomforest_usecase, self.RF_PARAMS)

    # NOTE(review): the two Titanic SVC tests were commented out in the
    # original suite without a recorded reason; they are kept as explicit
    # skips so they stay visible in test reports.
    @unittest.skip("disabled in the original test suite, reason not recorded")
    def test_workflow_svc_on_titanic_from_xml(self):
        self._check_workflow(TITANIC_DATA, 'svc_config.xml', svc_usecase, self.SVC_PARAMS)

    @unittest.skip("disabled in the original test suite, reason not recorded")
    def test_workflow_svc_on_titanic_from_json(self):
        self._check_workflow(TITANIC_DATA, 'svc_config.json', svc_usecase, self.SVC_PARAMS)

    def test_workflow_rf_on_titanic_from_xml(self):
        self._check_workflow(TITANIC_DATA, 'rf_config.xml', randomforest_usecase, self.RF_PARAMS)

    def test_workflow_rf_on_titanic_from_json(self):
        self._check_workflow(TITANIC_DATA, 'rf_config.json', randomforest_usecase, self.RF_PARAMS)

    def tearDown(self):
        print("")
        for r in self.results:
            print(r)
# DKFZ
#
#
# Copyright (c) German Cancer Research Center,
# Division of Medical and Biological Informatics.
# All rights reserved.
#
# This software is distributed WITHOUT ANY WARRANTY; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.
#
# See LICENSE.txt or http://www.mitk.org for details.
#
# Author: Sven Wanner (s.wanner@dkfz.de)

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score

from hyppopy.projectmanager import ProjectManager
from hyppopy.workflows.workflowbase import WorkflowBase
from hyppopy.workflows.dataloader.simpleloader import SimpleDataLoader


class adaboost_usecase(WorkflowBase):
    """Workflow that scores an AdaBoostClassifier via 3-fold cross-validation."""

    def setup(self, **kwargs):
        """Load the data set named in the ProjectManager and hand it to the solver.

        :param kwargs: accepted for interface compatibility; not used here
        """
        dl = SimpleDataLoader()
        dl.start(path=ProjectManager.data_path,
                 data_name=ProjectManager.data_name,
                 labels_name=ProjectManager.labels_name)
        self.solver.set_data(dl.data)

    def blackbox_function(self, data, params):
        """Return the negated mean 3-fold cross-validation score.

        Missing hyperparameters fall back to the scikit-learn defaults with a
        printed warning (same policy as ``svc_usecase``) instead of raising a
        bare ``KeyError``. ``params`` itself is not modified.

        :param data: sequence whose item 0 is passed as ``X`` and item 1 as ``y``
        :param params: mapping with the keys ``n_estimators`` and ``learning_rate``
        :return: ``-mean(score)``; presumably negated because the solver
            minimizes -- confirm against WorkflowBase
        """
        if 'n_estimators' in params:
            n_estimators = params['n_estimators']
        else:
            print("Warning: missing parameter n_estimators, use default value 50!")
            n_estimators = 50
        if 'learning_rate' in params:
            learning_rate = params['learning_rate']
        else:
            print("Warning: missing parameter learning_rate, use default value 1.0!")
            learning_rate = 1.0

        clf = AdaBoostClassifier(n_estimators=n_estimators,
                                 learning_rate=learning_rate)
        return -cross_val_score(estimator=clf, X=data[0], y=data[1], cv=3).mean()
# DKFZ
#
#
# Copyright (c) German Cancer Research Center,
# Division of Medical and Biological Informatics.
# All rights reserved.
#
# This software is distributed WITHOUT ANY WARRANTY; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.
#
# See LICENSE.txt or http://www.mitk.org for details.
#
# Author: Sven Wanner (s.wanner@dkfz.de)

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

from hyppopy.projectmanager import ProjectManager
from hyppopy.workflows.workflowbase import WorkflowBase
from hyppopy.workflows.dataloader.simpleloader import SimpleDataLoader


class svc_usecase(WorkflowBase):
    """Workflow that scores a support vector classifier via 3-fold cross-validation."""

    def setup(self, **kwargs):
        """Load the data set named in the ProjectManager and hand it to the solver.

        :param kwargs: accepted for interface compatibility; not used here
        """
        dl = SimpleDataLoader()
        dl.start(path=ProjectManager.data_path,
                 data_name=ProjectManager.data_name,
                 labels_name=ProjectManager.labels_name)
        self.solver.set_data(dl.data)

    def blackbox_function(self, data, params):
        """Return the negated mean 3-fold cross-validation score of an SVC.

        Only the hyperparameters relevant for the chosen kernel are passed to
        the classifier. Missing ones are filled with a default, a warning is
        printed, and the default is written back into ``params``.

        :param data: sequence whose item 0 is passed as ``X`` and item 1 as ``y``
        :param params: mapping with ``C`` and ``kernel``; additionally ``degree``
            and ``coef0`` for ``poly``, ``gamma`` for ``rbf``
        :return: ``-mean(score)``; presumably negated because the solver
            minimizes -- confirm against WorkflowBase
        :raises IOError: if ``kernel`` is not one of ``linear``, ``poly``, ``rbf``
        """
        if 'C' not in params:
            print("Warning: missing parameter C, use default value 1.0!")
            params['C'] = 1.0
        if 'kernel' not in params:
            print("Warning: missing parameter kernel, use default value linear!")
            params['kernel'] = 'linear'

        kernel = params['kernel']
        if kernel == 'linear':
            clf = SVC(kernel='linear', C=params['C'])
        elif kernel == 'poly':
            if 'degree' not in params:
                print("Warning: missing parameter degree, use default value 3!")
                params['degree'] = 3
            if 'coef0' not in params:
                print("Warning: missing parameter coef0, use default value 0.0!")
                params['coef0'] = 0.0
            clf = SVC(kernel='poly', C=params['C'], degree=params['degree'], coef0=params['coef0'])
        elif kernel == 'rbf':
            if 'gamma' not in params:
                # The message now names the value actually assigned below.
                print("Warning: missing parameter gamma, use default value 'scale'!")
                params['gamma'] = 'scale'
            clf = SVC(kernel='rbf', C=params['C'], gamma=params['gamma'])
        else:
            # Use a {} placeholder: "%s".format(...) never inserted the kernel name.
            raise IOError("Unknown kernel function: {}".format(kernel))

        return -cross_val_score(estimator=clf, X=data[0], y=data[1], cv=3).mean()
# -*- coding: utf-8 -*-
"""Packaging script for hyppopy.

Besides calling ``setup()``, running this script rewrites the
``__version__`` line of ``hyppopy/__init__.py`` to match ``VERSION``.
"""

import os
from setuptools import setup, find_packages

VERSION = "0.1.2dev"

ROOT = os.path.dirname(os.path.realpath(__file__))

# Resolve every file against ROOT so the script does not depend on the
# current working directory.
with open(os.path.join(ROOT, 'README.rst')) as f:
    readme = f.read()

with open(os.path.join(ROOT, 'LICENSE')) as f:
    license_text = f.read()

# Stamp the package with the version defined above: any line that starts
# with "__version__" is replaced, all other lines are kept unchanged.
init_path = os.path.join(ROOT, "hyppopy", "__init__.py")
with open(init_path, "r") as infile:
    init_lines = infile.readlines()

new_init = [
    "__version__ = '" + VERSION + "'\n" if line.startswith("__version__") else line
    for line in init_lines
]

with open(init_path, "w") as outfile:
    outfile.writelines(new_init)


setup(
    name='hyppopy',
    version=VERSION,
    description='Hyper-Parameter Optimization Toolbox for Blackboxfunction Optimization',
    long_description=readme,
    # if you want, put your own name here
    # (this would likely result in people sending you emails)
    author='Sven Wanner',
    author_email='s.wanner@dkfz.de',
    url='',
    license=license_text,
    packages=find_packages(exclude=('*test*', 'doc')),
    package_data={
        'hyppopy.plugins': ['*.yapsy-plugin']
    },
    # the requirements to install this project
    # (keep in sync with requirements.txt).
    install_requires=[
        'dicttoxml>=1.7.4',
        'xmltodict>=0.11.0',
        'hyperopt>=0.1.1',
        'Optunity>=1.1.1',
        'numpy>=1.16.0',
        'matplotlib>=3.0.2',
        'scikit-learn>=0.20.2',
        'scipy>=1.2.0',
        'Sphinx>=1.8.3',
        'xmlrunner>=1.7.7',
        'Yapsy>=1.11.223',
        'pandas>=0.24.1',
        'seaborn>=0.9.0',
        'deap>=1.2.2',
        'bayesian-optimization>=1.0.1'
    ],
)