diff --git a/__main__.py b/__main__.py index dae4064..f1d65ba 100644 --- a/__main__.py +++ b/__main__.py @@ -1,95 +1,91 @@ #!/usr/bin/env python # # DKFZ # # # Copyright (c) German Cancer Research Center, # Division of Medical and Biological Informatics. # All rights reserved. # # This software is distributed WITHOUT ANY WARRANTY; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR # A PARTICULAR PURPOSE. # # See LICENSE.txt or http://www.mitk.org for details. # # Author: Sven Wanner (s.wanner@dkfz.de) import os import sys +import time +import argparse ROOT = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) sys.path.append(ROOT) from hyppopy.projectmanager import ProjectManager from hyppopy.workflows.svc_usecase.svc_usecase import svc_usecase from hyppopy.workflows.knc_usecase.knc_usecase import knc_usecase -from hyppopy.workflows.lda_usecase.adaboost_usecase import lda_usecase +from hyppopy.workflows.adaboost_usecase.adaboost_usecase import adaboost_usecase from hyppopy.workflows.unet_usecase.unet_usecase import unet_usecase from hyppopy.workflows.randomforest_usecase.randomforest_usecase import randomforest_usecase from hyppopy.workflows.imageregistration_usecase.imageregistration_usecase import imageregistration_usecase -import os -import sys -import time -import argparse - - def print_warning(msg): print("\n!!!!! WARNING !!!!!") print(msg) sys.exit() def args_check(args): if not args.workflow: print_warning("No workflow specified, check --help") if not args.config: print_warning("Missing config parameter, check --help") if not os.path.isfile(args.config): print_warning(f"Couldn't find configfile ({args.config}), please check your input --config") if __name__ == "__main__": parser = argparse.ArgumentParser(description='UNet Hyppopy UseCase Example Optimization.') parser.add_argument('-w', '--workflow', type=str, help='workflow to be executed') parser.add_argument('-o', '--output', type=str, default=None, help='output path to store result') parser.add_argument('-c', '--config', type=str, help='config filename, .xml or .json formats are supported.' 'pass a full path filename or the filename only if the' 'configfile is in the data folder') args = parser.parse_args() args_check(args) ProjectManager.read_config(args.config) if args.output is not None: ProjectManager.register_member("output_dir", args.output) if args.workflow == "svc_usecase": uc = svc_usecase() elif args.workflow == "randomforest_usecase": uc = randomforest_usecase() elif args.workflow == "knc_usecase": uc = knc_usecase() - elif args.workflow == "lda_usecase": - uc = lda_usecase() + elif args.workflow == "adaboost_usecase": + uc = adaboost_usecase() elif args.workflow == "unet_usecase": uc = unet_usecase() elif args.workflow == "imageregistration_usecase": uc = imageregistration_usecase() else: print("No workflow called {} found!".format(args.workflow)) sys.exit() print("\nStart optimization...") start = time.process_time() uc.run(save=True) end = time.process_time() print("Finished optimization!\n") print("Total Time: {}s\n".format(end-start)) res, best = uc.get_results() print("---- Optimal Parameter -----\n") for p in best.items(): print(" - {}\t:\t{}".format(p[0], p[1])) diff --git a/hyppopy/helpers.py b/hyppopy/helpers.py new file mode 100644 index 0000000..3fc9cf0 --- /dev/null +++ b/hyppopy/helpers.py @@ -0,0 +1,99 @@ +import copy +import itertools +from collections import OrderedDict, abc + + +class NestedDictUnfolder(object): + + def __init__(self, nested_dict): + self._nested_dict = nested_dict + self._categories = [] + self._values = OrderedDict() + self._tree_leafs = [] + + NestedDictUnfolder.nested_dict_iter(self._nested_dict, self) + + @staticmethod + def nested_dict_iter(nested, unfolder): + for key, value in nested.items(): + if isinstance(value, abc.Mapping): + unfolder.add_category(key) + NestedDictUnfolder.nested_dict_iter(value, unfolder) + else: + unfolder.add_values(key, value) + unfolder.mark_leaf() + + def find_parent_nodes(self, nested, node, last_node=""): + for key, value in nested.items(): + if key == node: + self._tree_leafs.append(last_node) + return + else: + last_node = key + if isinstance(value, abc.Mapping): + self.find_parent_nodes(value, node, last_node) + else: + return + + def find_parent_node(self, leaf_names): + if not isinstance(leaf_names, list): + leaf_names = [leaf_names] + for ln in leaf_names: + try: + pos = self._categories.index(ln) - 1 + candidate = self._categories[pos] + if candidate not in leaf_names: + return candidate + except: + pass + return None + + def add_category(self, name): + self._categories.append(name) + + def add_values(self, name, values): + self._values[name] = values + + def mark_leaf(self): + if len(self._categories) > 0: + if not self._categories[-1] in self._tree_leafs: + self._tree_leafs.append(self._categories[-1]) + + def permutate_values(self): + pset = list(self._values.values()) + pset = list(itertools.product(*pset)) + permutations = [] + okeys = list(self._values.keys()) + for ps in pset: + permutations.append({}) + for i in range(len(okeys)): + permutations[-1][okeys[i]] = ps[i] + return permutations + + def add_categories(self, values_permutated): + while True: + parent = self.find_parent_node(self._tree_leafs) + if parent is None: + return + result = [] + for tl in self._tree_leafs: + for elem in values_permutated: + new = copy.deepcopy(elem) + new[parent] = tl + result.append(new) + while tl in self._categories: + self._categories.remove(tl) + while parent in self._categories: + self._categories.remove(parent) + self._tree_leafs = [] + self.find_parent_nodes(self._nested_dict, parent) + if len(self._tree_leafs) == 1 and self._tree_leafs[0] == "": + break + values_permutated = copy.deepcopy(result) + return result + + def unfold(self): + values_permutated = self.permutate_values() + if len(self._categories) > 0: + return self.add_categories(values_permutated) + return values_permutated diff --git a/hyppopy/plugins/gridsearch_settings_plugin.py b/hyppopy/plugins/gridsearch_settings_plugin.py index 6fb7062..94e51e2 100644 --- a/hyppopy/plugins/gridsearch_settings_plugin.py +++ b/hyppopy/plugins/gridsearch_settings_plugin.py @@ -1,143 +1,146 @@ # DKFZ # # # Copyright (c) German Cancer Research Center, # Division of Medical and Biological Informatics. # All rights reserved. # # This software is distributed WITHOUT ANY WARRANTY; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR # A PARTICULAR PURPOSE. # # See LICENSE.txt or http://www.mitk.org for details. # # Author: Sven Wanner (s.wanner@dkfz.de) import os import logging import numpy as np from pprint import pformat from hyppopy.globals import DEBUGLEVEL LOG = logging.getLogger(os.path.basename(__file__)) LOG.setLevel(DEBUGLEVEL) from yapsy.IPlugin import IPlugin from hyppopy.settingspluginbase import SettingsPluginBase from hyppopy.settingsparticle import split_categorical from hyppopy.settingsparticle import SettingsParticle def gaussian(x, mu, sigma): return 1.0/(sigma * np.sqrt(2*np.pi))*np.exp(-(x-mu)**2/(2*sigma**2)) def gaussian_axis_sampling(a, b, N): center = a + (b - a) / 2.0 delta = (b - a) / N bn = b - center xn = np.arange(0, bn, delta) dn = [] for x in xn: dn.append(1/gaussian(x, 0, bn/2.5)) dn = np.array(dn) dn /= np.sum(dn) dn *= bn axis = [0] for x in dn: axis.append(x+axis[-1]) axis.insert(0, -axis[-1]) axis = np.array(axis) axis += center return axis def log_axis_sampling(a, b, N): delta = (b - a) / N logrange = np.arange(a, b + delta, delta) for n in range(logrange.shape[0]): logrange[n] = np.exp(logrange[n]) return logrange def sample(start, stop, count, ftype="uniform"): assert stop > start, "Precondition Violation, stop <= start not allowed!" assert count > 0, "Precondition Violation, N <= 0 not allowed!" if ftype == 'uniform': delta = (stop - start)/count return np.arange(start, stop + delta, delta) elif ftype == 'loguniform': return log_axis_sampling(start, stop, count) elif ftype == 'normal': return gaussian_axis_sampling(start, stop, count) raise IOError("Precondition Violation, unknown sampling function type!") class gridsearch_Settings(SettingsPluginBase, IPlugin): def __init__(self): SettingsPluginBase.__init__(self) LOG.debug("initialized") def convert_parameter(self, input_dict): LOG.debug("convert input parameter\n\n\t{}\n".format(pformat(input_dict))) solution_space = {} # split input in categorical and non-categorical data cat, uni = split_categorical(input_dict) # build up dictionary keeping all non-categorical data uniforms = {} for name, content in uni.items(): particle = gridsearch_SettingsParticle(name=name) for key, value in content.items(): if key == 'domain': particle.domain = value elif key == 'data': particle.data = value elif key == 'type': particle.dtype = value uniforms[name] = particle.get() # build nested categorical structure inner_level = uniforms for key, value in cat.items(): tmp = {} tmp2 = {} for key2, value2 in value.items(): if key2 == 'data': for elem in value2: tmp[elem] = inner_level tmp2[key] = tmp inner_level = tmp2 - solution_space = tmp2 + if len(cat) > 0: + solution_space = tmp2 + else: + solution_space = inner_level return solution_space class gridsearch_SettingsParticle(SettingsParticle): def __init__(self, name=None, domain=None, dtype=None, data=None): SettingsParticle.__init__(self, name, domain, dtype, data) def convert(self): assert isinstance(self.data, list), "Precondition Violation, invalid input type for data!" if self.domain == "categorical": return self.data else: assert len(self.data) >= 2, "Precondition Violation, invalid input data!" if len(self.data) < 3: self.data.append(10) LOG.warning("Grid sampling has set number of samples automatically to 10!") print("WARNING: Grid sampling has set number of samples automatically to 10!") samples = sample(start=self.data[0], stop=self.data[1], count=self.data[2], ftype=self.domain) if self.dtype == "int": data = [] for s in samples: val = int(np.round(s)) if len(data) > 0: if val == data[-1]: continue data.append(val) return data return list(samples) diff --git a/hyppopy/plugins/gridsearch_solver_plugin.py b/hyppopy/plugins/gridsearch_solver_plugin.py index 91dda89..66e474d 100644 --- a/hyppopy/plugins/gridsearch_solver_plugin.py +++ b/hyppopy/plugins/gridsearch_solver_plugin.py @@ -1,84 +1,140 @@ # DKFZ # # # Copyright (c) German Cancer Research Center, # Division of Medical and Biological Informatics. # All rights reserved. # # This software is distributed WITHOUT ANY WARRANTY; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR # A PARTICULAR PURPOSE. # # See LICENSE.txt or http://www.mitk.org for details. # # Author: Sven Wanner (s.wanner@dkfz.de) import os +import time import logging +from numpy import argmin, argmax, unique from hyppopy.globals import DEBUGLEVEL LOG = logging.getLogger(os.path.basename(__file__)) LOG.setLevel(DEBUGLEVEL) from pprint import pformat from yapsy.IPlugin import IPlugin -from sklearn.model_selection import GridSearchCV -from hyppopy.projectmanager import ProjectManager +from hyppopy.helpers import NestedDictUnfolder from hyppopy.solverpluginbase import SolverPluginBase +class Trials(object): + + def __init__(self): + self.loss = [] + self.duration = [] + self.status = [] + self.parameter = [] + self.best = None + self._tick = None + + def start_iteration(self): + self._tick = time.process_time() + + def stop_iteration(self): + if self._tick is None: + return + self.duration.append(time.process_time()-self._tick) + self._tick = None + + def set_status(self, status=True): + self.status.append(status) + + def set_parameter(self, params): + self.parameter.append(params) + + def set_loss(self, value): + self.loss.append(value) + + def get(self): + if len(self.loss) <= 0: + raise Exception("Empty solver results!") + if len(self.loss) != len(self.duration) or len(self.loss) != len(self.parameter) or len(self.loss) != len(self.status): + raise Exception("Inconsistent results in gridsearch solver!") + best_index = argmin(self.loss) + best = self.parameter[best_index] + worst_loss = self.loss[argmax(self.loss)] + for n in range(len(self.status)): + if not self.status[n]: + self.loss[n] = worst_loss + + res = { + 'losses': self.loss, + 'duration': self.duration + } + is_string = [] + for key, value in self.parameter[0].items(): + res[key] = [] + if isinstance(value, str): + is_string.append(key) + + for p in self.parameter: + for key, value in p.items(): + res[key].append(value) + + for key in is_string: + uniques = unique(res[key]) + lookup = {} + for n, p in enumerate(uniques): + lookup[p] = n + for n in range(len(res[key])): + res[key][n] = lookup[res[key][n]] + + return res, best + + class gridsearch_Solver(SolverPluginBase, IPlugin): trials = None best = None def __init__(self): SolverPluginBase.__init__(self) LOG.debug("initialized") def blackbox_function(self, params): - pass - # status = STATUS_FAIL - # try: - # loss = self.blackbox_function_template(self.data, params) - # if loss is not None: - # status = STATUS_OK - # except Exception as e: - # LOG.error("execution of self.loss(self.data, params) failed due to:\n {}".format(e)) - # status = STATUS_FAIL - # return {'loss': loss, 'status': status} + loss = None + self.trials.set_parameter(params) + try: + self.trials.start_iteration() + loss = self.blackbox_function_template(self.data, params) + self.trials.stop_iteration() + if loss is None: + self.trials.set_status(False) + except Exception as e: + LOG.error("execution of self.loss(self.data, params) failed due to:\n {}".format(e)) + self.trials.set_status(False) + self.trials.stop_iteration() + self.trials.set_status(True) + self.trials.set_loss(loss) + return def execute_solver(self, parameter): - pass - # LOG.debug("execute_solver using solution space:\n\n\t{}\n".format(pformat(parameter))) - # self.trials = Trials() - # - # try: - # self.best = fmin(fn=self.blackbox_function, - # space=parameter, - # algo=tpe.suggest, - # max_evals=ProjectManager.max_iterations, - # trials=self.trials) - # except Exception as e: - # msg = "internal error in hyperopt.fmin occured. {}".format(e) - # LOG.error(msg) - # raise BrokenPipeError(msg) + LOG.debug("execute_solver using solution space:\n\n\t{}\n".format(pformat(parameter))) + + self.trials = Trials() + unfolder = NestedDictUnfolder(parameter) + parameter_set = unfolder.unfold() + N = len(parameter_set) + print("") + try: + for n, params in enumerate(parameter_set): + self.blackbox_function(params) + print("\r{}% done".format(int(round(100.0/N*n))), end="") + except Exception as e: + msg = "internal error in gridsearch execute_solver occured. {}".format(e) + LOG.error(msg) + raise BrokenPipeError(msg) + print("") def convert_results(self): - pass - # currently converting results in a way that this function returns a dict - # keeping all useful parameter as key/list item. This will be automatically - # converted to a pandas dataframe in the solver class - # results = {'duration': [], 'losses': []} - # pset = self.trials.trials[0]['misc']['vals'] - # for p in pset.keys(): - # results[p] = [] - # - # for n, trial in enumerate(self.trials.trials): - # t1 = trial['book_time'] - # t2 = trial['refresh_time'] - # results['duration'].append((t2 - t1).microseconds/1000.0) - # results['losses'].append(trial['result']['loss']) - # pset = trial['misc']['vals'] - # for p in pset.items(): - # results[p[0]].append(p[1][0]) - # return results, self.best + return self.trials.get() diff --git a/hyppopy/plugins/randomsearch_settings_plugin.py b/hyppopy/plugins/randomsearch_settings_plugin.py deleted file mode 100644 index 782552b..0000000 --- a/hyppopy/plugins/randomsearch_settings_plugin.py +++ /dev/null @@ -1,100 +0,0 @@ -# DKFZ -# -# -# Copyright (c) German Cancer Research Center, -# Division of Medical and Biological Informatics. -# All rights reserved. -# -# This software is distributed WITHOUT ANY WARRANTY; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR -# A PARTICULAR PURPOSE. -# -# See LICENSE.txt or http://www.mitk.org for details. -# -# Author: Sven Wanner (s.wanner@dkfz.de) - -import os -import logging -import numpy as np -from hyppopy.globals import DEBUGLEVEL -LOG = logging.getLogger(os.path.basename(__file__)) -LOG.setLevel(DEBUGLEVEL) - -from pprint import pformat -from yapsy.IPlugin import IPlugin - - -from hyppopy.settingspluginbase import SettingsPluginBase -from hyppopy.settingsparticle import SettingsParticle - - -class randomsearch_Settings(SettingsPluginBase, IPlugin): - - def __init__(self): - SettingsPluginBase.__init__(self) - LOG.debug("initialized") - - def convert_parameter(self, input_dict): - pass - # LOG.debug("convert input parameter\n\n\t{}\n".format(pformat(input_dict))) - # - # solution_space = {} - # for name, content in input_dict.items(): - # particle = hyperopt_SettingsParticle(name=name) - # for key, value in content.items(): - # if key == 'domain': - # particle.domain = value - # elif key == 'data': - # particle.data = value - # elif key == 'type': - # particle.dtype = value - # solution_space[name] = particle.get() - # return solution_space - - -class randomsearch_SettingsParticle(SettingsParticle): - - def __init__(self, name=None, domain=None, dtype=None, data=None): - SettingsParticle.__init__(self, name, domain, dtype, data) - - def convert(self): - pass - # if self.domain == "uniform": - # if self.dtype == "float" or self.dtype == "double": - # return hp.uniform(self.name, self.data[0], self.data[1]) - # elif self.dtype == "int": - # data = list(np.arange(int(self.data[0]), int(self.data[1]+1))) - # return hp.choice(self.name, data) - # else: - # msg = "cannot convert the type {} in domain {}".format(self.dtype, self.domain) - # LOG.error(msg) - # raise LookupError(msg) - # elif self.domain == "loguniform": - # if self.dtype == "float" or self.dtype == "double": - # return hp.loguniform(self.name, self.data[0], self.data[1]) - # else: - # msg = "cannot convert the type {} in domain {}".format(self.dtype, self.domain) - # LOG.error(msg) - # raise LookupError(msg) - # elif self.domain == "normal": - # if self.dtype == "float" or self.dtype == "double": - # return hp.normal(self.name, self.data[0], self.data[1]) - # else: - # msg = "cannot convert the type {} in domain {}".format(self.dtype, self.domain) - # LOG.error(msg) - # raise LookupError(msg) - # elif self.domain == "categorical": - # if self.dtype == 'str': - # return hp.choice(self.name, self.data) - # elif self.dtype == 'bool': - # data = [] - # for elem in self.data: - # if elem == "true" or elem == "True" or elem == 1 or elem == "1": - # data .append(True) - # elif elem == "false" or elem == "False" or elem == 0 or elem == "0": - # data .append(False) - # else: - # msg = "cannot convert the type {} in domain {}, unknown bool type value".format(self.dtype, self.domain) - # LOG.error(msg) - # raise LookupError(msg) - # return hp.choice(self.name, data) diff --git a/hyppopy/plugins/randomsearch_settings_plugin.yapsy-plugin b/hyppopy/plugins/randomsearch_settings_plugin.yapsy-plugin deleted file mode 100644 index 27d25fd..0000000 --- a/hyppopy/plugins/randomsearch_settings_plugin.yapsy-plugin +++ /dev/null @@ -1,9 +0,0 @@ -[Core] -Name = randomsearch -Module = randomsearch_settings_plugin - -[Documentation] -Author = Sven Wanner -Version = 0.1 -Website = https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html -Description = RandomSearch Settings Plugin \ No newline at end of file diff --git a/hyppopy/plugins/randomsearch_solver_plugin.py b/hyppopy/plugins/randomsearch_solver_plugin.py deleted file mode 100644 index 03b3f41..0000000 --- a/hyppopy/plugins/randomsearch_solver_plugin.py +++ /dev/null @@ -1,84 +0,0 @@ -# DKFZ -# -# -# Copyright (c) German Cancer Research Center, -# Division of Medical and Biological Informatics. -# All rights reserved. -# -# This software is distributed WITHOUT ANY WARRANTY; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR -# A PARTICULAR PURPOSE. -# -# See LICENSE.txt or http://www.mitk.org for details. -# -# Author: Sven Wanner (s.wanner@dkfz.de) - -import os -import logging -from hyppopy.globals import DEBUGLEVEL -LOG = logging.getLogger(os.path.basename(__file__)) -LOG.setLevel(DEBUGLEVEL) - -from pprint import pformat -from yapsy.IPlugin import IPlugin - - -from hyppopy.projectmanager import ProjectManager -from hyppopy.solverpluginbase import SolverPluginBase - - -class randomsearch_Solver(SolverPluginBase, IPlugin): - trials = None - best = None - - def __init__(self): - SolverPluginBase.__init__(self) - LOG.debug("initialized") - - def blackbox_function(self, params): - pass - # status = STATUS_FAIL - # try: - # loss = self.blackbox_function_template(self.data, params) - # if loss is not None: - # status = STATUS_OK - # except Exception as e: - # LOG.error("execution of self.loss(self.data, params) failed due to:\n {}".format(e)) - # status = STATUS_FAIL - # return {'loss': loss, 'status': status} - - def execute_solver(self, parameter): - pass - # LOG.debug("execute_solver using solution space:\n\n\t{}\n".format(pformat(parameter))) - # self.trials = Trials() - # - # try: - # self.best = fmin(fn=self.blackbox_function, - # space=parameter, - # algo=tpe.suggest, - # max_evals=ProjectManager.max_iterations, - # trials=self.trials) - # except Exception as e: - # msg = "internal error in hyperopt.fmin occured. {}".format(e) - # LOG.error(msg) - # raise BrokenPipeError(msg) - - def convert_results(self): - pass - # currently converting results in a way that this function returns a dict - # keeping all useful parameter as key/list item. This will be automatically - # converted to a pandas dataframe in the solver class - # results = {'duration': [], 'losses': []} - # pset = self.trials.trials[0]['misc']['vals'] - # for p in pset.keys(): - # results[p] = [] - # - # for n, trial in enumerate(self.trials.trials): - # t1 = trial['book_time'] - # t2 = trial['refresh_time'] - # results['duration'].append((t2 - t1).microseconds/1000.0) - # results['losses'].append(trial['result']['loss']) - # pset = trial['misc']['vals'] - # for p in pset.items(): - # results[p[0]].append(p[1][0]) - # return results, self.best diff --git a/hyppopy/plugins/randomsearch_solver_plugin.yapsy-plugin b/hyppopy/plugins/randomsearch_solver_plugin.yapsy-plugin deleted file mode 100644 index e465d93..0000000 --- a/hyppopy/plugins/randomsearch_solver_plugin.yapsy-plugin +++ /dev/null @@ -1,9 +0,0 @@ -[Core] -Name = randomsearch -Module = randomsearch_solver_plugin - -[Documentation] -Author = Sven Wanner -Version = 0.1 -Website = https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html -Description = RandomSearch Solver Plugin \ No newline at end of file diff --git a/hyppopy/resultviewer.py b/hyppopy/resultviewer.py index d39c640..a718c52 100644 --- a/hyppopy/resultviewer.py +++ b/hyppopy/resultviewer.py @@ -1,87 +1,174 @@ # DKFZ # # # Copyright (c) German Cancer Research Center, # Division of Medical and Biological Informatics. # All rights reserved. # # This software is distributed WITHOUT ANY WARRANTY; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR # A PARTICULAR PURPOSE. # # See LICENSE.txt or http://www.mitk.org for details. # # Author: Sven Wanner (s.wanner@dkfz.de) import os +import copy import pandas as pd import seaborn as sns import matplotlib.pyplot as plt import logging from hyppopy.globals import DEBUGLEVEL LOG = logging.getLogger(os.path.basename(__file__)) LOG.setLevel(DEBUGLEVEL) sns.set(style="darkgrid") class ResultViewer(object): def __init__(self, fname=None, save_only=False): self.df = None self.has_duration = False self.hyperparameter = None self.save_only = save_only self.path = None self.appendix = None if fname is not None: self.read(fname) def close_all(self): plt.close('all') def read(self, fname): self.path = os.path.dirname(fname) split = os.path.basename(fname).split("_") self.appendix = split[-1] self.appendix = self.appendix[:-4] self.df = pd.read_csv(fname, index_col=0) const_data = ["duration", "losses"] hyperparameter_columns = [item for item in self.df.columns if item not in const_data] self.hyperparameter = pd.DataFrame() for key in hyperparameter_columns: self.hyperparameter[key] = self.df[key] self.has_duration = "duration" in self.df.columns - def show(self, save=True): - if self.has_duration: - sns_plot = sns.jointplot(y="duration", x="losses", data=self.df, kind="kde") - if not self.save_only: - plt.show() + def plot_XYGrid(self, df, x, y, name="", save=None, show=True): + argmin = df["losses"].idxmin() + grid = [len(x), len(y)] + if grid[0] == 1 and grid[1] == 1: + fig = plt.figure(figsize=(10.0, 8)) + plt.plot(df[x[0]].values, df[y[0]].values, 'o') + plt.plot(df[x[0]].values[argmin], df[y[0]].values[argmin], 'ro') + plt.grid(True) + plt.ylabel(y[0]) + plt.xlabel(x[0]) + plt.title(name, fontsize=16) + else: + if grid[0] > 1 and grid[1] == 1: + fig, axs = plt.subplots(ncols=grid[0], figsize=(10.0, grid[1] * 3.5)) + elif grid[0] == 1 and grid[1] > 1: + fig, axs = plt.subplots(nrows=grid[1], figsize=(10.0, grid[1] * 3.5)) + else: + fig, axs = plt.subplots(nrows=grid[1], ncols=grid[0], figsize=(10.0, grid[1] * 3.5)) + fig.subplots_adjust(left=0.08, right=0.98, wspace=0.3) + + for nx, _x in enumerate(x): + for ny, _y in enumerate(y): + if grid[0] > 1 and grid[1] == 1: + ax = axs[nx] + elif grid[0] == 1 and grid[1] > 1: + ax = axs[ny] + else: + ax = axs[ny, nx] + ax.plot(df[_x].values, df[_y].values, 'o') + ax.plot(df[_x].values[argmin], df[_y].values[argmin], 'ro') + ax.grid(True) + if nx == 0: + ax.set_ylabel(_y) + if ny == len(y)-1: + ax.set_xlabel(_x) + fig.suptitle(name, fontsize=16) + if save is not None: + if not os.path.isdir(os.path.dirname(save)): + os.makedirs(os.path.dirname(save)) + plt.savefig(save) + if show: + plt.show() + + def plot_performance_and_feature_grids(self, save=True): + x_axis = [] + if 'losses' in self.df.columns: + x_axis.append('losses') + if 'iterations' in self.df.columns: + x_axis.append('iterations') + y_axis_performance = [] + if 'accuracy' in self.df.columns: + y_axis_performance.append('accuracy') + if 'duration' in self.df.columns: + y_axis_performance.append('duration') + features = [] + for cit in self.df.columns: + if cit not in x_axis and cit not in y_axis_performance: + features.append(cit) + + save_name = None + if save: + save_name = os.path.join(self.path, "performance" + self.appendix + ".png") + self.plot_XYGrid(self.df, x=x_axis, + y=y_axis_performance, + name="Performance", + save=save_name, + show=not self.save_only) + + chunks = [features[x:x + 3] for x in range(0, len(features), 3)] + for n, chunk in enumerate(chunks): + save_name = None if save: - save_name = os.path.join(self.path, "t_vs_loss_"+self.appendix+".png") - try: - sns_plot.savefig(save_name) - except Exception as e: - msg = "failed to save file {}, reason {}".format(save_name, e) - LOG.error(msg) - raise IOError(msg) + save_name = os.path.join(self.path, "features_{}_".format(str(n).zfill(3)) + self.appendix + ".png") + self.plot_XYGrid(self.df, x=x_axis, + y=chunk, + name="Feature set {}".format(n+1), + save=save_name, + show=not self.save_only) + + def plot_feature_matrix(self, save=True): sns_plot = sns.pairplot(self.df, height=1.8, aspect=1.8, plot_kws=dict(edgecolor="k", linewidth=0.5), diag_kind="kde", diag_kws=dict(shade=True)) fig = sns_plot.fig fig.subplots_adjust(top=0.93, wspace=0.3) t = fig.suptitle('Pairwise Plots', fontsize=14) if not self.save_only: plt.show() if save: save_name = os.path.join(self.path, "matrixview_"+self.appendix+".png") try: sns_plot.savefig(save_name) except Exception as e: msg = "failed to save file {}, reason {}".format(save_name, e) LOG.error(msg) raise IOError(msg) + def plot_duration(self, save=True): + if "duration" in self.df.columns: + sns_plot = sns.jointplot(y="duration", x="losses", data=self.df, kind="kde") + if not self.save_only: + plt.show() + if save: + save_name = os.path.join(self.path, "t_vs_loss_" + self.appendix + ".png") + try: + sns_plot.savefig(save_name) + except Exception as e: + msg = "failed to save file {}, reason {}".format(save_name, e) + LOG.error(msg) + raise IOError(msg) + + def show(self, save=True): + self.plot_duration(save) + self.plot_feature_matrix(save) + self.plot_performance_and_feature_grids(save) + diff --git a/hyppopy/tests/data/Titanic/lda_config.xml b/hyppopy/tests/data/Iris/adaboost_config.xml similarity index 73% copy from hyppopy/tests/data/Titanic/lda_config.xml copy to hyppopy/tests/data/Iris/adaboost_config.xml index 556ff45..48b7b88 100644 --- a/hyppopy/tests/data/Titanic/lda_config.xml +++ b/hyppopy/tests/data/Iris/adaboost_config.xml @@ -1,26 +1,26 @@ - - categorical - [svd,lsqr,eigen] - str - - + uniform - [0.0,1.0] + [1, 100] + int + + + loguniform + [-10,3] float - + 3 hyperopt D:/Projects/Python/hyppopy/hyppopy/tests/data/Titanic D:/Projects/Python/hyppopy/hyppopy/tests/data/Titanic train_cleaned.csv Survived \ No newline at end of file diff --git a/hyppopy/tests/data/Iris/knc_config.xml b/hyppopy/tests/data/Iris/knc_config.xml index eb253bb..8407a90 100644 --- a/hyppopy/tests/data/Iris/knc_config.xml +++ b/hyppopy/tests/data/Iris/knc_config.xml @@ -1,36 +1,36 @@ uniform [1,50] int - + uniform [1,100] int - + categorical [uniform,distance] str - + categorical [auto,ball_tree,kd_tree,brute] str - + 3 hyperopt D:/Projects/Python/hyppopy/hyppopy/tests/data/Iris D:/Projects/Python/hyppopy/hyppopy/tests/data/Iris train_data.npy train_labels.npy \ No newline at end of file diff --git a/hyppopy/tests/data/Iris/lda_config.xml b/hyppopy/tests/data/Iris/lda_config.xml deleted file mode 100644 index b6b2fc4..0000000 --- a/hyppopy/tests/data/Iris/lda_config.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - - categorical - [svd,lsqr,eigen] - str - - - uniform - [0.0,1.0] - float - - - - - 3 - hyperopt - D:/Projects/Python/hyppopy/hyppopy/tests/data/Iris - - - D:/Projects/Python/hyppopy/hyppopy/tests/data/Iris - train_cleaned.csv - Survived - - - \ No newline at end of file diff --git a/hyppopy/tests/data/Iris/knc_config.xml b/hyppopy/tests/data/Iris/rf_grid_config.xml similarity index 57% copy from hyppopy/tests/data/Iris/knc_config.xml copy to hyppopy/tests/data/Iris/rf_grid_config.xml index eb253bb..2d5c55f 100644 --- a/hyppopy/tests/data/Iris/knc_config.xml +++ b/hyppopy/tests/data/Iris/rf_grid_config.xml @@ -1,36 +1,30 @@ - + uniform - [1,50] + [1,300,10] int - - - uniform - [1,100] - int - - - categorical - [uniform,distance] - str - - + + categorical - [auto,ball_tree,kd_tree,brute] + [gini,entropy] str - + + + uniform + [1,50,10] + int + - 3 - hyperopt + gridsearch D:/Projects/Python/hyppopy/hyppopy/tests/data/Iris D:/Projects/Python/hyppopy/hyppopy/tests/data/Iris train_data.npy train_labels.npy \ No newline at end of file diff --git a/hyppopy/tests/data/Titanic/lda_config.xml b/hyppopy/tests/data/Titanic/adaboost_config.xml similarity index 73% rename from hyppopy/tests/data/Titanic/lda_config.xml rename to hyppopy/tests/data/Titanic/adaboost_config.xml index 556ff45..48b7b88 100644 --- a/hyppopy/tests/data/Titanic/lda_config.xml +++ b/hyppopy/tests/data/Titanic/adaboost_config.xml @@ -1,26 +1,26 @@ - - categorical - [svd,lsqr,eigen] - str - - + uniform - [0.0,1.0] + [1, 100] + int + + + loguniform + [-10,3] float - + 3 hyperopt D:/Projects/Python/hyppopy/hyppopy/tests/data/Titanic D:/Projects/Python/hyppopy/hyppopy/tests/data/Titanic train_cleaned.csv Survived \ No newline at end of file diff --git a/hyppopy/tests/data/Titanic/knc_config.xml b/hyppopy/tests/data/Titanic/knc_config.xml index 641beb6..91c3d59 100644 --- a/hyppopy/tests/data/Titanic/knc_config.xml +++ b/hyppopy/tests/data/Titanic/knc_config.xml @@ -1,36 +1,36 @@ uniform [1,50] int uniform [1,100] int categorical [uniform,distance] str - + categorical [auto,ball_tree,kd_tree,brute] str - + 3 hyperopt D:/Projects/Python/hyppopy/hyppopy/tests/data/Titanic D:/Projects/Python/hyppopy/hyppopy/tests/data/Titanic train_cleaned.csv Survived \ No newline at end of file diff --git a/hyppopy/tests/test_helpers.py b/hyppopy/tests/test_helpers.py new file mode 100644 index 0000000..e1071b3 --- /dev/null +++ b/hyppopy/tests/test_helpers.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- +# +# DKFZ +# +# +# Copyright (c) German Cancer Research Center, +# Division of Medical and Biological Informatics. +# All rights reserved. +# +# This software is distributed WITHOUT ANY WARRANTY; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR +# A PARTICULAR PURPOSE. +# +# See LICENSE.txt or http://www.mitk.org for details. +# +# Author: Sven Wanner (s.wanner@dkfz.de) + +import unittest + +from hyppopy.helpers import NestedDictUnfolder + + +class SolverFactoryTestSuite(unittest.TestCase): + + def setUp(self): + self.p1 = {"uni1": [1, 2], "uni2": [11, 12]} + self.p2 = {"cat": {"a": {"uni1": [1, 2], "uni2": [11, 12]}, "b": {"uni1": [1, 2], "uni2": [11, 12]}}} + self.p3 = {"cat1": { + "a1": {"cat2": {"a2": {"uni1": [1, 2], "uni2": [11, 12]}, "b2": {"uni1": [1, 2], "uni2": [11, 12]}}}, + "b1": {"cat2": {"a2": {"uni1": [1, 2], "uni2": [11, 12]}, "b2": {"uni1": [1, 2], "uni2": [11, 12]}}}}} + + self.output_p3 = [{'cat1': 'a1', 'cat2': 'a2', 'uni1': 1, 'uni2': 11}, + {'cat1': 'a1', 'cat2': 'a2', 'uni1': 1, 'uni2': 12}, + {'cat1': 'a1', 'cat2': 'a2', 'uni1': 2, 'uni2': 11}, + {'cat1': 'a1', 'cat2': 'a2', 'uni1': 2, 'uni2': 12}, + {'cat1': 'a1', 'cat2': 'b2', 'uni1': 1, 'uni2': 11}, + {'cat1': 'a1', 'cat2': 'b2', 'uni1': 1, 'uni2': 12}, + {'cat1': 'a1', 'cat2': 'b2', 'uni1': 2, 'uni2': 11}, + {'cat1': 'a1', 'cat2': 'b2', 'uni1': 2, 'uni2': 12}, + {'cat1': 'b1', 'cat2': 'a2', 'uni1': 1, 'uni2': 11}, + {'cat1': 'b1', 'cat2': 'a2', 'uni1': 1, 'uni2': 12}, + {'cat1': 'b1', 'cat2': 'a2', 'uni1': 2, 'uni2': 11}, + {'cat1': 'b1', 'cat2': 'a2', 'uni1': 2, 'uni2': 12}, + {'cat1': 'b1', 'cat2': 'b2', 'uni1': 1, 'uni2': 11}, + {'cat1': 'b1', 'cat2': 'b2', 'uni1': 1, 'uni2': 12}, + {'cat1': 'b1', 'cat2': 'b2', 'uni1': 2, 'uni2': 11}, + {'cat1': 'b1', 'cat2': 'b2', 'uni1': 2, 'uni2': 12}] + + self.output_p2 = [{'cat': 'a', 'uni1': 1, 'uni2': 11}, + {'cat': 'a', 'uni1': 1, 'uni2': 12}, + {'cat': 'a', 'uni1': 2, 'uni2': 11}, + {'cat': 'a', 'uni1': 2, 'uni2': 12}, + {'cat': 'b', 'uni1': 1, 'uni2': 11}, + {'cat': 'b', 'uni1': 1, 'uni2': 12}, + {'cat': 'b', 'uni1': 2, 'uni2': 11}, + {'cat': 'b', 'uni1': 2, 'uni2': 12}] + + self.output_p1 = [{'uni1': 1, 'uni2': 11}, + {'uni1': 1, 'uni2': 12}, + {'uni1': 2, 'uni2': 11}, + {'uni1': 2, 'uni2': 12}] + + def test_nested_dict_unfolder_p1(self): + unfolder = NestedDictUnfolder(self.p1) + unfolded = unfolder.unfold() + + for it1, it2 in zip(unfolded, self.output_p1): + self.assertEqual(it1, it2) + + def test_nested_dict_unfolder_p2(self): + unfolder = NestedDictUnfolder(self.p2) + unfolded = unfolder.unfold() + + for it1, it2 in zip(unfolded, self.output_p2): + self.assertEqual(it1, it2) + + def test_nested_dict_unfolder_p3(self): + unfolder = NestedDictUnfolder(self.p3) + unfolded = unfolder.unfold() + for it1, it2 in zip(unfolded, self.output_p3): + self.assertEqual(it1, it2) + + + +if __name__ == '__main__': + unittest.main() + diff --git a/hyppopy/tests/test_settings_plugins.py b/hyppopy/tests/test_settings_plugins.py index a4f2ba4..984c7c5 100644 --- a/hyppopy/tests/test_settings_plugins.py +++ b/hyppopy/tests/test_settings_plugins.py @@ -1,110 +1,129 @@ # DKFZ # # # Copyright (c) German Cancer Research Center, # Division of Medical and Biological Informatics. # All rights reserved. # # This software is distributed WITHOUT ANY WARRANTY; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR # A PARTICULAR PURPOSE. # # See LICENSE.txt or http://www.mitk.org for details. # # Author: Sven Wanner (s.wanner@dkfz.de) -import os import unittest from hyppopy.plugins.gridsearch_settings_plugin import gridsearch_SettingsParticle from hyppopy.plugins.gridsearch_settings_plugin import gridsearch_Settings + class ProjectManagerTestSuite(unittest.TestCase): def setUp(self): self.hp = { 'UniformFloat': { 'domain': 'uniform', 'data': [0, 1, 10], 'type': 'float', }, 'UniformInt': { 'domain': 'uniform', 'data': [0, 7, 10], 'type': 'int', }, 'NormalFloat': { 'domain': 'normal', 'data': [0, 1, 10], 'type': 'float', }, 'NormalInt': { 'domain': 'normal', 'data': [0, 10, 10], 'type': 'int', }, 'LogFloat': { 'domain': 'loguniform', 'data': [-5, 5, 10], 'type': 'float', }, 'LogFloat': { 'domain': 'loguniform', 'data': [-5, 5, 10], 'type': 'float', }, 'LogInt': { 'domain': 'loguniform', 'data': [0, 6, 10], 'type': 'int', }, 'CategoricalStr': { 'domain': 'categorical', 'data': ['a', 'b'], 'type': 'str', }, 'CategoricalInt': { 'domain': 'categorical', 'data': [0, 1], 'type': 'int', } } self.truth = { 'UniformFloat': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 'UniformInt': [0, 1, 2, 3, 4, 5, 6, 7, 8], 'NormalFloat': [0.0, 0.2592443381276233, 0.3673134565097225, 0.4251586871937128, 0.4649150940720099, 0.5, 0.5350849059279901, 0.5748413128062873, 0.6326865434902775, 0.7407556618723767, 1.0], 'NormalInt': [0, 3, 4, 5, 6, 7, 10], 'LogFloat': [0.006737946999085467, 0.01831563888873418, 0.049787068367863944, 0.1353352832366127, 0.36787944117144233, 1.0, 2.718281828459045, 7.38905609893065, 20.085536923187668, 54.598150033144236, 148.4131591025766], 'LogInt': [1, 2, 3, 6, 11, 20, 37, 67, 122, 221, 403], 'CategoricalStr': ['a', 'b'], 'CategoricalInt': [0, 1] } - def test_gridsearch_settings(self): gss = gridsearch_Settings() gss.set_hyperparameter(self.hp) res = gss.get_hyperparameter() - # TODO check... + self.assertTrue('CategoricalInt' in res.keys()) + self.assertTrue(len(res) == 1) + self.assertTrue(0 in res['CategoricalInt'].keys()) + self.assertTrue(1 in res['CategoricalInt'].keys()) + self.assertTrue(len(res['CategoricalInt']) == 2) + self.assertTrue('a' in res['CategoricalInt'][0]['CategoricalStr'].keys()) + self.assertTrue('b' in res['CategoricalInt'][0]['CategoricalStr'].keys()) + self.assertTrue(len(res['CategoricalInt'][0]['CategoricalStr']) == 2) + self.assertTrue('a' in res['CategoricalInt'][1]['CategoricalStr'].keys()) + self.assertTrue('b' in res['CategoricalInt'][1]['CategoricalStr'].keys()) + self.assertTrue(len(res['CategoricalInt'][1]['CategoricalStr']) == 2) + def check_truth(input_dict): + for key, value in self.truth.items(): + if not key.startswith('Categorical'): + self.assertTrue(key in input_dict.keys()) + for n, v in enumerate(self.truth[key]): + self.assertAlmostEqual(v, input_dict[key][n]) + + check_truth(res['CategoricalInt'][0]['CategoricalStr']['a']) + check_truth(res['CategoricalInt'][1]['CategoricalStr']['a']) + check_truth(res['CategoricalInt'][0]['CategoricalStr']['b']) + check_truth(res['CategoricalInt'][1]['CategoricalStr']['b']) def test_gridsearch_particle(self): for name, data in self.hp.items(): gsp = gridsearch_SettingsParticle(name=name, domain=data['domain'], - dtype=data['dtype'], + dtype=data['type'], data=data['data']) data = gsp.get() for n in range(len(self.truth[name])): self.assertAlmostEqual(data[n], self.truth[name][n]) - def tearDown(self): pass if __name__ == '__main__': unittest.main() diff --git a/hyppopy/tests/test_usecases.py b/hyppopy/tests/test_usecases.py index 4846a4b..bdc7da2 100644 --- a/hyppopy/tests/test_usecases.py +++ b/hyppopy/tests/test_usecases.py @@ -1,181 +1,216 @@ # DKFZ # # # Copyright (c) German Cancer Research Center, # Division of Medical and Biological Informatics. # All rights reserved. # # This software is distributed WITHOUT ANY WARRANTY; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR # A PARTICULAR PURPOSE. # # See LICENSE.txt or http://www.mitk.org for details. # # Author: Sven Wanner (s.wanner@dkfz.de) import os import shutil import unittest import tempfile import numpy as np from sklearn.svm import SVC from sklearn.metrics import accuracy_score from sklearn.datasets import load_breast_cancer +from sklearn.ensemble import AdaBoostClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from hyppopy.projectmanager import ProjectManager from hyppopy.workflows.svc_usecase.svc_usecase import svc_usecase from hyppopy.workflows.knc_usecase.knc_usecase import knc_usecase -from hyppopy.workflows.lda_usecase.adaboost_usecase import lda_usecase +from hyppopy.workflows.adaboost_usecase.adaboost_usecase import adaboost_usecase from hyppopy.workflows.randomforest_usecase.randomforest_usecase import randomforest_usecase DATA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") class ProjectManagerTestSuite(unittest.TestCase): def setUp(self): breast_cancer_data = load_breast_cancer() x = breast_cancer_data.data y = breast_cancer_data.target x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=23) self.root = os.path.join(tempfile.gettempdir(), 'test_data') - if not os.path.isdir(self.root): - os.makedirs(self.root) + if os.path.isdir(self.root): + shutil.rmtree(self.root) + os.makedirs(self.root) + x_train_fname = os.path.join(self.root, 'x_train.npy') y_train_fname = os.path.join(self.root, 'y_train.npy') np.save(x_train_fname, x_train) np.save(y_train_fname, y_train) + self.train = [x_train, y_train] self.test = [x_test, y_test] self.config = { "hyperparameter": {}, "settings": { "solver_plugin": { - "max_iterations": 50, + "max_iterations": 3, "use_plugin": "hyperopt", "output_dir": os.path.join(self.root, 'test_results') }, "custom": { "data_path": self.root, "data_name": "x_train.npy", "labels_name": "y_train.npy" } }} - def test_svc_usecase(self): - hyperparameter = { - "C": { - "domain": "uniform", - "data": [0.0001, 300.0], - "type": "float" - }, - "kernel": { - "domain": "categorical", - "data": ["linear", "poly", "rbf"], - "type": "str" - } - } - - self.config["hyperparameter"] = hyperparameter - ProjectManager.set_config(self.config) - uc = svc_usecase() - uc.run(save=True) - res, best = uc.get_results() - print("="*30) - print(best) - print("=" * 30) - clf = SVC(**best) - train_predictions = clf.predict(self.test[0]) - acc = accuracy_score(self.test[1], train_predictions) - print("Accuracy: {:.4%}".format(acc)) - print("=" * 30) + # def test_svc_usecase(self): + # hyperparameter = { + # "C": { + # "domain": "uniform", + # "data": [0.0001, 300.0], + # "type": "float" + # }, + # "kernel": { + # "domain": "categorical", + # "data": ["linear", "poly", "rbf"], + # "type": "str" + # } + # } + # + # self.config["hyperparameter"] = hyperparameter + # ProjectManager.set_config(self.config) + # uc = svc_usecase() + # uc.run(save=True) + # res, best = uc.get_results() + # print("="*30) + # print(best) + # print("=" * 30) + # clf = SVC(C=best['C'], kernel=hyperparameter['kernel']['data'][best['kernel']]) + # clf.fit(self.train[0], self.train[1]) + # train_predictions = clf.predict(self.test[0]) + # acc = accuracy_score(self.test[1], train_predictions) + # print("Accuracy: {:.4%}".format(acc)) + # print("=" * 30) def test_randomforest_usecase(self): hyperparameter = { "n_estimators": { "domain": "uniform", "data": [1, 500], "type": "int" }, "criterion": { "domain": "categorical", "data": ["gini", "entropy"], "type": "str" }, "max_depth": { "domain": "uniform", "data": [1, 50], "type": "int" }, "max_features": { "domain": "categorical", "data": ["auto", "sqrt", "log2"], "type": "str" } } self.config["hyperparameter"] = hyperparameter ProjectManager.set_config(self.config) uc = randomforest_usecase() - uc.run(save=True) + uc.run(save=False) res, best = uc.get_results() + print("=" * 30) print(best) + print("=" * 30) + clf = RandomForestClassifier(n_estimators=best['n_estimators'], + criterion=hyperparameter['criterion']['data'][best['criterion']], + max_depth=best['max_depth'], + max_features=best['max_features']) + clf.fit(self.train[0], self.train[1]) + print("feature importance:\n", clf.feature_importances_) + train_predictions = clf.predict(self.test[0]) + acc = accuracy_score(self.test[1], train_predictions) + print("Accuracy: {:.4%}".format(acc)) + print("=" * 30) - def test_lda_usecase(self): + def test_adaboost_usecase(self): hyperparameter = { - "solver": { - "domain": "categorical", - "data": ["svd", "lsqr", "eigen"], - "type": "str" - }, - "tol": { + "n_estimators": { "domain": "uniform", - "data": [0.00000001, 1.0], + "data": [1, 300], + "type": "int" + }, + "learning_rate": { + "domain": "loguniform", + "data": [-10, 3], "type": "float" } } self.config["hyperparameter"] = hyperparameter ProjectManager.set_config(self.config) - uc = lda_usecase() + uc = adaboost_usecase() uc.run(save=True) res, best = uc.get_results() + print("=" * 30) print(best) + print("=" * 30) + clf = AdaBoostClassifier(n_estimators=best['n_estimators'], learning_rate=best['learning_rate']) + clf.fit(self.train[0], self.train[1]) + train_predictions = clf.predict(self.test[0]) + acc = accuracy_score(self.test[1], train_predictions) + print("Accuracy: {:.4%}".format(acc)) + print("=" * 30) def test_knc_usecase(self): hyperparameter = { "n_neighbors": { "domain": "uniform", "data": [1, 100], "type": "int" }, "weights": { "domain": "categorical", "data": ["uniform", "distance"], "type": "str" }, "algorithm": { "domain": "categorical", "data": ["auto", "ball_tree", "kd_tree", "brute"], "type": "str" } } self.config["hyperparameter"] = hyperparameter ProjectManager.set_config(self.config) uc = knc_usecase() uc.run(save=True) res, best = uc.get_results() + print("=" * 30) print(best) + print("=" * 30) + clf = KNeighborsClassifier(n_neighbors=best['n_neighbors'], + weights=hyperparameter['weights']['data'][best['weights']], + algorithm=hyperparameter['algorithm']['data'][best['algorithm']]) + clf.fit(self.train[0], self.train[1]) + train_predictions = clf.predict(self.test[0]) + acc = accuracy_score(self.test[1], train_predictions) + print("Accuracy: {:.4%}".format(acc)) + print("=" * 30) def tearDown(self): pass - # if os.path.isdir(self.root): - # shutil.rmtree(self.root) if __name__ == '__main__': unittest.main() diff --git a/hyppopy/tests/test_workflows.py b/hyppopy/tests/test_workflows.py index f8783d6..2866495 100644 --- a/hyppopy/tests/test_workflows.py +++ b/hyppopy/tests/test_workflows.py @@ -1,120 +1,151 @@ # DKFZ # # # Copyright (c) German Cancer Research Center, # Division of Medical and Biological Informatics. # All rights reserved. # # This software is distributed WITHOUT ANY WARRANTY; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR # A PARTICULAR PURPOSE. # # See LICENSE.txt or http://www.mitk.org for details. # # Author: Sven Wanner (s.wanner@dkfz.de) import os import unittest from hyppopy.globals import TESTDATA_DIR IRIS_DATA = os.path.join(TESTDATA_DIR, 'Iris') TITANIC_DATA = os.path.join(TESTDATA_DIR, 'Titanic') from hyppopy.projectmanager import ProjectManager from hyppopy.workflows.svc_usecase.svc_usecase import svc_usecase +from hyppopy.workflows.knc_usecase.knc_usecase import knc_usecase +from hyppopy.workflows.adaboost_usecase.adaboost_usecase import adaboost_usecase from hyppopy.workflows.randomforest_usecase.randomforest_usecase import randomforest_usecase class WorkflowTestSuite(unittest.TestCase): def setUp(self): self.results = [] def test_workflow_svc_on_iris_from_xml(self): ProjectManager.read_config(os.path.join(IRIS_DATA, 'svc_config.xml')) uc = svc_usecase() uc.run(False) res, best = uc.get_results() self.assertTrue('C' in res.columns) self.assertTrue('gamma' in res.columns) self.assertTrue('kernel' in res.columns) self.assertEqual(len(best.keys()), 3) def test_workflow_svc_on_iris_from_json(self): ProjectManager.read_config(os.path.join(IRIS_DATA, 'svc_config.json')) uc = svc_usecase() uc.run(False) res, best = uc.get_results() self.assertTrue('C' in res.columns) self.assertTrue('gamma' in res.columns) self.assertTrue('kernel' in res.columns) self.assertEqual(len(best.keys()), 3) def test_workflow_rf_on_iris_from_xml(self): ProjectManager.read_config(os.path.join(IRIS_DATA, 'rf_config.xml')) uc = randomforest_usecase() uc.run(False) res, best = uc.get_results() self.assertTrue('n_estimators' in res.columns) self.assertTrue('criterion' in res.columns) self.assertTrue('max_depth' in res.columns) self.assertEqual(len(best.keys()), 3) def test_workflow_rf_on_iris_from_json(self): ProjectManager.read_config(os.path.join(IRIS_DATA, 'rf_config.json')) uc = randomforest_usecase() uc.run(False) res, best = uc.get_results() self.assertTrue('n_estimators' in res.columns) self.assertTrue('criterion' in res.columns) self.assertTrue('max_depth' in res.columns) self.assertEqual(len(best.keys()), 3) + def test_workflow_rf_on_iris_from_grid_xml(self): + ProjectManager.read_config(os.path.join(IRIS_DATA, 'rf_grid_config.xml')) + uc = randomforest_usecase() + uc.run(False) + res, best = uc.get_results() + self.assertTrue('n_estimators' in res.columns) + self.assertTrue('criterion' in res.columns) + self.assertTrue('max_depth' in res.columns) + self.assertEqual(len(best.keys()), 3) + # def test_workflow_svc_on_titanic_from_xml(self): # ProjectManager.read_config(os.path.join(TITANIC_DATA, 'svc_config.xml')) # uc = svc_usecase() # uc.run(False) # res, best = uc.get_results() # self.assertTrue('C' in res.columns) # self.assertTrue('gamma' in res.columns) # self.assertTrue('kernel' in res.columns) # self.assertEqual(len(best.keys()), 3) # # def test_workflow_svc_on_titanic_from_json(self): # ProjectManager.read_config(os.path.join(TITANIC_DATA, 'svc_config.json')) # uc = svc_usecase() # uc.run(False) # res, best = uc.get_results() # self.assertTrue('C' in res.columns) # self.assertTrue('gamma' in res.columns) # self.assertTrue('kernel' in res.columns) # self.assertEqual(len(best.keys()), 3) def test_workflow_rf_on_titanic_from_xml(self): ProjectManager.read_config(os.path.join(TITANIC_DATA, 'rf_config.xml')) uc = randomforest_usecase() uc.run(False) res, best = uc.get_results() self.assertTrue('n_estimators' in res.columns) self.assertTrue('criterion' in res.columns) self.assertTrue('max_depth' in res.columns) self.assertEqual(len(best.keys()), 3) def test_workflow_rf_on_titanic_from_json(self): ProjectManager.read_config(os.path.join(TITANIC_DATA, 'rf_config.json')) uc = randomforest_usecase() uc.run(False) res, best = uc.get_results() self.assertTrue('n_estimators' in res.columns) self.assertTrue('criterion' in res.columns) self.assertTrue('max_depth' in res.columns) self.assertEqual(len(best.keys()), 3) + def test_workflow_adaboost_on_titanic_from_xml(self): + ProjectManager.read_config(os.path.join(TITANIC_DATA, 'adaboost_config.xml')) + uc = adaboost_usecase() + uc.run(False) + res, best = uc.get_results() + self.assertTrue('n_estimators' in res.columns) + self.assertTrue('learning_rate' in res.columns) + self.assertEqual(len(best.keys()), 2) + + def test_workflow_knc_on_titanic_from_xml(self): + ProjectManager.read_config(os.path.join(TITANIC_DATA, 'knc_config.xml')) + uc = knc_usecase() + uc.run(False) + res, best = uc.get_results() + self.assertTrue('n_neighbors' in res.columns) + self.assertTrue('leaf_size' in res.columns) + self.assertTrue('weights' in res.columns) + self.assertEqual(len(best.keys()), 4) + def tearDown(self): print("") for r in self.results: print(r) if __name__ == '__main__': unittest.main() diff --git a/setup.py b/setup.py index 36f0629..885c37d 100644 --- a/setup.py +++ b/setup.py @@ -1,63 +1,63 @@ # -*- coding: utf-8 -*- import os from setuptools import setup, find_packages with open('README.rst') as f: readme = f.read() with open('LICENSE') as f: license = f.read() -VERSION = "0.1.2dev" +VERSION = "0.2.0" ROOT = os.path.dirname(os.path.realpath(__file__)) new_init = [] with open(os.path.join(ROOT, *("hyppopy", "__init__.py")), "r") as infile: for line in infile: new_init.append(line) for n in range(len(new_init)): if new_init[n].startswith("__version__"): split = line.split("=") new_init[n] = "__version__ = '" + VERSION + "'\n" with open(os.path.join(ROOT, *("hyppopy", "__init__.py")), "w") as outfile: outfile.writelines(new_init) setup( name='hyppopy', version=VERSION, description='Hyper-Parameter Optimization Toolbox for Blackboxfunction Optimization', long_description=readme, # if you want, put your own name here # (this would likely result in people sending you emails) author='Sven Wanner', author_email='s.wanner@dkfz.de', url='', license=license, packages=find_packages(exclude=('*test*', 'doc')), package_data={ 'hyppopy.plugins': ['*.yapsy-plugin'] }, # the requirements to install this project. # Since this one is so simple this is empty. install_requires=[ 'dicttoxml>=1.7.4', 'xmltodict>=0.11.0', 'hyperopt>=0.1.1', 'Optunity>=1.1.1', 'numpy>=1.16.0', 'matplotlib>=3.0.2', 'scikit-learn>=0.20.2', 'scipy>=1.2.0', 'Sphinx>=1.8.3', 'xmlrunner>=1.7.7', 'Yapsy>=1.11.223', 'pandas>=0.24.1', 'seaborn>=0.9.0', 'deap>=1.2.2', 'bayesian-optimization>=1.0.1' ], )