diff --git a/examples/performance_test.py b/examples/performance_test.py index 5041119..2f5c523 100644 --- a/examples/performance_test.py +++ b/examples/performance_test.py @@ -1,529 +1,583 @@ # DKFZ # # # Copyright (c) German Cancer Research Center, # Division of Medical and Biological Informatics. # All rights reserved. # # This software is distributed WITHOUT ANY WARRANTY; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR # A PARTICULAR PURPOSE. # # See LICENSE.txt or http://www.mitk.org for details. # # Author: Sven Wanner (s.wanner@dkfz.de) import os +import sys import shutil +import argparse import tempfile import numpy as np import pandas as pd import seaborn as sns from sklearn.svm import SVC import matplotlib.pyplot as plt from sklearn.metrics import accuracy_score from sklearn.ensemble import AdaBoostClassifier from sklearn.datasets import load_breast_cancer from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.ensemble import GradientBoostingClassifier from hyppopy.projectmanager import ProjectManager from hyppopy.workflows.svc_usecase.svc_usecase import svc_usecase from hyppopy.workflows.knc_usecase.knc_usecase import knc_usecase from hyppopy.workflows.adaboost_usecase.adaboost_usecase import adaboost_usecase from hyppopy.workflows.randomforest_usecase.randomforest_usecase import randomforest_usecase from hyppopy.workflows.gradientboost_usecase.gradientboost_usecase import gradientboost_usecase sns.set(style="ticks") sns.set(style="darkgrid") class PerformanceTest(object): - def __init__(self, root=None): + def __init__(self, root=None, size='small', plugin='hyperopt'): if root is None: self.root = os.path.join(tempfile.gettempdir(), 'test_data') else: self.root = os.path.join(root, 'test_data') if not os.path.isdir(self.root): os.makedirs(self.root) self.test = None self.train = None self.config = None - self.iter_sequence = [5, 10, 25, 50, 100, 150, 300, 500, 800, 1200] - self.iter_sequence = [25] - - def run(self): + self.size = size + self.plugin = plugin + if plugin == 'gridsearch': + print("Gridsearch is not available for this performance test!") + sys.exit() + if size == 'small': + self.iter_sequence = [25] + elif size == 'medium': + self.iter_sequence = [50, 150, 500] + elif size == 'big': + self.iter_sequence = [5, 10, 25, 50, 100, 150, 300, 500, 800, 1200] + + + def run(self, usecase='all'): self.set_up() - #self.run_svc_usecase() - #self.run_gradientboost_usecase() - self.run_randomforest_usecase() - #self.run_adaboost_usecase() - #self.run_knc_usecase() - #self.clean_up() + print("") + print("#" * 40) + print("# Hyppopy performance test\n# usecase={}\n# size={}\n# plugin={}".format(usecase, self.size, self.plugin)) + print("#" * 40) + # if usecase == 'svc' or usecase == 'all': + # self.run_svc_usecase() + if usecase == 'knc' or usecase == 'all': + self.run_knc_usecase() + if usecase == 'randomforest' or usecase == 'all': + self.run_randomforest_usecase() + if usecase == 'gradientboost' or usecase == 'all': + self.run_gradientboost_usecase() + if usecase == 'adaboost' or usecase == 'all': + self.run_adaboost_usecase() def set_hyperparameter(self, params): self.config["hyperparameter"] = params + def set_output_dir(self, dirname): + self.config["settings"]["solver_plugin"]["output_dir"] = os.path.join(self.root, dirname) + def set_iterations(self, value): self.config["settings"]["solver_plugin"]["max_iterations"] = value def find_loss_and_time(self, solver_output, results): min_idx = solver_output["losses"].idxmin() results["losses"].append(solver_output["losses"][min_idx]) results["duration"].append(solver_output["duration"][min_idx]) def plot_matrix(self, df, name=""): sns_plot = sns.pairplot(df, height=1.8, aspect=1.8) fig = sns_plot.fig fig.subplots_adjust(top=0.93, wspace=0.3) t = fig.suptitle(name, fontsize=14) plt.show() return sns_plot def plot(self, df, x, y, name="", save=None, show=True): fig, axs = plt.subplots(nrows=len(y), ncols=len(x), figsize=(12.0, len(y)*3)) fig.subplots_adjust(left=0.08, right=0.98, wspace=0.3) argmin = df["losses"].idxmin() for nx, _x in enumerate(x): for ny, _y in enumerate(y): ax = axs[ny, nx] ax.plot(df[_x].values, df[_y].values, 'o') ax.plot(df[_x].values[argmin], df[_y].values[argmin], 'ro') ax.grid(True) if nx == 0: ax.set_ylabel(_y) if ny == len(y)-1: ax.set_xlabel(_x) fig.suptitle(name, fontsize=16) if save is not None: if not os.path.isdir(os.path.dirname(save)): os.makedirs(os.path.dirname(save)) plt.savefig(save) if show: plt.show() def set_up(self): breast_cancer_data = load_breast_cancer() x = breast_cancer_data.data y = breast_cancer_data.target x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=23) x_train_fname = os.path.join(self.root, 'x_train.npy') y_train_fname = os.path.join(self.root, 'y_train.npy') if not os.path.isfile(x_train_fname): np.save(x_train_fname, x_train) if not os.path.isfile(y_train_fname): np.save(y_train_fname, y_train) self.train = [x_train, y_train] self.test = [x_test, y_test] self.config = { "hyperparameter": {}, "settings": { "solver_plugin": { "max_iterations": 1, - "use_plugin": "hyperopt", + "use_plugin": self.plugin, "output_dir": os.path.join(self.root, 'test_results') }, "custom": { "data_path": self.root, "data_name": "x_train.npy", "labels_name": "y_train.npy" } }} def run_svc_usecase(self): print("\n") print("*" * 30) print("SVC Classifier") print("*" * 30) print("\n") hp = { "C": { "domain": "uniform", "data": [0.0001, 300.0], "type": "float" }, "kernel": { "domain": "categorical", "data": ["linear", "poly", "rbf"], "type": "str" } } self.set_hyperparameter(hp) + self.set_output_dir("svc_usecase") results = {"iterations": [], "C": [], "kernel": [], "accuracy": [], "losses": [], "duration": []} + status = True for n in self.iter_sequence: - self.set_iterations(n) - ProjectManager.set_config(self.config) - uc = svc_usecase() - uc.run(save=False) - res, best = uc.get_results() - clf = SVC(C=best['n_estimators'], - kernel=hp['kernel']['data'][best['kernel']]) - clf.fit(self.train[0], self.train[1]) - train_predictions = clf.predict(self.test[0]) - acc = accuracy_score(self.test[1], train_predictions) - - results['accuracy'].append(acc) - results['iterations'].append(n) - results['kernel'].append(best['kernel']) - results['C'].append(best['C']) - - self.find_loss_and_time(res, results) - - print("=" * 30) - print("Number of iterations: {}".format(n)) - print("Classifier: {}".format(clf.__class__.__name__)) - print("=" * 30) - print("=" * 30) - for p in best.items(): - print(p[0], ":", p[1]) - print("=" * 30) - print("Accuracy: {:.4%}".format(acc)) - print("=" * 30) - print("\n") - - df = pd.DataFrame.from_dict(results) - df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf.__class__.__name__))) - self.plot(df, x=["iterations", "losses"], - y=['accuracy', 'duration'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final1_{}.png".format(clf.__class__.__name__)), - show=False) - self.plot(df, x=["iterations", "losses"], - y=['n_estimators', 'max_depth', 'max_features'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final2_{}.png".format(clf.__class__.__name__)), - show=False) + try: + self.set_iterations(n) + ProjectManager.set_config(self.config) + uc = svc_usecase() + uc.run(save=True) + res, best = uc.get_results() + clf = SVC(C=best['n_estimators'], + kernel=hp['kernel']['data'][best['kernel']]) + clf.fit(self.train[0], self.train[1]) + train_predictions = clf.predict(self.test[0]) + acc = accuracy_score(self.test[1], train_predictions) + + results['accuracy'].append(acc) + results['iterations'].append(n) + results['kernel'].append(best['kernel']) + results['C'].append(best['C']) + + self.find_loss_and_time(res, results) + + print("=" * 30) + print("Number of iterations: {}".format(n)) + print("Classifier: {}".format(clf.__class__.__name__)) + print("=" * 30) + print("=" * 30) + for p in best.items(): + print(p[0], ":", p[1]) + print("=" * 30) + print("Accuracy: {:.4%}".format(acc)) + print("=" * 30) + print("\n") + + except Exception as e: + print("Failed at iteration step {}, reason: {}".format(n, e)) + status = False + + if status: + df = pd.DataFrame.from_dict(results) + df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf.__class__.__name__))) + self.plot(df, x=["iterations", "losses"], + y=['accuracy', 'duration'], + name="Classifier: {}".format(clf.__class__.__name__), + save=os.path.join(self.root, "final1_{}.png".format(clf.__class__.__name__)), + show=False) + self.plot(df, x=["iterations", "losses"], + y=['n_estimators', 'max_depth', 'max_features'], + name="Classifier: {}".format(clf.__class__.__name__), + save=os.path.join(self.root, "final2_{}.png".format(clf.__class__.__name__)), + show=False) def run_randomforest_usecase(self): print("\n") print("*" * 30) print("RandomForest Classifier") print("*" * 30) print("\n") hp = { "n_estimators": { "domain": "uniform", "data": [3, 500], "type": "int" }, "max_depth": { "domain": "uniform", "data": [1, 50], "type": "int" }, "max_features": { "domain": "categorical", "data": ["auto", "sqrt", "log2"], "type": "str" } } self.set_hyperparameter(hp) + self.set_output_dir("randomforest_usecase") results = {"iterations": [], "n_estimators": [], "max_depth": [], "max_features": [], "accuracy": [], "losses": [], "duration": []} + status = True for n in self.iter_sequence: try: self.set_iterations(n) ProjectManager.set_config(self.config) uc = randomforest_usecase() - uc.run(save=False) + uc.run(save=True) res, best = uc.get_results() clf = RandomForestClassifier(n_estimators=best['n_estimators'], max_depth=best['max_depth'], - max_features=hp['max_features']['data'][best['max_features']]) + max_features=best['max_features']) clf.fit(self.train[0], self.train[1]) train_predictions = clf.predict(self.test[0]) acc = accuracy_score(self.test[1], train_predictions) results['accuracy'].append(acc) results['iterations'].append(n) results['n_estimators'].append(best['n_estimators']) results['max_depth'].append(best['max_depth']) results['max_features'].append(best['max_features']) self.find_loss_and_time(res, results) print("=" * 30) print("Number of iterations: {}".format(n)) print("Classifier: {}".format(clf.__class__.__name__)) print("=" * 30) print("=" * 30) for p in best.items(): print(p[0], ":", p[1]) print("=" * 30) print("Accuracy: {:.4%}".format(acc)) print("=" * 30) print("\n") except Exception as e: print("Failed at iteration step {}, reason: {}".format(n, e)) - - df = pd.DataFrame.from_dict(results) - df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf.__class__.__name__))) - self.plot(df, x=["iterations", "losses"], - y=['accuracy', 'duration'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final1_{}.png".format(clf.__class__.__name__)), - show=False) - self.plot(df, x=["iterations", "losses"], - y=['n_estimators', 'max_depth', 'max_features'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final2_{}.png".format(clf.__class__.__name__)), - show=False) + status = False + + if status: + df = pd.DataFrame.from_dict(results) + df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf.__class__.__name__))) + self.plot(df, x=["iterations", "losses"], + y=['accuracy', 'duration'], + name="Classifier: {}".format(clf.__class__.__name__), + save=os.path.join(self.root, "final1_{}.png".format(clf.__class__.__name__)), + show=False) + self.plot(df, x=["iterations", "losses"], + y=['n_estimators', 'max_depth', 'max_features'], + name="Classifier: {}".format(clf.__class__.__name__), + save=os.path.join(self.root, "final2_{}.png".format(clf.__class__.__name__)), + show=False) def run_adaboost_usecase(self): print("\n") print("*"*30) print("AdaBoost Classifier") print("*"*30) print("\n") hp = { "n_estimators": { "domain": "uniform", "data": [1, 500], "type": "int" }, "learning_rate": { - "domain": "uniform", - "data": [0.001, 10], + "domain": "loguniform", + "data": [-10, 2], "type": "float" } } self.set_hyperparameter(hp) + self.set_output_dir("adaboost_usecase") results = {"iterations": [], "n_estimators": [], "learning_rate": [], "accuracy": [], "losses": [], "duration": []} + status = True for n in self.iter_sequence: try: self.set_iterations(n) ProjectManager.set_config(self.config) uc = adaboost_usecase() - uc.run(save=False) + uc.run(save=True) res, best = uc.get_results() clf = AdaBoostClassifier(n_estimators=best['n_estimators'], learning_rate=best['learning_rate']) clf.fit(self.train[0], self.train[1]) train_predictions = clf.predict(self.test[0]) acc = accuracy_score(self.test[1], train_predictions) results['accuracy'].append(acc) results['iterations'].append(n) results['n_estimators'].append(best['n_estimators']) results['learning_rate'].append(best['learning_rate']) self.find_loss_and_time(res, results) print("=" * 30) print("Number of iterations: {}".format(n)) print("Classifier: {}".format(clf.__class__.__name__)) print("=" * 30) print("=" * 30) for p in best.items(): print(p[0], ":", p[1]) print("=" * 30) print("Accuracy: {:.4%}".format(acc)) print("=" * 30) print("\n") except Exception as e: print("Failed at iteration step {}, reason: {}".format(n, e)) - - df = pd.DataFrame.from_dict(results) - df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf.__class__.__name__))) - self.plot(df, x=["iterations", "losses"], - y=['accuracy', 'duration'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final1_{}.png".format(clf.__class__.__name__)), - show=False) - self.plot(df, x=["iterations", "losses"], - y=['n_estimators', 'learning_rate'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final2_{}.png".format(clf.__class__.__name__)), - show=False) + status = False + + if status: + df = pd.DataFrame.from_dict(results) + df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf.__class__.__name__))) + self.plot(df, x=["iterations", "losses"], + y=['accuracy', 'duration'], + name="Classifier: {}".format(clf.__class__.__name__), + save=os.path.join(self.root, "final1_{}.png".format(clf.__class__.__name__)), + show=False) + self.plot(df, x=["iterations", "losses"], + y=['n_estimators', 'learning_rate'], + name="Classifier: {}".format(clf.__class__.__name__), + save=os.path.join(self.root, "final2_{}.png".format(clf.__class__.__name__)), + show=False) def run_knc_usecase(self): print("\n") print("*" * 30) print("KN Classifier") print("*" * 30) print("\n") hp = { "n_neighbors": { "domain": "uniform", "data": [1, 100], "type": "int" }, "weights": { "domain": "categorical", "data": ["uniform", "distance"], "type": "str" }, "algorithm": { "domain": "categorical", "data": ["auto", "ball_tree", "kd_tree", "brute"], "type": "str" } } self.set_hyperparameter(hp) + self.set_output_dir("knc_usecase") results = {"iterations": [], "n_neighbors": [], "weights": [], "algorithm": [], "accuracy": [], "losses": [], "duration": []} + status = True for n in self.iter_sequence: try: self.set_iterations(n) ProjectManager.set_config(self.config) uc = knc_usecase() - uc.run(save=False) + uc.run(save=True) res, best = uc.get_results() clf = KNeighborsClassifier(n_neighbors=best['n_neighbors'], - weights=hp["weights"]["data"][best['weights']], - algorithm=hp["algorithm"]["data"][best['algorithm']]) + weights=best['weights'], + algorithm=best['algorithm']) clf.fit(self.train[0], self.train[1]) train_predictions = clf.predict(self.test[0]) acc = accuracy_score(self.test[1], train_predictions) results['accuracy'].append(acc) results['iterations'].append(n) results['n_neighbors'].append(best['n_neighbors']) results['weights'].append(best['weights']) results['algorithm'].append(best['algorithm']) self.find_loss_and_time(res, results) print("=" * 30) print("Number of iterations: {}".format(n)) print("Classifier: {}".format(clf.__class__.__name__)) print("=" * 30) print("=" * 30) for p in best.items(): print(p[0], ":", p[1]) print("=" * 30) print("Accuracy: {:.4%}".format(acc)) print("=" * 30) print("\n") except Exception as e: print("Failed at iteration step {}, reason: {}".format(n, e)) - - df = pd.DataFrame.from_dict(results) - df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf.__class__.__name__))) - self.plot(df, x=["iterations", "losses"], - y=['accuracy', 'duration'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final1_{}.png".format(clf.__class__.__name__)), - show=False) - self.plot(df, x=["iterations", "losses"], - y=['n_neighbors', 'weights', 'algorithm'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final2_{}.png".format(clf.__class__.__name__)), - show=False) + status = False + + if status: + df = pd.DataFrame.from_dict(results) + df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf.__class__.__name__))) + self.plot(df, x=["iterations", "losses"], + y=['accuracy', 'duration'], + name="Classifier: {}".format(clf.__class__.__name__), + save=os.path.join(self.root, "final1_{}.png".format(clf.__class__.__name__)), + show=False) + self.plot(df, x=["iterations", "losses"], + y=['n_neighbors', 'weights', 'algorithm'], + name="Classifier: {}".format(clf.__class__.__name__), + save=os.path.join(self.root, "final2_{}.png".format(clf.__class__.__name__)), + show=False) def run_gradientboost_usecase(self): print("\n") print("*" * 30) print("GradientBoost Classifier") print("*" * 30) print("\n") hp = { "n_estimators": { "domain": "uniform", "data": [3, 500], "type": "int" }, "learning_rate": { - "domain": "uniform", - "data": [0.001, 10], + "domain": "loguniform", + "data": [-10, 2], "type": "float" }, "min_samples_split": { "domain": "uniform", "data": [0.0, 1.0], "type": "float" }, "min_samples_leaf": { "domain": "uniform", "data": [0.00001, 0.5], "type": "float" }, "max_depth": { "domain": "uniform", "data": [1, 50], "type": "int" } } self.set_hyperparameter(hp) + self.set_output_dir("gradientboost_usecase") results = {"iterations": [], "n_estimators": [], "max_depth": [], "learning_rate": [], "min_samples_split": [], "min_samples_leaf": [], "accuracy": [], "losses": [], "duration": []} + status = True for n in self.iter_sequence: try: self.set_iterations(n) ProjectManager.set_config(self.config) uc = gradientboost_usecase() - uc.run(save=False) + uc.run(save=True) res, best = uc.get_results() clf = GradientBoostingClassifier(n_estimators=best['n_estimators'], max_depth=best['max_depth'], learning_rate=best['learning_rate'], min_samples_split=best['min_samples_split'], min_samples_leaf=best['min_samples_leaf']) clf.fit(self.train[0], self.train[1]) train_predictions = clf.predict(self.test[0]) acc = accuracy_score(self.test[1], train_predictions) results['accuracy'].append(acc) results['iterations'].append(n) results['n_estimators'].append(best['n_estimators']) results['max_depth'].append(best['max_depth']) results['learning_rate'].append(best['learning_rate']) results['min_samples_split'].append(best['min_samples_split']) results['min_samples_leaf'].append(best['min_samples_leaf']) self.find_loss_and_time(res, results) print("=" * 30) print("Number of iterations: {}".format(n)) print("Classifier: {}".format(clf.__class__.__name__)) print("=" * 30) print("=" * 30) for p in best.items(): print(p[0], ":", p[1]) print("=" * 30) print("Accuracy: {:.4%}".format(acc)) print("=" * 30) print("\n") except Exception as e: print("Failed at iteration step {}, reason: {}".format(n, e)) - - df = pd.DataFrame.from_dict(results) - df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf.__class__.__name__))) - self.plot(df, x=["iterations", "losses"], - y=['accuracy', 'duration'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final1_{}.png".format(clf.__class__.__name__)), - show=False) - self.plot(df, x=["iterations", "losses"], - y=['n_estimators', 'max_depth', 'learning_rate', 'min_samples_split', 'min_samples_leaf'], - name="Classifier: {}".format(clf.__class__.__name__), - save=os.path.join(self.root, "final2_{}.png".format(clf.__class__.__name__)), - show=False) + status = False + + if status: + df = pd.DataFrame.from_dict(results) + df.to_csv(os.path.join(self.root, "final_{}.csv".format(clf.__class__.__name__))) + self.plot(df, x=["iterations", "losses"], + y=['accuracy', 'duration'], + name="Classifier: {}".format(clf.__class__.__name__), + save=os.path.join(self.root, "final1_{}.png".format(clf.__class__.__name__)), + show=False) + self.plot(df, x=["iterations", "losses"], + y=['n_estimators', 'max_depth', 'learning_rate', 'min_samples_split', 'min_samples_leaf'], + name="Classifier: {}".format(clf.__class__.__name__), + save=os.path.join(self.root, "final2_{}.png".format(clf.__class__.__name__)), + show=False) def clean_up(self): if os.path.isdir(self.root): shutil.rmtree(self.root) if __name__ == "__main__": - performance_test = PerformanceTest(root="C:/Users/s635r/Desktop") - performance_test.run() + parser = argparse.ArgumentParser(description='Hyppopy performance test.') + parser.add_argument('-u', '--usecase', type=str, help='usecase to be executed', default='all') + parser.add_argument('-s', '--size', type=str, help='size of the test [small, medium, big]', default='small') + parser.add_argument('-p', '--plugin', type=str, help='plugin to be used', default='hyperopt') + parser.add_argument('-o', '--output', type=str, default=None, help='output path to store result, default is temp on you system') + args = parser.parse_args() + + performance_test = PerformanceTest(root=args.output, size=args.size, plugin=args.plugin) + performance_test.run(usecase=args.usecase) diff --git a/hyppopy/resultviewer.py b/hyppopy/resultviewer.py index c68a44d..3a98034 100644 --- a/hyppopy/resultviewer.py +++ b/hyppopy/resultviewer.py @@ -1,174 +1,175 @@ # DKFZ # # # Copyright (c) German Cancer Research Center, # Division of Medical and Biological Informatics. # All rights reserved. # # This software is distributed WITHOUT ANY WARRANTY; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR # A PARTICULAR PURPOSE. # # See LICENSE.txt or http://www.mitk.org for details. # # Author: Sven Wanner (s.wanner@dkfz.de) import os import copy import pandas as pd import seaborn as sns import matplotlib.pyplot as plt import logging from hyppopy.globals import DEBUGLEVEL LOG = logging.getLogger(os.path.basename(__file__)) LOG.setLevel(DEBUGLEVEL) sns.set(style="darkgrid") class ResultViewer(object): def __init__(self, fname=None, save_only=False): + self.close_all() self.df = None self.has_duration = False self.hyperparameter = None self.save_only = save_only self.path = None self.appendix = None if fname is not None: self.read(fname) def close_all(self): plt.close('all') def read(self, fname): self.path = os.path.dirname(fname) split = os.path.basename(fname).split("_") self.appendix = split[-1] self.appendix = self.appendix[:-4] self.df = pd.read_csv(fname, index_col=0) const_data = ["duration", "losses"] hyperparameter_columns = [item for item in self.df.columns if item not in const_data] self.hyperparameter = pd.DataFrame() for key in hyperparameter_columns: self.hyperparameter[key] = self.df[key] self.has_duration = "duration" in self.df.columns def plot_XYGrid(self, df, x, y, name="", save=None, show=True): argmin = df["losses"].idxmin() grid = [len(x), len(y)] if grid[0] == 1 and grid[1] == 1: fig = plt.figure(figsize=(10.0, 8)) plt.plot(df[x[0]].values, df[y[0]].values, '.') plt.plot(df[x[0]].values[argmin], df[y[0]].values[argmin], 'ro') plt.grid(True) plt.ylabel(y[0]) plt.xlabel(x[0]) plt.title(name, fontsize=16) else: if grid[0] > 1 and grid[1] == 1: fig, axs = plt.subplots(ncols=grid[0], figsize=(10.0, grid[1] * 3.5)) elif grid[0] == 1 and grid[1] > 1: fig, axs = plt.subplots(nrows=grid[1], figsize=(10.0, grid[1] * 3.5)) else: fig, axs = plt.subplots(nrows=grid[1], ncols=grid[0], figsize=(10.0, grid[1] * 3.5)) fig.subplots_adjust(left=0.08, right=0.98, wspace=0.3) for nx, _x in enumerate(x): for ny, _y in enumerate(y): if grid[0] > 1 and grid[1] == 1: ax = axs[nx] elif grid[0] == 1 and grid[1] > 1: ax = axs[ny] else: ax = axs[ny, nx] ax.plot(df[_x].values, df[_y].values, '.') ax.plot(df[_x].values[argmin], df[_y].values[argmin], 'ro') ax.grid(True) if nx == 0: ax.set_ylabel(_y) if ny == len(y)-1: ax.set_xlabel(_x) fig.suptitle(name, fontsize=16) if save is not None: if not os.path.isdir(os.path.dirname(save)): os.makedirs(os.path.dirname(save)) plt.savefig(save) if show: plt.show() def plot_performance_and_feature_grids(self, save=True): x_axis = [] if 'losses' in self.df.columns: x_axis.append('losses') if 'iterations' in self.df.columns: x_axis.append('iterations') y_axis_performance = [] if 'accuracy' in self.df.columns: y_axis_performance.append('accuracy') if 'duration' in self.df.columns: y_axis_performance.append('duration') features = [] for cit in self.df.columns: if cit not in x_axis and cit not in y_axis_performance: features.append(cit) save_name = None if save: save_name = os.path.join(self.path, "performance" + self.appendix + ".png") self.plot_XYGrid(self.df, x=x_axis, y=y_axis_performance, name="Performance", save=save_name, show=not self.save_only) chunks = [features[x:x + 3] for x in range(0, len(features), 3)] for n, chunk in enumerate(chunks): save_name = None if save: save_name = os.path.join(self.path, "features_{}_".format(str(n).zfill(3)) + self.appendix + ".png") self.plot_XYGrid(self.df, x=x_axis, y=chunk, name="Feature set {}".format(n+1), save=save_name, show=not self.save_only) def plot_feature_matrix(self, save=True): sns_plot = sns.pairplot(self.df, height=1.8, aspect=1.8, plot_kws=dict(edgecolor="k", linewidth=0.5), diag_kind="kde", diag_kws=dict(shade=True)) fig = sns_plot.fig fig.subplots_adjust(top=0.93, wspace=0.3) t = fig.suptitle('Pairwise Plots', fontsize=14) if not self.save_only: plt.show() if save: save_name = os.path.join(self.path, "matrixview_"+self.appendix+".png") try: sns_plot.savefig(save_name) except Exception as e: msg = "failed to save file {}, reason {}".format(save_name, e) LOG.error(msg) raise IOError(msg) def plot_duration(self, save=True): if "duration" in self.df.columns: sns_plot = sns.jointplot(y="duration", x="losses", data=self.df, kind="kde") if not self.save_only: plt.show() if save: save_name = os.path.join(self.path, "t_vs_loss_" + self.appendix + ".png") try: sns_plot.savefig(save_name) except Exception as e: msg = "failed to save file {}, reason {}".format(save_name, e) LOG.error(msg) raise IOError(msg) def show(self, save=True): self.plot_duration(save) self.plot_feature_matrix(save) self.plot_performance_and_feature_grids(save)