diff --git a/default_configs.py b/default_configs.py
index 50b3a04..4d90b82 100644
--- a/default_configs.py
+++ b/default_configs.py
@@ -1,144 +1,143 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 """Default Configurations script. Avoids changing configs of all experiments if general settings are to be changed."""
 
 import os
 
 class DefaultConfigs:
 
     def __init__(self, model, server_env=None, dim=2):
         self.server_env = server_env
         #########################
         #         I/O           #
         #########################
 
         self.model = model
         self.dim = dim
         # int [0 < dataset_size]. select n patients from dataset for prototyping.
         self.select_prototype_subset = None
 
         # some default paths.
         self.backbone_path = 'models/backbone.py'
         self.source_dir = os.path.dirname(os.path.realpath(__file__)) #current dir.
         self.input_df_name = 'info_df.pickle'
         self.model_path = 'models/{}.py'.format(self.model)
 
         if server_env:
             self.source_dir = '/home/jaegerp/code/mamma_code/medicaldetectiontoolkit'
 
-
         #########################
         #      Data Loader      #
         #########################
 
         #random seed for fold_generator and batch_generator.
         self.seed = 0
 
         #number of threads for multithreaded batch generation.
         self.n_workers = os.cpu_count() - 1
 
         # if True, segmentation losses learn all categories, else only foreground vs. background.
         self.class_specific_seg_flag = False
 
         #########################
         #      Architecture      #
         #########################
 
         self.weight_decay = 0.0
         # what weight or layer types to exclude from weight decay. options: ["bias", "norm"].
         self.exclude_from_wd = ("norm",)
 
         # nonlinearity to be applied after convs with nonlinearity. one of 'relu' or 'leaky_relu'
         self.relu = 'relu'
 
         # if True initializes weights as specified in model script. else use default Pytorch init.
         self.custom_init = False
 
         # if True adds high-res decoder levels to feature pyramid: P1 + P0. (e.g. set to true in retina_unet configs)
         self.operate_stride1 = False
 
         #########################
         #  Schedule             #
         #########################
 
         # number of folds in cross validation.
         self.n_cv_splits = 5
 
 
         # number of probabilistic samples in validation.
         self.n_probabilistic_samples = None
 
         #########################
         #   Testing / Plotting  #
         #########################
 
         # perform mirroring at test time. (only XY. Z not done to not blow up predictions times).
         self.test_aug = True
 
         # if True, test data lies in a separate folder and is not part of the cross validation.
         self.hold_out_test_set = False
 
         # if hold_out_test_set provided, ensemble predictions over models of all trained cv-folds.
         # implications for hold-out test sets: if True, evaluate folds separately on the test set, aggregate only the
         # evaluations. if False, aggregate the raw predictions across all folds, then evaluate.
         self.ensemble_folds = False
 
         # color specifications for all box_types in prediction_plot.
         self.box_color_palette = {'det': 'b', 'gt': 'r', 'neg_class': 'purple',
                                   'prop': 'w', 'pos_class': 'g', 'pos_anchor': 'c', 'neg_anchor': 'c'}
 
         # scan over confidence score in evaluation to optimize it on the validation set.
         self.scan_det_thresh = False
 
         # plots roc-curves / prc-curves in evaluation.
         self.plot_stat_curves = False
 
         # evaluates average precision per image and averages over images. instead computing one ap over data set.
         self.per_patient_ap = False
 
         # threshold for clustering 2D box predictions to 3D Cubes. Overlap is computed in XY.
         self.merge_3D_iou = 0.1
 
         # monitor any value from training.
         self.n_monitoring_figures = 1
         # dict to assign specific plot_values to monitor_figures > 0. {1: ['class_loss'], 2: ['kl_loss', 'kl_sigmas']}
         self.assign_values_to_extra_figure = {}
 
         # save predictions to csv file in experiment dir.
         self.save_preds_to_csv = True
 
         # select a maximum number of patient cases to test. number or "all" for all
         self.max_test_patients = "all"
 
         #########################
         #   MRCNN               #
         #########################
 
         # if True, mask loss is not applied. used for data sets, where no pixel-wise annotations are provided.
         self.frcnn_mode = False
 
         # if True, unmolds masks in Mask R-CNN to full-res for plotting/monitoring.
         self.return_masks_in_val = False
         self.return_masks_in_test = False # needed if doing instance segmentation. evaluation not yet implemented.
 
         # add P6 to Feature Pyramid Network.
         self.sixth_pooling = False
 
         # for probabilistic detection
         self.n_latent_dims = 0
 
 
diff --git a/exec.py b/exec.py
index 700063e..7ce003a 100644
--- a/exec.py
+++ b/exec.py
@@ -1,290 +1,294 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 """execution script."""
 
 import argparse
 import os, warnings
 import time
 
 import torch
 
 import utils.exp_utils as utils
 from evaluator import Evaluator
 from predictor import Predictor
 from plotting import plot_batch_prediction
 
 for msg in ["Attempting to set identical bottom==top results",
             "This figure includes Axes that are not compatible with tight_layout",
             "Data has no positive values, and therefore cannot be log-scaled.",
             ".*invalid value encountered in double_scalars.*",
             ".*Mean of empty slice.*"]:
     warnings.filterwarnings("ignore", msg)
 
 
 def train(logger):
     """
     perform the training routine for a given fold. saves plots and selected parameters to the experiment dir
     specified in the configs.
     """
     logger.info('performing training in {}D over fold {} on experiment {} with model {}'.format(
         cf.dim, cf.fold, cf.exp_dir, cf.model))
 
     net = model.net(cf, logger).cuda()
     optimizer = torch.optim.AdamW(utils.parse_params_for_optim(net, weight_decay=cf.weight_decay,
                                                                exclude_from_wd=cf.exclude_from_wd),
                                   lr=cf.learning_rate[0])
     if cf.dynamic_lr_scheduling:
         scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode=cf.scheduling_mode, factor=cf.lr_decay_factor,
                                                                patience=cf.scheduling_patience)
 
     model_selector = utils.ModelSelector(cf, logger)
     train_evaluator = Evaluator(cf, logger, mode='train')
     val_evaluator = Evaluator(cf, logger, mode=cf.val_mode)
 
     starting_epoch = 1
 
     # prepare monitoring
     monitor_metrics = utils.prepare_monitoring(cf)
 
     if cf.resume:
         checkpoint_path = os.path.join(cf.fold_dir, "last_checkpoint")
         starting_epoch, net, optimizer, monitor_metrics = \
             utils.load_checkpoint(checkpoint_path, net, optimizer)
         logger.info('resumed from checkpoint {} to epoch {}'.format(checkpoint_path, starting_epoch))
 
 
     logger.info('loading dataset and initializing batch generators...')
     batch_gen = data_loader.get_train_generators(cf, logger)
 
     for epoch in range(starting_epoch, cf.num_epochs + 1):
 
         logger.info('starting training epoch {}'.format(epoch))
         start_time = time.time()
 
         net.train()
         train_results_list = []
         for bix in range(cf.num_train_batches):
             batch = next(batch_gen['train'])
             tic_fw = time.time()
             results_dict = net.train_forward(batch)
             tic_bw = time.time()
             optimizer.zero_grad()
             results_dict['torch_loss'].backward()
             optimizer.step()
             print('\rtr. batch {0}/{1} (ep. {2}) fw {3:.2f}s / bw {4:.2f} s / total {5:.2f} s || '.format(
                 bix + 1, cf.num_train_batches, epoch, tic_bw - tic_fw, time.time() - tic_bw,
                 time.time() - tic_fw) + results_dict['logger_string'], flush=True, end="")
             train_results_list.append(({k:v for k,v in results_dict.items() if k != "seg_preds"}, batch["pid"]))
         print()
 
         _, monitor_metrics['train'] = train_evaluator.evaluate_predictions(train_results_list, monitor_metrics['train'])
 
         logger.info('generating training example plot.')
-        plot_batch_prediction(batch, results_dict, cf, outfile=os.path.join(
-            cf.plot_dir, 'pred_example_{}_train.png'.format(cf.fold)))
+        utils.split_off_process(plot_batch_prediction, batch, results_dict, cf, outfile=os.path.join(
+            cf.plot_dir, 'pred_example_{}_train.png'.format(cf.fold)))
 
         train_time = time.time() - start_time
 
         logger.info('starting validation in mode {}.'.format(cf.val_mode))
         with torch.no_grad():
             net.eval()
             if cf.do_validation:
                 val_results_list = []
                 val_predictor = Predictor(cf, net, logger, mode='val')
                 for _ in range(batch_gen['n_val']):
                     batch = next(batch_gen[cf.val_mode])
                     if cf.val_mode == 'val_patient':
                         results_dict = val_predictor.predict_patient(batch)
                     elif cf.val_mode == 'val_sampling':
                         results_dict = net.train_forward(batch, is_validation=True)
                     #val_results_list.append([results_dict['boxes'], batch['pid']])
                     val_results_list.append(({k:v for k,v in results_dict.items() if k != "seg_preds"}, batch["pid"]))
 
                 _, monitor_metrics['val'] = val_evaluator.evaluate_predictions(val_results_list, monitor_metrics['val'])
                 model_selector.run_model_selection(net, optimizer, monitor_metrics, epoch)
 
             # update monitoring and prediction plots
             monitor_metrics.update({"lr":
                                         {str(g): group['lr'] for (g, group) in enumerate(optimizer.param_groups)}})
             logger.metrics2tboard(monitor_metrics, global_step=epoch)
 
             epoch_time = time.time() - start_time
             logger.info('trained epoch {}: took {:.2f} s ({:.2f} s train / {:.2f} s val)'.format(
                 epoch, epoch_time, train_time, epoch_time-train_time))
             batch = next(batch_gen['val_sampling'])
             results_dict = net.train_forward(batch, is_validation=True)
             logger.info('generating validation-sampling example plot.')
-            plot_batch_prediction(batch, results_dict, cf, outfile=os.path.join(
+            utils.split_off_process(plot_batch_prediction, batch, results_dict, cf, outfile=os.path.join(
                 cf.plot_dir, 'pred_example_{}_val.png'.format(cf.fold)))
 
         # -------------- scheduling -----------------
         if cf.dynamic_lr_scheduling:
             scheduler.step(monitor_metrics["val"][cf.scheduling_criterion][-1])
         else:
             for param_group in optimizer.param_groups:
                 param_group['lr'] = cf.learning_rate[epoch-1]
 
 def test(logger):
     """
     perform testing for a given fold (or hold out set). save stats in evaluator.
     """
     logger.info('starting testing model of fold {} in exp {}'.format(cf.fold, cf.exp_dir))
     net = model.net(cf, logger).cuda()
     test_predictor = Predictor(cf, net, logger, mode='test')
     test_evaluator = Evaluator(cf, logger, mode='test')
     batch_gen = data_loader.get_test_generator(cf, logger)
     test_results_list = test_predictor.predict_test_set(batch_gen, return_results=True)
     test_evaluator.evaluate_predictions(test_results_list)
     test_evaluator.score_test_df()
 
 
 if __name__ == '__main__':
     stime = time.time()
 
     parser = argparse.ArgumentParser()
     parser.add_argument('-m', '--mode', type=str,  default='train_test',
                         help='one out of: train / test / train_test / analysis / create_exp')
     parser.add_argument('-f','--folds', nargs='+', type=int, default=None,
                         help='None runs over all folds in CV. otherwise specify list of folds.')
     parser.add_argument('--exp_dir', type=str, default='/path/to/experiment/directory',
                         help='path to experiment dir. will be created if non existent.')
     parser.add_argument('--server_env', default=False, action='store_true',
                         help='change IO settings to deploy models on a cluster.')
     parser.add_argument('--data_dest', type=str, default=None, help="path to final data folder if different from config.")
     parser.add_argument('--use_stored_settings', default=False, action='store_true',
                         help='load configs from existing exp_dir instead of source dir. always done for testing, '
                              'but can be set to true to do the same for training. useful in job scheduler environment, '
                              'where source code might change before the job actually runs.')
     parser.add_argument('--resume', action="store_true", default=False,
                         help='if given, resume from checkpoint(s) of the specified folds.')
     parser.add_argument('--exp_source', type=str, default='experiments/toy_exp',
                         help='specifies, from which source experiment to load configs and data_loader.')
     parser.add_argument('--no_benchmark', action='store_true', help="Do not use cudnn.benchmark.")
+    parser.add_argument('--cuda_device', type=int, default=0, help="Index of CUDA device to use.")
     parser.add_argument('-d', '--dev', default=False, action='store_true', help="development mode: shorten everything")
 
     args = parser.parse_args()
     folds = args.folds
 
     torch.backends.cudnn.benchmark = not args.no_benchmark
 
     if args.mode == 'train' or args.mode == 'train_test':
 
         cf = utils.prep_exp(args.exp_source, args.exp_dir, args.server_env, args.use_stored_settings)
         if args.dev:
             folds = [0,1]
             cf.batch_size, cf.num_epochs, cf.min_save_thresh, cf.save_n_models = 3 if cf.dim==2 else 1, 1, 0, 2
             cf.num_train_batches, cf.num_val_batches, cf.max_val_patients = 5, 1, 1
             cf.test_n_epochs =  cf.save_n_models
             cf.max_test_patients = 2
 
         cf.data_dest = args.data_dest
         logger = utils.get_logger(cf.exp_dir, cf.server_env)
         logger.info("cudnn benchmark: {}, deterministic: {}.".format(torch.backends.cudnn.benchmark,
                                                                      torch.backends.cudnn.deterministic))
+        logger.info("sending tensors to CUDA device: {}.".format(torch.cuda.get_device_name(args.cuda_device)))
         data_loader = utils.import_module('dl', os.path.join(args.exp_source, 'data_loader.py'))
         model = utils.import_module('model', cf.model_path)
         logger.info("loaded model from {}".format(cf.model_path))
         if folds is None:
             folds = range(cf.n_cv_splits)
 
-        for fold in folds:
-            cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(fold))
-            cf.fold = fold
-            cf.resume = args.resume
-            if not os.path.exists(cf.fold_dir):
-                os.mkdir(cf.fold_dir)
-            logger.set_logfile(fold=fold)
-            train(logger)
-            cf.resume = False
-            if args.mode == 'train_test':
-                test(logger)
+        with torch.cuda.device(args.cuda_device):  # selected GPU is the current device for all folds.
+            for fold in folds:
+                cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(fold))
+                cf.fold = fold
+                cf.resume = args.resume  # re-armed for every fold's training run.
+                # exist_ok: no check-then-create race; an existing fold dir (re-run / resume) is reused.
+                os.makedirs(cf.fold_dir, exist_ok=True)
+                logger.set_logfile(fold=fold)
+                train(logger)
+                cf.resume = False  # cleared once training of this fold is done.
+                if args.mode == 'train_test':
+                    test(logger)
 
     elif args.mode == 'test':
 
         cf = utils.prep_exp(args.exp_source, args.exp_dir, args.server_env, is_training=False, use_stored_settings=True)
         if args.dev:
             folds = [0,1]
             cf.test_n_epochs = 2; cf.max_test_patients = 2
 
         cf.data_dest = args.data_dest
         logger = utils.get_logger(cf.exp_dir, cf.server_env)
         data_loader = utils.import_module('dl', os.path.join(args.exp_source, 'data_loader.py'))
         model = utils.import_module('model', cf.model_path)
         logger.info("loaded model from {}".format(cf.model_path))
         if folds is None:
             folds = range(cf.n_cv_splits)
 
-        for fold in folds:
-            cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(fold))
-            cf.fold = fold
-            logger.set_logfile(fold=fold)
-            test(logger)
+        with torch.cuda.device(args.cuda_device):
+            for fold in folds:
+                cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(fold))
+                cf.fold = fold
+                logger.set_logfile(fold=fold)
+                test(logger)
 
 
     # load raw predictions saved by predictor during testing, run aggregation algorithms and evaluation.
     elif args.mode == 'analysis':
         cf = utils.prep_exp(args.exp_source, args.exp_dir, args.server_env, is_training=False, use_stored_settings=True)
         logger = utils.get_logger(cf.exp_dir, cf.server_env)
 
         if args.dev:
             cf.test_n_epochs = 2
 
         if cf.hold_out_test_set and cf.ensemble_folds:
             # create and save (unevaluated) predictions across all folds
             predictor = Predictor(cf, net=None, logger=logger, mode='analysis')
             results_list = predictor.load_saved_predictions(apply_wbc=True)
             utils.create_csv_output([(res_dict["boxes"], pid) for res_dict, pid in results_list], cf, logger)
             logger.info('starting evaluation...')
             cf.fold = 'overall_hold_out'
             evaluator = Evaluator(cf, logger, mode='test')
             evaluator.evaluate_predictions(results_list)
             evaluator.score_test_df()
 
         else:
             fold_dirs = sorted([os.path.join(cf.exp_dir, f) for f in os.listdir(cf.exp_dir) if
                          os.path.isdir(os.path.join(cf.exp_dir, f)) and f.startswith("fold")])
             if folds is None:
                 folds = range(cf.n_cv_splits)
             for fold in folds:
                 cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(fold))
                 cf.fold = fold
                 logger.set_logfile(fold=fold)
                 if cf.fold_dir in fold_dirs:
                     predictor = Predictor(cf, net=None, logger=logger, mode='analysis')
                     results_list = predictor.load_saved_predictions(apply_wbc=True)
                     logger.info('starting evaluation...')
                     evaluator = Evaluator(cf, logger, mode='test')
                     evaluator.evaluate_predictions(results_list)
                     evaluator.score_test_df()
                 else:
                     logger.info("Skipping fold {} since no model parameters found.".format(fold))
 
     # create experiment folder and copy scripts without starting job.
     # useful for cloud deployment where configs might change before job actually runs.
     elif args.mode == 'create_exp':
         cf = utils.prep_exp(args.exp_source, args.exp_dir, args.server_env, use_stored_settings=False)
         logger = utils.get_logger(cf.exp_dir)
         logger.info('created experiment directory at {}'.format(cf.exp_dir))
 
     else:
         raise RuntimeError('mode specified in args is not implemented...')
 
     mins, secs = divmod((time.time() - stime), 60)
     h, mins = divmod(mins, 60)
     t = "{:d}h:{:02d}m:{:02d}s".format(int(h), int(mins), int(secs))
     logger.info("{} total runtime: {}".format(os.path.split(__file__)[1], t))
     del logger
\ No newline at end of file
diff --git a/experiments/toy_exp/generate_toys.py b/experiments/toy_exp/generate_toys.py
index fa278c9..53d3a85 100644
--- a/experiments/toy_exp/generate_toys.py
+++ b/experiments/toy_exp/generate_toys.py
@@ -1,138 +1,138 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 import os, time
 import numpy as np
 import pandas as pd
 import pickle
 import argparse
 from multiprocessing import Pool
 
 DO_MP = True
 
 
 def create_image(out_dir, six, foreground_margin, class_diameters, mode, noisy_bg):
 
     print('\rprocessing {} {}'.format(out_dir, six), end="", flush=True)
 
     img = np.random.rand(320, 320) if noisy_bg else np.zeros((320, 320))
     seg = np.zeros((320, 320)).astype('uint8')
     center_x = np.random.randint(foreground_margin, img.shape[0] - foreground_margin)
     center_y = np.random.randint(foreground_margin, img.shape[1] - foreground_margin)
     class_id = np.random.randint(0, 2)
 
     for y in range(img.shape[0]):
         for x in range(img.shape[0]):
             if ((x - center_x) ** 2 + (y - center_y) ** 2 - class_diameters[class_id] ** 2) < 0:
                 img[y][x] += 0.2
                 seg[y][x] = 1
 
     if 'donuts' in mode:
         hole_diameter = 4
         if class_id == 1:
             for y in range(img.shape[0]):
                 for x in range(img.shape[0]):
                     if ((x - center_x) ** 2 + (y - center_y) ** 2 - hole_diameter ** 2) < 0:
                         img[y][x] -= 0.2
                         if mode == 'donuts_shape':
                             seg[y][x] = 0
 
     out = np.concatenate((img[None], seg[None]))
     out_path = os.path.join(out_dir, '{}.npy'.format(six))
     np.save(out_path, out)
 
     with open(os.path.join(out_dir, 'meta_info_{}.pickle'.format(six)), 'wb') as handle:
         pickle.dump([out_path, class_id, str(six)], handle)
 
 
 def generate_dataset(cf, exp_name, n_train_images, n_test_images, mode, class_diameters=(20, 20), noisy_bg=False):
 
     train_dir = os.path.join(cf.root_dir, exp_name, 'train')
     test_dir = os.path.join(cf.root_dir, exp_name, 'test')
     if os.path.isdir(train_dir) or os.path.isdir(test_dir):
         raise Exception("A dataset directory already exists at {}. ".format(cf.root_dir)+
                         "Please make sure to generate data in an empty or new directory.")
     os.makedirs(train_dir, exist_ok=False)
     os.makedirs(test_dir, exist_ok=False)
 
     # enforced distance between object center and image edge.
     foreground_margin = int(np.ceil(np.max(class_diameters) / 1.25))
 
     info = []
     info += [[train_dir, six, foreground_margin, class_diameters, mode, noisy_bg] for six in range(n_train_images)]
     info += [[test_dir, six, foreground_margin, class_diameters, mode, noisy_bg] for six in range(n_test_images)]
 
     print('starting creation of {} images'.format(len(info)))
     if DO_MP:
         pool = Pool(processes=os.cpu_count()-1)
         pool.starmap(create_image, info, chunksize=1)
         pool.close()
         pool.join()
     else:
         for inputs in info:
             create_image(*inputs)
     print()
     aggregate_meta_info(train_dir)
     aggregate_meta_info(test_dir)
 
 
 def aggregate_meta_info(exp_dir):
 
     files = [os.path.join(exp_dir, f) for f in os.listdir(exp_dir) if 'meta_info' in f]
     df = pd.DataFrame(columns=['path', 'class_id', 'pid'])
     for f in files:
         with open(f, 'rb') as handle:
             df.loc[len(df)] = pickle.load(handle)
 
     df.to_pickle(os.path.join(exp_dir, 'info_df.pickle'))
     print("aggregated meta info to df with length", len(df))
 
 
 if __name__ == '__main__':
     stime = time.time()
     import sys
     sys.path.append("../..")
     import utils.exp_utils as utils
 
     parser = argparse.ArgumentParser()
     mode_choices = ['donuts_shape', 'donuts_pattern', 'circles_scale']
     parser.add_argument('-m', '--modes', nargs='+', type=str, default=mode_choices, choices=mode_choices)
     parser.add_argument('--noise', action='store_true', help="if given, add noise to the sample bg.")
-    parser.add_argument('--n_train', type=int, default=1500, help="Nr. of train images to generate.")
+    parser.add_argument('--n_train', type=int, default=2500, help="Nr. of train images to generate.")
     parser.add_argument('--n_test', type=int, default=1000, help="Nr. of test images to generate.")
     args = parser.parse_args()
 
 
     cf_file = utils.import_module("cf", "configs.py")
     cf = cf_file.configs()
 
     class_diameters = {
         'donuts_shape': (20, 20),
         'donuts_pattern': (20, 20),
         'circles_scale': (19, 20)
     }
 
     for mode in args.modes:
         generate_dataset(cf, mode + ("_noise" if args.noise else ""), n_train_images=args.n_train, n_test_images=args.n_test, mode=mode,
                             class_diameters=class_diameters[mode], noisy_bg=args.noise)
 
 
     mins, secs = divmod((time.time() - stime), 60)
     h, mins = divmod(mins, 60)
     t = "{:d}h:{:02d}m:{:02d}s".format(int(h), int(mins), int(secs))
     print("{} total runtime: {}".format(os.path.split(__file__)[1], t))
 
 
diff --git a/plotting.py b/plotting.py
index 023e739..a5b3565 100644
--- a/plotting.py
+++ b/plotting.py
@@ -1,275 +1,295 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
 import matplotlib.gridspec as gridspec
 import numpy as np
 import os
 from copy import deepcopy
 
 
+def suppress_axes_lines(ax):
+    """
+    Remove the x/y tick marks and hide all four spines (border lines) of an axes.
+
+    :param ax: pyplot axes object, modified in place.
+    """
+    # clear the ticks on both axes.
+    for axis in (ax.axes.get_xaxis(), ax.axes.get_yaxis()):
+        axis.set_ticks([])
+    # hide the frame drawn around the plot area.
+    for side in ('top', 'right', 'bottom', 'left'):
+        ax.spines[side].set_visible(False)
+
 def plot_batch_prediction(batch, results_dict, cf, outfile=None, suptitle=None):
     """
     plot the input images, ground truth annotations, and output predictions of a batch. If 3D batch, plots a 2D projection
     of one randomly sampled element (patient) in the batch. Since plotting all slices of patient volume blows up costs of
     time and space, only a section containing a randomly sampled ground truth annotation is plotted.
     :param batch: dict with keys: 'data' (input image), 'seg' (pixelwise annotations), 'pid'
     :param results_dict: list over batch element. Each element is a list of boxes (prediction and ground truth),
     where every box is a dictionary containing box_coords, box_score and box_type.
     """
     if outfile is None:
         outfile = os.path.join(cf.plot_dir, 'pred_example_{}.png'.format(cf.fold))
 
     data = batch['data']
     segs = batch['seg']
     pids = batch['pid']
     # for 3D, repeat pid over batch elements.
     if len(set(pids)) == 1:
         pids = [pids] * data.shape[0]
 
     seg_preds = results_dict['seg_preds']
     roi_results = deepcopy(results_dict['boxes'])
 
     # Randomly sampled one patient of batch and project data into 2D slices for plotting.
     if cf.dim == 3:
         patient_ix = np.random.choice(data.shape[0])
         data = np.transpose(data[patient_ix], axes=(3, 0, 1, 2))
 
         # select interesting foreground section to plot.
         gt_boxes = [box['box_coords'] for box in roi_results[patient_ix] if box['box_type'] == 'gt']
         if len(gt_boxes) > 0:
             z_cuts = [np.max((int(gt_boxes[0][4]) - 5, 0)), np.min((int(gt_boxes[0][5]) + 5, data.shape[0]))]
         else:
             z_cuts = [data.shape[0]//2 - 5, int(data.shape[0]//2 + np.min([10, data.shape[0]//2]))]
         p_roi_results = roi_results[patient_ix]
         roi_results = [[] for _ in range(data.shape[0])]
 
         # iterate over cubes and spread across slices.
         for box in p_roi_results:
             b = box['box_coords']
             # dismiss negative anchor slices.
             slices = np.round(np.unique(np.clip(np.arange(b[4], b[5] + 1), 0, data.shape[0]-1)))
             for s in slices:
                 roi_results[int(s)].append(box)
                 roi_results[int(s)][-1]['box_coords'] = b[:4]
 
         roi_results = roi_results[z_cuts[0]: z_cuts[1]]
         data = data[z_cuts[0]: z_cuts[1]]
         segs = np.transpose(segs[patient_ix], axes=(3, 0, 1, 2))[z_cuts[0]: z_cuts[1]]
         seg_preds = np.transpose(seg_preds[patient_ix], axes=(3, 0, 1, 2))[z_cuts[0]: z_cuts[1]]
         pids = [pids[patient_ix]] * data.shape[0]
 
     try:
         # all dimensions except for the 'channel-dimension' are required to match
         for i in [0, 2, 3]:
             assert data.shape[i] == segs.shape[i] == seg_preds.shape[i]
     except:
         raise Warning('Shapes of arrays to plot not in agreement!'
                       'Shapes {} vs. {} vs {}'.format(data.shape, segs.shape, seg_preds.shape))
 
 
     show_arrays = np.concatenate([data, segs, seg_preds, data[:, 0][:, None]], axis=1).astype(float)
     approx_figshape = (4 * show_arrays.shape[0], 4 * show_arrays.shape[1])
     fig = plt.figure(figsize=approx_figshape)
     gs = gridspec.GridSpec(show_arrays.shape[1] + 1, show_arrays.shape[0])
     gs.update(wspace=0.1, hspace=0.1)
     for b in range(show_arrays.shape[0]):
         for m in range(show_arrays.shape[1]):
 
             ax = plt.subplot(gs[m, b])
-            ax.axis('off')
+            suppress_axes_lines(ax)
             if m < show_arrays.shape[1]:
                 arr = show_arrays[b, m]
 
             if m < data.shape[1] or m == show_arrays.shape[1] - 1:
+                if b == 0:
+                    ax.set_ylabel("Input" + (" + GT & Pred Box" if m == show_arrays.shape[1] - 1 else ""))
                 cmap = 'gray'
                 vmin = None
                 vmax = None
             else:
                 cmap = None
                 vmin = 0
                 vmax = cf.num_seg_classes - 1
 
             if m == 0:
                 plt.title('{}'.format(pids[b][:10]), fontsize=20)
 
             plt.imshow(arr, cmap=cmap, vmin=vmin, vmax=vmax)
             if m >= (data.shape[1]):
+                if b == 0:
+                    if m == data.shape[1]:
+                        ax.set_ylabel("GT Box & Seg")
+                    if m == data.shape[1]+1:
+                        ax.set_ylabel("GT Box + Pred Seg & Box")
                 for box in roi_results[b]:
                     if box['box_type'] != 'patient_tn_box': # don't plot true negative dummy boxes.
                         coords = box['box_coords']
                         if box['box_type'] == 'det':
                             # dont plot background preds or low confidence boxes.
                             if box['box_pred_class_id'] > 0 and box['box_score'] > 0.1:
                                 plot_text = True
                                 score = np.max(box['box_score'])
                                 score_text = '{}|{:.0f}'.format(box['box_pred_class_id'], score*100)
                                 # if prob detection: plot only boxes from correct sampling instance.
                                 if 'sample_id' in box.keys() and int(box['sample_id']) != m - data.shape[1] - 2:
                                         continue
                                 # if prob detection: plot reconstructed boxes only in corresponding line.
                                 if not 'sample_id' in box.keys() and  m != data.shape[1] + 1:
                                     continue
 
                                 score_font_size = 7
                                 text_color = 'w'
                                 text_x = coords[1] + 10*(box['box_pred_class_id'] -1) #avoid overlap of scores in plot.
                                 text_y = coords[2] + 5
                             else:
                                 continue
                         elif box['box_type'] == 'gt':
                             plot_text = True
                             score_text = int(box['box_label'])
                             score_font_size = 7
                             text_color = 'r'
                             text_x = coords[1]
                             text_y = coords[0] - 1
                         else:
                             plot_text = False
 
                         color_var = 'extra_usage' if 'extra_usage' in list(box.keys()) else 'box_type'
                         color = cf.box_color_palette[box[color_var]]
                         plt.plot([coords[1], coords[3]], [coords[0], coords[0]], color=color, linewidth=1, alpha=1) # up
                         plt.plot([coords[1], coords[3]], [coords[2], coords[2]], color=color, linewidth=1, alpha=1) # down
                         plt.plot([coords[1], coords[1]], [coords[0], coords[2]], color=color, linewidth=1, alpha=1) # left
                         plt.plot([coords[3], coords[3]], [coords[0], coords[2]], color=color, linewidth=1, alpha=1) # right
                         if plot_text:
                             plt.text(text_x, text_y, score_text, fontsize=score_font_size, color=text_color)
 
     if suptitle is not None:
         plt.suptitle(suptitle, fontsize=22)
 
     try:
         plt.savefig(outfile)
     except:
         raise Warning('failed to save plot.')
     plt.close(fig)
 
 
 
 class TrainingPlot_2Panel():
     # todo remove since replaced by tensorboard?
 
     def __init__(self, cf):
 
         self.file_name = cf.plot_dir + '/monitor_{}'.format(cf.fold)
         self.exp_name = cf.fold_dir
         self.do_validation = cf.do_validation
         self.separate_values_dict = cf.assign_values_to_extra_figure
         self.figure_list = []
         for n in range(cf.n_monitoring_figures):
             self.figure_list.append(plt.figure(figsize=(10, 6)))
             self.figure_list[-1].ax1 = plt.subplot(111)
             self.figure_list[-1].ax1.set_xlabel('epochs')
             self.figure_list[-1].ax1.set_ylabel('loss / metrics')
             self.figure_list[-1].ax1.set_xlim(0, cf.num_epochs)
             self.figure_list[-1].ax1.grid()
 
         self.figure_list[0].ax1.set_ylim(0, 1.5)
         self.color_palette = ['b', 'c', 'r', 'purple', 'm', 'y', 'k', 'tab:gray']
 
     def update_and_save(self, metrics, epoch):
 
         for figure_ix in range(len(self.figure_list)):
             fig = self.figure_list[figure_ix]
             detection_monitoring_plot(fig.ax1, metrics, self.exp_name, self.color_palette, epoch, figure_ix,
                                       self.separate_values_dict,
                                       self.do_validation)
             fig.savefig(self.file_name + '_{}'.format(figure_ix))
 
 
 def detection_monitoring_plot(ax1, metrics, exp_name, color_palette, epoch, figure_ix, separate_values_dict, do_validation):
     # todo remove since replaced by tensorboard?
     monitor_values_keys = metrics['train']['monitor_values'][1][0].keys()
     separate_values = [v for fig_ix in separate_values_dict.values() for v in fig_ix]
     if figure_ix == 0:
         plot_keys = [ii for ii in monitor_values_keys if ii not in separate_values]
         plot_keys += [k for k in metrics['train'].keys() if k != 'monitor_values']
     else:
         plot_keys = separate_values_dict[figure_ix]
 
 
     x = np.arange(1, epoch + 1)
 
     for kix, pk in enumerate(plot_keys):
         if pk in metrics['train'].keys():
             y_train = metrics['train'][pk][1:]
             if do_validation:
                 y_val = metrics['val'][pk][1:]
         else:
             y_train = [np.mean([er[pk] for er in metrics['train']['monitor_values'][e]]) for e in x]
             if do_validation:
                 y_val = [np.mean([er[pk] for er in metrics['val']['monitor_values'][e]]) for e in x]
 
         ax1.plot(x, y_train, label='train_{}'.format(pk), linestyle='--', color=color_palette[kix])
         if do_validation:
             ax1.plot(x, y_val, label='val_{}'.format(pk), linestyle='-', color=color_palette[kix])
 
     if epoch == 1:
         box = ax1.get_position()
         ax1.set_position([box.x0, box.y0, box.width * 0.8, box.height])
         ax1.legend(loc='center left', bbox_to_anchor=(1, 0.5))
         ax1.set_title(exp_name)
 
 
 def plot_prediction_hist(label_list, pred_list, type_list, outfile):
     """
     plot histogram of predictions for a specific class.
     :param label_list: list of 1s and 0s specifying whether prediction is a true positive match (1) or a false positive (0).
     False negatives (missed ground truth objects) are artificially added predictions with score 0 and label 1.
     :param pred_list: list of prediction-scores.
     :param type_list: list of prediction-types for stastic-info in title.
     """
     preds = np.array(pred_list)
     labels = np.array(label_list)
     title = outfile.split('/')[-1] + ' count:{}'.format(len(label_list))
     plt.figure()
     plt.yscale('log')
     if 0 in labels:
         plt.hist(preds[labels == 0], alpha=0.3, color='g', range=(0, 1), bins=50, label='false pos.')
     if 1 in labels:
         plt.hist(preds[labels == 1], alpha=0.3, color='b', range=(0, 1), bins=50, label='true pos. (false neg. @ score=0)')
 
     if type_list is not None:
         fp_count = type_list.count('det_fp')
         fn_count = type_list.count('det_fn')
         tp_count = type_list.count('det_tp')
         pos_count = fn_count + tp_count
         title += ' tp:{} fp:{} fn:{} pos:{}'. format(tp_count, fp_count, fn_count, pos_count)
 
     plt.legend()
     plt.title(title)
     plt.xlabel('confidence score')
     plt.ylabel('log n')
     plt.savefig(outfile)
     plt.close()
 
 
 def plot_stat_curves(stats, outfile):
 
     for c in ['roc', 'prc']:
         plt.figure()
         for s in stats:
             if s[c] is not None:
                 plt.plot(s[c][0], s[c][1], label=s['name'] + '_' + c)
         plt.title(outfile.split('/')[-1] + '_' + c)
         plt.legend(loc=3 if c == 'prc' else 4)
         plt.xlabel('precision' if c == 'prc' else '1-spec.')
         plt.ylabel('recall')
         plt.savefig(outfile + '_' + c)
         plt.close()
diff --git a/predictor.py b/predictor.py
index 7908353..a68b405 100644
--- a/predictor.py
+++ b/predictor.py
@@ -1,869 +1,876 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 import os
 import numpy as np
 import torch
 from scipy.stats import norm
 from collections import OrderedDict
 from multiprocessing import Pool
 import pickle
 import pandas as pd
 
+import utils.exp_utils as utils
 from plotting import plot_batch_prediction
 
 
 class Predictor:
     """
     Prediction pipeline:
     - receives a patched patient image (n_patches, c, y, x, (z)) from patient data loader.
     - forwards patches through model in chunks of batch_size. (method: batch_tiling_forward)
     - unmolds predictions (boxes and segmentations) to original patient coordinates. (method: spatial_tiling_forward)
 
     Ensembling (mode == 'test'):
     - for inference, forwards 4 mirrored versions of image to through model and unmolds predictions afterwards
       accordingly (method: data_aug_forward)
     - for inference, loads multiple parameter-sets of the trained model corresponding to different epochs. for each
       parameter-set loops over entire test set, runs prediction pipeline for each patient. (method: predict_test_set)
 
     Consolidation of predictions:
     - consolidates a patient's predictions (boxes, segmentations) collected over patches, data_aug- and temporal ensembling,
       performs clustering and weighted averaging (external function: apply_wbc_to_patient) to obtain consistent outptus.
     - for 2D networks, consolidates box predictions to 3D cubes via clustering (adaption of non-maximum surpression).
       (external function: merge_2D_to_3D_preds_per_patient)
 
     Ground truth handling:
     - dissmisses any ground truth boxes returned by the model (happens in validation mode, patch-based groundtruth)
     - if provided by data loader, adds 3D ground truth to the final predictions to be passed to the evaluator.
     """
     def __init__(self, cf, net, logger, mode):
 
         self.cf = cf
         self.logger = logger
 
         # mode is 'val' for patient-based validation/monitoring and 'test' for inference.
         self.mode = mode
 
         # model instance. In validation mode, contains parameters of current epoch.
         self.net = net
 
         # rank of current epoch loaded (for temporal averaging). this info is added to each prediction,
         # for correct weighting during consolidation.
         self.rank_ix = '0'
 
         # number of ensembled models. used to calculate the number of expected predictions per position
         # during consolidation of predictions. Default is 1 (no ensembling, e.g. in validation).
         self.n_ens = 1
 
         if self.mode == 'test':
             try:
                 self.epoch_ranking = np.load(os.path.join(self.cf.fold_dir, 'epoch_ranking.npy'))[:cf.test_n_epochs]
             except:
                 raise RuntimeError('no epoch ranking file in fold directory. '
                                    'seems like you are trying to run testing without prior training...')
             self.n_ens = cf.test_n_epochs
             if self.cf.test_aug:
                 self.n_ens *= 4
 
             self.example_plot_dir = os.path.join(cf.test_dir, "example_plots")
             os.makedirs(self.example_plot_dir, exist_ok=True)
 
 
     def predict_patient(self, batch):
         """
         predicts one patient.
         called either directly via loop over validation set in exec.py (mode=='val')
         or from self.predict_test_set (mode=='test).
         in val mode:  adds 3D ground truth info to predictions and runs consolidation and 2Dto3D merging of predictions.
         in test mode: returns raw predictions (ground truth addition, consolidation, 2D to 3D merging are
                       done in self.predict_test_set, because patient predictions across several epochs might be needed
                       to be collected first, in case of temporal ensembling).
         :return. results_dict: stores the results for one patient. dictionary with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions
                             (if not merged to 3D), and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z))
                  - losses (only in validation mode)
         """
         #self.logger.info('\revaluating patient {} for fold {} '.format(batch['pid'], self.cf.fold))
         print('\revaluating patient {} for fold {} '.format(batch['pid'], self.cf.fold), end="", flush=True)
 
         # True if patient is provided in patches and predictions need to be tiled.
         self.patched_patient = True if 'patch_crop_coords' in list(batch.keys()) else False
 
         # forward batch through prediction pipeline.
         results_dict = self.data_aug_forward(batch)
 
         if self.mode == 'val':
             for b in range(batch['patient_bb_target'].shape[0]):
                 for t in range(len(batch['patient_bb_target'][b])):
                     results_dict['boxes'][b].append({'box_coords': batch['patient_bb_target'][b][t],
                                                      'box_label': batch['patient_roi_labels'][b][t],
                                                      'box_type': 'gt'})
 
             if self.patched_patient:
                 wcs_input = [results_dict['boxes'], 'dummy_pid', self.cf.class_dict, self.cf.wcs_iou, self.n_ens]
                 results_dict['boxes'] = apply_wbc_to_patient(wcs_input)[0]
 
             if self.cf.merge_2D_to_3D_preds:
                 merge_dims_inputs = [results_dict['boxes'], 'dummy_pid', self.cf.class_dict, self.cf.merge_3D_iou]
                 results_dict['boxes'] = merge_2D_to_3D_preds_per_patient(merge_dims_inputs)[0]
 
         return results_dict
 
 
     def predict_test_set(self, batch_gen, return_results=True):
         """
         wrapper around test method, which loads multiple (or one) epoch parameters (temporal ensembling), loops through
         the test set and collects predictions per patient. Also flattens the results per patient and epoch
         and adds optional ground truth boxes for evaluation. Saves out the raw result list for later analysis and
         optionally consolidates and returns predictions immediately.
         :return: (optionally) list_of_results_per_patient: list over patient results. each entry is a dict with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions
                             (if not merged to 3D), and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': not implemented yet. todo for evaluation of instance/semantic segmentation.
         """
         dict_of_patient_results = OrderedDict()
 
         # get paths of all parameter sets to be loaded for temporal ensembling. (or just one for no temp. ensembling).
         weight_paths = [os.path.join(self.cf.fold_dir, '{}_best_checkpoint'.format(epoch), 'params.pth') for epoch in
                         self.epoch_ranking]
         n_test_plots = min(batch_gen['n_test'], 1)
 
         for rank_ix, weight_path in enumerate(weight_paths):
 
             self.logger.info(('tmp ensembling over rank_ix:{} epoch:{}'.format(rank_ix, weight_path)))
             self.net.load_state_dict(torch.load(weight_path))
             self.net.eval()
             self.rank_ix = str(rank_ix)  # get string of current rank for unique patch ids.
             plot_batches = np.random.choice(np.arange(batch_gen['n_test']), size=n_test_plots, replace=False)
 
             with torch.no_grad():
                 for i in range(batch_gen['n_test']):
 
                     batch = next(batch_gen['test'])
 
                     # store batch info in patient entry of results dict.
                     if rank_ix == 0:
                         dict_of_patient_results[batch['pid']] = {}
                         dict_of_patient_results[batch['pid']]['results_dicts'] = []
                         dict_of_patient_results[batch['pid']]['patient_bb_target'] = batch['patient_bb_target']
                         dict_of_patient_results[batch['pid']]['patient_roi_labels'] = batch['patient_roi_labels']
 
                     # call prediction pipeline and store results in dict.
                     results_dict = self.predict_patient(batch)
                     dict_of_patient_results[batch['pid']]['results_dicts'].append({"boxes": results_dict['boxes']})
 
                     if i in plot_batches and (not self.patched_patient or 'patient_data' in batch.keys()):
                         try:
                             # view qualitative results of random test case
                             out_file = os.path.join(self.example_plot_dir,
                                                     'batch_example_test_{}_rank_{}.png'.format(self.cf.fold, rank_ix))
-                            plot_batch_prediction(batch, results_dict, self.cf, outfile=out_file)
+                            # plot from a shallow copy: replacing 'seg_preds' below must not alter results_dict.
+                            plot_results = results_dict.copy()
+                            # seg preds of test augs are included separately. for viewing only show aug 0 (merging
+                            # would need multiple changes, incl in every model).
+                            if plot_results["seg_preds"].shape[1] > 1:
+                                plot_results["seg_preds"] = results_dict['seg_preds'][:,[0]]
+                            utils.split_off_process(plot_batch_prediction, batch, plot_results, self.cf, outfile=out_file)
                         except Exception as e:
                             self.logger.info("WARNING: error in plotting example test batch: {}".format(e))
 
 
         self.logger.info('finished predicting test set. starting post-processing of predictions.')
         results_per_patient = []
 
         # loop over patients again to flatten results across epoch predictions.
         # if provided, add ground truth boxes for evaluation.
         for pid, p_dict in dict_of_patient_results.items():
 
             tmp_ens_list = p_dict['results_dicts']
             results_dict = {}
             # collect all boxes/seg_preds of same batch_instance over temporal instances.
             b_size = len(tmp_ens_list[0]["boxes"])
             results_dict['boxes'] = [[item for rank_dict in tmp_ens_list for item in rank_dict["boxes"][batch_instance]]
                                      for batch_instance in range(b_size)]
 
             # TODO return for instance segmentation:
             # results_dict['seg_preds'] = np.mean(results_dict['seg_preds'], 1)[:, None]
             # results_dict['seg_preds'] = np.array([[item for d in tmp_ens_list for item in d['seg_preds'][batch_instance]]
             #                                       for batch_instance in range(len(tmp_ens_list[0]['boxes']))])
 
             # add 3D ground truth boxes for evaluation.
             for b in range(p_dict['patient_bb_target'].shape[0]):
                 for t in range(len(p_dict['patient_bb_target'][b])):
                     results_dict['boxes'][b].append({'box_coords': p_dict['patient_bb_target'][b][t],
                                                      'box_label': p_dict['patient_roi_labels'][b][t],
                                                      'box_type': 'gt'})
             results_per_patient.append([results_dict, pid])
 
         # save out raw predictions.
         out_string = 'raw_pred_boxes_hold_out_list' if self.cf.hold_out_test_set else 'raw_pred_boxes_list'
         with open(os.path.join(self.cf.fold_dir, '{}.pickle'.format(out_string)), 'wb') as handle:
             pickle.dump(results_per_patient, handle)
 
         if return_results:
             final_patient_box_results = [(res_dict["boxes"], pid) for res_dict, pid in results_per_patient]
             # consolidate predictions.
             self.logger.info('applying wcs to test set predictions with iou = {} and n_ens = {}.'.format(
                 self.cf.wcs_iou, self.n_ens))
             pool = Pool(processes=6)
             mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.wcs_iou, self.n_ens] for ii in final_patient_box_results]
             final_patient_box_results = pool.map(apply_wbc_to_patient, mp_inputs, chunksize=1)
             pool.close()
             pool.join()
 
             # merge 2D boxes to 3D cubes. (if model predicts 2D but evaluation is run in 3D)
             if self.cf.merge_2D_to_3D_preds:
                 self.logger.info('applying 2Dto3D merging to test set predictions with iou = {}.'.format(self.cf.merge_3D_iou))
                 pool = Pool(processes=6)
                 mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.merge_3D_iou] for ii in final_patient_box_results]
                 final_patient_box_results = pool.map(merge_2D_to_3D_preds_per_patient, mp_inputs, chunksize=1)
                 pool.close()
                 pool.join()
 
             # final_patient_box_results holds [avg_boxes, pid] if wbc
             for ix in range(len(results_per_patient)):
                 assert results_per_patient[ix][1] == final_patient_box_results[ix][1], "should be same pid"
                 results_per_patient[ix][0]["boxes"] = final_patient_box_results[ix][0]
 
             return results_per_patient
 
 
     def load_saved_predictions(self, apply_wbc=False):
         """
         loads raw predictions saved by self.predict_test_set. consolidates and merges 2D boxes to 3D cubes for evaluation.
         (if model predicts 2D but evaluation is run in 3D)
         :return: (optionally) results_list: list over patient results. each entry is a dict with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions
                             (if not merged to 3D), and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': not implemented yet. todo for evaluation of instance/semantic segmentation.
         """
 
         # load predictions for a single test-set fold.
         results_file = 'raw_pred_boxes_hold_out_list.pickle' if self.cf.hold_out_test_set else 'raw_pred_boxes_list.pickle'
         if not self.cf.hold_out_test_set or not self.cf.ensemble_folds:
             with open(os.path.join(self.cf.fold_dir, results_file), 'rb') as handle:
                 results_list = pickle.load(handle)
             box_results_list = [(res_dict["boxes"], pid) for res_dict, pid in results_list]
             da_factor = 4 if self.cf.test_aug else 1
             n_ens = self.cf.test_n_epochs * da_factor
             self.logger.info('loaded raw test set predictions with n_patients = {} and n_ens = {}'.format(
                 len(results_list), n_ens))
 
         # if hold out test set was perdicted, aggregate predictions of all trained models
         # corresponding to all CV-folds and flatten them.
         else:
             self.logger.info("loading saved predictions of hold-out test set and ensembling over folds.")
             fold_dirs = sorted([os.path.join(self.cf.exp_dir, f) for f in os.listdir(self.cf.exp_dir) if
                                 os.path.isdir(os.path.join(self.cf.exp_dir, f)) and f.startswith("fold")])
 
             results_list = []
             folds_loaded = 0
             for fold in range(self.cf.n_cv_splits):
                 fold_dir = os.path.join(self.cf.exp_dir, 'fold_{}'.format(fold))
                 if fold_dir in fold_dirs:
                     with open(os.path.join(fold_dir, results_file), 'rb') as handle:
                         fold_list = pickle.load(handle)
                         results_list += fold_list
                         folds_loaded += 1
                 else:
                     self.logger.info("Skipping fold {} since no saved predictions found.".format(fold))
             box_results_list = []
             for res_dict, pid in results_list: #without filtering gt out:
                 box_results_list.append((res_dict['boxes'], pid))
 
             da_factor = 4 if self.cf.test_aug else 1
             n_ens = self.cf.test_n_epochs * da_factor * folds_loaded
 
         # consolidate predictions.
         if apply_wbc:
             self.logger.info('applying wcs to test set predictions with iou = {} and n_ens = {}.'.format(
                 self.cf.wcs_iou, n_ens))
             pool = Pool(processes=6)
             mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.wcs_iou, n_ens] for ii in box_results_list]
             box_results_list = pool.map(apply_wbc_to_patient, mp_inputs, chunksize=1)
             pool.close()
             pool.join()
 
         # merge 2D box predictions to 3D cubes (if model predicts 2D but evaluation is run in 3D)
         if self.cf.merge_2D_to_3D_preds:
             self.logger.info(
                 'applying 2Dto3D merging to test set predictions with iou = {}.'.format(self.cf.merge_3D_iou))
             pool = Pool(processes=6)
             mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.merge_3D_iou] for ii in box_results_list]
             box_results_list = pool.map(merge_2D_to_3D_preds_per_patient, mp_inputs, chunksize=1)
             pool.close()
             pool.join()
 
 
         for ix in range(len(results_list)):
             assert np.all(
                 results_list[ix][1] == box_results_list[ix][1]), "pid mismatch between loaded and aggregated results"
             results_list[ix][0]["boxes"] = box_results_list[ix][0]
 
         return results_list  # holds (results_dict, pid)
 
 
     def data_aug_forward(self, batch):
         """
         in val_mode: passes batch through to spatial_tiling method without data_aug.
         in test_mode: if cf.test_aug is set in configs, additionally forwards 3 mirrored versions of the input
         image (flip along array axis 2, along axis 3, along both) through the spatial_tiling method and
         re-transforms the returned box coordinates and segmentation predictions to the original orientation.
         While the mirrored versions are processed, batch['data'] is temporarily replaced; it is set back to a
         copy of the unmirrored data afterwards, also if one of the forward passes raises.
         :param batch: dictionary. keys read here: 'data', 'original_img_shape' and, if self.patched_patient
                       is set, 'patch_crop_coords'.
         :return. results_dict: stores the results for one patient. dictionary with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions,
                             and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z))
                  - losses (only in validation mode)
         """
         patch_crops = batch['patch_crop_coords'] if self.patched_patient else None
         results_list = [self.spatial_tiling_forward(batch, patch_crops)]
         org_img_shape = batch['original_img_shape']
 
         if self.mode == 'test' and self.cf.test_aug:
 
             if self.patched_patient:
                 # apply mirror transformations to patch-crop coordinates, for correct tiling in spatial_tiling method.
                 mirrored_patch_crops = get_mirrored_patch_crops(patch_crops, org_img_shape)
             else:
                 mirrored_patch_crops = [None] * 3
 
             img = np.copy(batch['data'])
 
             # one entry per mirroring: (array axes to flip, augmentation id handed on as n_aug).
             # the order matches the order of mirrored_patch_crops.
             mirrorings = [((2,), '1'), ((3,), '2'), ((2, 3), '3')]
 
             try:
                 for mirror_ix, (flip_axes, n_aug) in enumerate(mirrorings):
                     mirrored_img = img
                     for axis in flip_axes:
                         mirrored_img = np.flip(mirrored_img, axis=axis)
                     batch['data'] = mirrored_img.copy()
                     chunk_dict = self.spatial_tiling_forward(batch, mirrored_patch_crops[mirror_ix], n_aug=n_aug)
 
                     # re-transform coordinates: along a flipped axis, lower and upper bound swap roles.
                     for element_boxes in chunk_dict['boxes']:
                         for box in element_boxes:
                             mirrored_coords = box['box_coords']
                             coords = mirrored_coords.copy()
                             if 2 in flip_axes:
                                 coords[0] = org_img_shape[2] - mirrored_coords[2]
                                 coords[2] = org_img_shape[2] - mirrored_coords[0]
                             if 3 in flip_axes:
                                 coords[1] = org_img_shape[3] - mirrored_coords[3]
                                 coords[3] = org_img_shape[3] - mirrored_coords[1]
                             assert coords[2] >= coords[0], [coords, mirrored_coords.copy()]
                             assert coords[3] >= coords[1], [coords, mirrored_coords.copy()]
                             box['box_coords'] = coords
 
                     # re-transform segmentation predictions.
                     seg_preds = chunk_dict['seg_preds']
                     for axis in flip_axes:
                         seg_preds = np.flip(seg_preds, axis=axis)
                     chunk_dict['seg_preds'] = seg_preds
                     results_list.append(chunk_dict)
             finally:
                 # never leave a mirrored image behind in the caller's batch.
                 batch['data'] = img
 
         # aggregate all boxes/seg_preds per batch element from data_aug predictions.
         results_dict = {}
         results_dict['boxes'] = [[item for d in results_list for item in d['boxes'][batch_instance]]
                                  for batch_instance in range(org_img_shape[0])]
         results_dict['seg_preds'] = np.array([[item for d in results_list for item in d['seg_preds'][batch_instance]]
                                               for batch_instance in range(org_img_shape[0])])
         if self.mode == 'val':
             try:
                 results_dict['torch_loss'] = results_list[0]['torch_loss']
                 results_dict['class_loss'] = results_list[0]['class_loss']
             except KeyError:
                 # losses are not necessarily monitored.
                 pass
         return results_dict
 
 
     def spatial_tiling_forward(self, batch, patch_crops=None, n_aug='0'):
         """
         forwards batch to batch_tiling_forward method and receives and returns a dictionary with results.
         if patch-based prediction, the results received from batch_tiling_forward will be on a per-patch-basis.
         this method uses the provided patch_crops to re-transform all predictions to whole-image coordinates.
         Patch-origin information of all box-predictions will be needed for consolidation, hence it is stored as
         'patch_id', which is a unique string for each patch (built from self.rank_ix, n_aug and the patch index).
         all box predictions get additional information about the amount of overlapping patches at the
         respective position (used for consolidation).
         :param batch: dictionary handed on to batch_tiling_forward. 'original_img_shape' is read here in the
                       patch-based case to size the whole-image outputs.
         :param patch_crops: None for non-patched prediction, else one crop per patch. As indexed below, a crop is
                       used as (start, stop) pairs: pc[0]:pc[1] on array axis 2, pc[2]:pc[3] on axis 3 and
                       pc[4]:pc[5] on axis 4 (dim == 3) or on axis 0 (2D case).
         :param n_aug: string. id of the current data augmentation instance, becomes part of 'patch_id'.
         :return. results_dict: stores the results for one patient. dictionary with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions,
                             and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z))
                  - losses (only in validation mode)
         """
         if patch_crops is not None:
 
             patches_dict = self.batch_tiling_forward(batch)
 
             results_dict = {'boxes': [[] for _ in range(batch['original_img_shape'][0])]}
 
             # instantiate segmentation output array. Will contain averages over patch predictions.
             # only channel 0 of the zero-array is kept, with the channel axis re-added as size 1.
             out_seg_preds = np.zeros(batch['original_img_shape'], dtype=np.float16)[:, 0][:, None]
             # counts patch instances per pixel-position.
             # NOTE(review): uint8 counter, would wrap around if more than 255 patches covered one position.
             patch_overlap_map = np.zeros_like(out_seg_preds, dtype='uint8')
 
             #unmold segmentation outputs. loop over patches.
             for pix, pc in enumerate(patch_crops):
                 if self.cf.dim == 3:
                     out_seg_preds[:, :, pc[0]:pc[1], pc[2]:pc[3], pc[4]:pc[5]] += patches_dict['seg_preds'][pix][None]
                     patch_overlap_map[:, :, pc[0]:pc[1], pc[2]:pc[3], pc[4]:pc[5]] += 1
                 else:
                     # 2D: pc[4]:pc[5] selects along axis 0 of the output array.
                     out_seg_preds[pc[4]:pc[5], :, pc[0]:pc[1], pc[2]:pc[3], ] += patches_dict['seg_preds'][pix]
                     patch_overlap_map[pc[4]:pc[5], :, pc[0]:pc[1], pc[2]:pc[3], ] += 1
 
             # take mean in overlapping areas. positions not covered by any patch are left at 0.
             out_seg_preds[patch_overlap_map > 0] /= patch_overlap_map[patch_overlap_map > 0]
             results_dict['seg_preds'] = out_seg_preds
 
             # unmold box outputs. loop over patches.
             for pix, pc in enumerate(patch_crops):
                 patch_boxes = patches_dict['boxes'][pix]
 
                 for box in patch_boxes:
 
                     # add unique patch id for consolidation of predictions.
                     # (string concatenation: self.rank_ix and n_aug have to be strings.)
                     box['patch_id'] = self.rank_ix + '_' + n_aug + '_' + str(pix)
 
                     # boxes from the edges of a patch have a lower prediction quality, than the ones at patch-centers.
                     # hence they will be downweighted for consolidation, using the 'box_patch_center_factor', which is
                     # obtained by a normal distribution over positions in the patch and average over spatial dimensions.
                     # Also the info 'box_n_overlaps' is stored for consolidation, which depicts the amount of
                     # overlapping patches at the box's position.
                     c = box['box_coords']
                     # box centers are computed from the still patch-local coordinates.
                     box_centers = [(c[ii] + c[ii + 2]) / 2 for ii in range(2)]
                     if self.cf.dim == 3:
                         box_centers.append((c[4] + c[5]) / 2)
                     # the factor norm.pdf(...) * sqrt(2 * pi) * scale equals 1 for a box centered in the patch.
                     # the 'pc' inside this comprehension is the patch half-size; in Python 3 it is local to the
                     # comprehension and does not overwrite the patch crop 'pc' of the enclosing loop.
                     box['box_patch_center_factor'] = np.mean(
                         [norm.pdf(bc, loc=pc, scale=pc * 0.8) * np.sqrt(2 * np.pi) * pc * 0.8 for bc, pc in
                          zip(box_centers, np.array(self.cf.patch_size) / 2)])
                     # shift to whole-image coordinates. 'c += ...' is in-place, i.e. it presumably updates
                     # box['box_coords'] itself (assumes box_coords is a numpy array -- TODO confirm).
                     # NOTE(review): the offsets put c[0], c[2] on array axis 2 and c[1], c[3] on axis 3, but
                     # patch_overlap_map is sliced below with int_c[1]:int_c[3] on axis 2 and int_c[0]:int_c[2] on
                     # axis 3 -- the two axes look swapped, confirm. int_c also rounds by index parity (even index:
                     # floor, odd index: ceil) rather than by lower/upper bound; an empty slice would make
                     # np.mean return nan.
                     if self.cf.dim == 3:
                         c += np.array([pc[0], pc[2], pc[0], pc[2], pc[4], pc[4]])
                         int_c = [int(np.floor(ii)) if ix%2 == 0 else int(np.ceil(ii)) for ix, ii in enumerate(c)]
                         box['box_n_overlaps'] = np.mean(patch_overlap_map[:, :, int_c[1]:int_c[3], int_c[0]:int_c[2], int_c[4]:int_c[5]])
                         results_dict['boxes'][0].append(box)
                     else:
                         c += np.array([pc[0], pc[2], pc[0], pc[2]])
                         int_c = [int(np.floor(ii)) if ix % 2 == 0 else int(np.ceil(ii)) for ix, ii in enumerate(c)]
                         box['box_n_overlaps'] = np.mean(patch_overlap_map[pc[4], :, int_c[1]:int_c[3], int_c[0]:int_c[2]])
                         # 2D: the box is filed under the batch element with index pc[4].
                         results_dict['boxes'][pc[4]].append(box)
 
             if self.mode == 'val':
                 try:
                     results_dict['torch_loss'] = patches_dict['torch_loss']
                     results_dict['class_loss'] = patches_dict['class_loss']
                 except KeyError:
                     # losses missing in the forward output are skipped silently.
                     pass
         # if predictions are not patch-based:
         # add patch-origin info to boxes (entire image is the same patch with overlap=1) and return results.
         else:
             results_dict = self.batch_tiling_forward(batch)
             for b in results_dict['boxes']:
                 for box in b:
                     box['box_patch_center_factor'] = 1
                     box['box_n_overlaps'] = 1
                     box['patch_id'] = self.rank_ix + '_' + n_aug
 
         return results_dict
 
 
     def batch_tiling_forward(self, batch):
         """
         runs the network forward pass for one patient. if the batch dimension holds more elements than
         cf.batch_size (e.g. many patches), the batch is cut into consecutive chunks of at most batch_size
         elements, each chunk is forwarded separately and the chunk outputs are concatenated again.
         validation mode uses net.train_forward (with is_validation=True) so that losses can be monitored, and
         keeps only boxes of type 'det'. any other mode uses net.test_forward.
         :param batch: dictionary. 'data' defines the batch dimension. when chunking, only entries that are numpy
                       arrays with the same first-axis length as 'data' are handed to the network.
         :return. results_dict: stores the results for one patient. dictionary with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions,
                             and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z))
                  - losses (only in validation mode)
         """
         is_val = self.mode == 'val'
 
         def run_net(net_input):
             # validation goes through the training forward pass to obtain losses.
             if is_val:
                 return self.net.train_forward(net_input, is_validation=True)
             return self.net.test_forward(net_input, return_masks=self.cf.return_masks_in_test)
 
         def detections_only(boxes_per_element):
             # drop ground-truth boxes (and any other non-'det' boxes) returned by the training forward pass.
             return [[box for box in element if box['box_type'] == 'det'] for element in boxes_per_element]
 
         img = batch['data']
         n_elements = img.shape[0]
 
         # small enough: one single forward pass.
         if n_elements <= self.cf.batch_size:
             results_dict = run_net(batch)
             if is_val:
                 results_dict['boxes'] = detections_only(results_dict['boxes'])
             return results_dict
 
         # split element indices at every multiple of batch_size. the split at 0 produces an empty leading
         # chunk, which is dropped.
         element_ixs = np.arange(n_elements)
         index_chunks = np.split(element_ixs, element_ixs[::self.cf.batch_size])[1:]
 
         chunk_dicts = []
         for chunk_ixs in index_chunks:
             sub_batch = {}
             for key in batch.keys():
                 entry = batch[key]
                 if isinstance(entry, np.ndarray) and entry.shape[0] == n_elements:
                     sub_batch[key] = entry[chunk_ixs]
             chunk_dicts.append(run_net(sub_batch))
 
         # flatten out batch elements from chunks ([chunk, chunk] -> [b, b, b, b, ...])
         results_dict = {
             'boxes': [element for d in chunk_dicts for element in d['boxes']],
             'seg_preds': np.array([element for d in chunk_dicts for element in d['seg_preds']]),
         }
 
         if is_val:
             try:
                 # chunk-wise mean as estimate of the metrics. Most similar to training metrics.
                 results_dict['torch_loss'] = torch.mean(torch.cat([d['torch_loss'] for d in chunk_dicts]))
                 results_dict['class_loss'] = np.mean([d['class_loss'] for d in chunk_dicts])
             except KeyError:
                 # losses are not necessarily monitored
                 pass
             results_dict['boxes'] = detections_only(results_dict['boxes'])
 
         return results_dict
 
 
 
 def apply_wbc_to_patient(inputs):
     """
     per-patient wrapper around weighted_box_clustering (consolidation of box predictions).
     for every batch element of the patient (1 in 3D, slices in 2D) and every class id in class_dict, the 'det'
     boxes predicted as that class are clustered and replaced by the consolidated boxes. consolidated boxes only
     carry the keys 'box_type', 'box_coords', 'box_score' and 'box_pred_class_id'. boxes of type 'gt' are
     appended unchanged after the consolidated detections of their batch element.
     :param inputs: sequence (in_patient_results_list, pid, class_dict, wcs_iou, n_ens), packed into one
                    argument so the function can be used with pool.map.
     :return. patient_results_list: list over batch elements. each element is a list over boxes, where each box is
                                  one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D
                                  predictions, and a dummy batch dimension of 1 for 3D predictions.
     :return. pid: string. patient id.
     """
     in_patient_results_list, pid, class_dict, wcs_iou, n_ens = inputs
     out_patient_results_list = []
 
     for element_boxes in in_patient_results_list:
         consolidated = []
 
         for cl in list(class_dict.keys()):
             class_dets = [box for box in element_boxes
                           if box['box_type'] == 'det' and box['box_pred_class_id'] == cl]
             box_coords = np.array([det['box_coords'] for det in class_dets])
             box_scores = np.array([det['box_score'] for det in class_dets])
             box_center_factor = np.array([det['box_patch_center_factor'] for det in class_dets])
             box_n_overlaps = np.array([det['box_n_overlaps'] for det in class_dets])
             box_patch_id = np.array([det['patch_id'] for det in class_dets])
 
             # nothing predicted for this class in this batch element.
             if 0 in box_scores.shape:
                 continue
 
             # one row per box: coordinates, score, patch center factor, number of overlaps.
             clustering_input = np.concatenate(
                 (box_coords, box_scores[:, None], box_center_factor[:, None], box_n_overlaps[:, None]), axis=1)
             keep_scores, keep_coords = weighted_box_clustering(clustering_input, box_patch_id, wcs_iou, n_ens)
 
             for score, coords in zip(keep_scores, keep_coords):
                 consolidated.append({'box_type': 'det', 'box_coords': coords,
                                      'box_score': score, 'box_pred_class_id': cl})
 
         # add gt boxes back to new output list.
         consolidated.extend([box for box in element_boxes if box['box_type'] == 'gt'])
         out_patient_results_list.append(consolidated)
 
     return [out_patient_results_list, pid]
 
 
 
 def merge_2D_to_3D_preds_per_patient(inputs):
     """
     per-patient wrapper around nms_2to3D. takes 2D patient results (slices in batch dimension) and returns 3D
     patient results (dummy batch dimension of 1). for every class id in class_dict, the 'det' boxes of all slices
     are collected together with their slice index and merged by nms_2to3D; every kept box gets the two returned
     z-coordinates appended to its 2D coordinates. 'gt' boxes are passed through unchanged and are required to
     have 6 coordinates already.
     :param inputs: sequence (in_patient_results_list, pid, class_dict, merge_3D_iou), packed into one
                    argument so the function can be used with pool.map.
     :return. results_dict_boxes: list over batch elements (1 in 3D). each element is a list over boxes, where each box is
                                  one dictionary: [[box_0, ...], [box_n,...]].
     :return. pid: string. patient id.
     """
     in_patient_results_list, pid, class_dict, merge_3D_iou = inputs
     merged_boxes = []
 
     for cl in list(class_dict.keys()):
         # gather this class' detections over all slices and remember the slice each one came from.
         class_dets, slice_ids = [], []
         for slice_ix, slice_boxes in enumerate(in_patient_results_list):
             hits = [box for box in slice_boxes
                     if box['box_type'] == 'det' and box['box_pred_class_id'] == cl]
             class_dets.extend(hits)
             slice_ids.extend([slice_ix] * len(hits))
 
         box_coords = np.array([det['box_coords'] for det in class_dets])
         box_scores = np.array([det['box_score'] for det in class_dets])
         slice_ids = np.array(slice_ids)
 
         # no detections of this class: nothing to merge.
         if 0 in box_scores.shape:
             continue
 
         nms_input = np.concatenate((box_coords, box_scores[:, None], slice_ids[:, None]), axis=1)
         keep_ix, keep_z = nms_2to3D(nms_input, merge_3D_iou)
 
         # store kept predictions in new results list and add corresponding z-dimension info to coordinates.
         for kix, kz in zip(keep_ix, keep_z):
             merged_boxes.append({'box_type': 'det', 'box_coords': list(box_coords[kix]) + kz,
                                  'box_score': box_scores[kix], 'box_pred_class_id': cl})
 
     gt_boxes = [box for slice_boxes in in_patient_results_list for box in slice_boxes if box['box_type'] == 'gt']
     if len(gt_boxes) > 0:
         assert np.all([len(box["box_coords"]) == 6 for box in gt_boxes]), "expanded preds to 3D but GT is 2D."
     merged_boxes += gt_boxes
 
     # add dummy batch dimension 1 for 3D.
     return [[merged_boxes], pid]
 
 
 
 def weighted_box_clustering(dets, box_patch_id, thresh, n_ens):
     """
     consolidates overlapping predictions resulting from patch overlaps, test data augmentations and temporal ensembling.
     clusters predictions together with iou > thresh (like in NMS). Output score and coordinate for one cluster are the
     average weighted by individual patch center factors (how trustworthy is this candidate measured by how centered
     its position in the patch is), the overlap with the highest scoring box of the cluster and the size of the
     corresponding box.
     The number of expected predictions at a position is n_data_aug * n_temp_ens * n_overlaps_at_position
     (1 prediction per unique patch). Missing predictions at a cluster position are defined as the number of expected
     predictions minus the number of unique patches, which contributed a box to the cluster.
     The highest scoring remaining box is always assigned to (and removed with) its own cluster, so every iteration
     consumes at least one box and the loop terminates for any thresh and any box geometry.
     :param dets: (n_dets, (y1, x1, y2, x2, (z1), (z2), scores, box_pc_facts, box_n_ovs)
     :param box_patch_id: array (n_dets) of patch ids, used to count the unique patches contributing to a cluster.
     :param thresh: threshold for iou_matching.
     :param n_ens: number of models, that are ensembled. (-> number of expected predictions per position)
     :return: keep_scores: (n_keep)  new scores of boxes to be kept.
     :return: keep_coords: (n_keep, (y1, x1, y2, x2, (z1), (z2)) new coordinates of boxes to be kept.
     """
     dim = 2 if dets.shape[1] == 7 else 3
     y1 = dets[:, 0]
     x1 = dets[:, 1]
     y2 = dets[:, 2]
     x2 = dets[:, 3]
     scores = dets[:, -3]
     box_pc_facts = dets[:, -2]
     box_n_ovs = dets[:, -1]
 
     areas = (y2 - y1 + 1) * (x2 - x1 + 1)
     coord_columns = [y1, x1, y2, x2]
 
     if dim == 3:
         z1 = dets[:, 4]
         z2 = dets[:, 5]
         areas *= (z2 - z1 + 1)
         coord_columns += [z1, z2]
 
     # order is the sorted index.  maps order to index o[1] = 24 (rank1, ix 24)
     order = scores.argsort()[::-1]
 
     keep_scores = []
     keep_coords = []
 
     while order.size > 0:
         i = order[0]  # highest scoring element
         xx1 = np.maximum(x1[i], x1[order])
         yy1 = np.maximum(y1[i], y1[order])
         xx2 = np.minimum(x2[i], x2[order])
         yy2 = np.minimum(y2[i], y2[order])
 
         w = np.maximum(0.0, xx2 - xx1 + 1)
         h = np.maximum(0.0, yy2 - yy1 + 1)
         inter = w * h
 
         if dim == 3:
             zz1 = np.maximum(z1[i], z1[order])
             zz2 = np.minimum(z2[i], z2[order])
             d = np.maximum(0.0, zz2 - zz1 + 1)
             inter *= d
 
         # overlap between currently highest scoring box and all boxes.
         ovr = inter / (areas[i] + areas[order] - inter)
 
         # get all the predictions that match the current box to build one cluster.
         # position 0 is the current box itself: for a regular box its self-overlap is 1 and it matches anyway.
         # it is forced into the cluster to cover thresh >= 1 and malformed boxes (e.g. inverted coordinates),
         # where it would otherwise never leave 'order' and the loop would not terminate.
         in_cluster = ovr > thresh
         in_cluster[0] = True
         members = order[in_cluster]
 
         # weight all scores in cluster by overlap with the top box, size, and patch center factors.
         match_score_weights = ovr[in_cluster] * areas[members] * box_pc_facts[members]
         match_scores = scores[members] * match_score_weights
 
         # for the weighted average, scores have to be divided by the number of total expected preds at the position
         # of the current cluster. 1 Prediction per patch is expected. therefore, the number of ensembled models is
         # multiplied by the mean overlaps of  patches at this position (boxes of the cluster might partly be
         # in areas of different overlaps).
         n_expected_preds = n_ens * np.mean(box_n_ovs[members])
 
         # the number of missing predictions is obtained as the number of patches,
         # which did not contribute any prediction to the current cluster.
         n_missing_preds = np.max((0, n_expected_preds - np.unique(box_patch_id[members]).shape[0]))
 
         # missing preds are given the mean weighting
         # (expected prediction is the mean over all predictions in cluster).
         denom = np.sum(match_score_weights) + n_missing_preds * np.mean(match_score_weights)
 
         # compute weighted average score for the cluster
         score_sum = np.sum(match_scores)
         avg_score = score_sum / denom
 
         # compute weighted average of coordinates for the cluster. now only take existing
         # predictions into account.
         avg_coords = [np.sum(column[members] * match_scores) / score_sum for column in coord_columns]
 
         # some clusters might have very low scores due to high amounts of missing predictions.
         # filter out the with a conservative threshold, to speed up evaluation.
         # (a nan score, e.g. from a zero-area box, fails this comparison and is dropped as well.)
         if avg_score > 0.01:
             keep_scores.append(avg_score)
             keep_coords.append(avg_coords)
 
         # continue with all elements that were not matched; the current box is always consumed.
         remaining = ovr <= thresh
         remaining[0] = False
         order = order[remaining]
 
     return keep_scores, keep_coords
 
 
 
 def nms_2to3D(dets, thresh):
     """
     Merges 2D boxes to 3D cubes. All boxes are treated as if projected into one slice and clustered like in
     Non-Maximum Suppression: the highest scoring remaining box (core box) collects all boxes with iou > thresh.
     In contrast to plain NMS, only those boxes of the cluster are suppressed, whose slice ids are 'connected' to
     the slice of the core box, i.e. reachable from the core slice without crossing a slice id, for which the
     cluster holds no box.
 
     example: a cluster of predictions was found with overlap > iou thresh in xy. The slice id of the core box is
     50. Other predictions have 23, 46, 48, 49, 51, 52, 53, 56, 57. Only 48, 49, 51, 52, 53 are connected with 50
     and form the cube (46 is cut off by the 'hole' at 47, 56 by the hole at 54). Boxes outside of the connected
     range stay in the pool and can build further clusters.
 
     The returned z-range of a cube is (lowest connected slice id - 1, highest connected slice id + 1).
 
     This algorithm works better with a certain min_confidence of predictions, because low confidence (e.g.
     noisy/cluttery) predictions can break the relatively strong assumption of defining cubes' z-boundaries at the
     first 'hole' in the cluster.
 
     :param dets: (n_detections, (y1, x1, y2, x2, scores, slice_id)
     :param thresh: iou matching threshold (like in NMS).
     :return: keep: (n_keep) list of indices (rows of dets) to be kept.
     :return: keep_z: (n_keep, [z1, z2]) z-coordinates to be added to boxes, which are kept in order to form cubes.
     """
     y1, x1, y2, x2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
     scores = dets[:, -2]
     slice_id = dets[:, -1]
 
     areas = (x2 - x1 + 1) * (y2 - y1 + 1)
     # indices of dets, best score first.
     order = scores.argsort()[::-1]
 
     keep, keep_z = [], []
 
     while order.size > 0:
         i = order[0]  # core box: highest scoring box still in the pool.
 
         # iou of the core box with every box in the pool (slice ids are ignored here).
         inter_w = np.maximum(0.0, np.minimum(x2[i], x2[order]) - np.maximum(x1[i], x1[order]) + 1)
         inter_h = np.maximum(0.0, np.minimum(y2[i], y2[order]) - np.maximum(y1[i], y1[order]) + 1)
         inter = inter_w * inter_h
         ovr = inter / (areas[i] + areas[order] - inter)
 
         # positions (within order) of all boxes in the xy-cluster, and their slice ids.
         hits = np.flatnonzero(ovr > thresh)
         cluster_slices = slice_id[order[hits]]
         core_slice = slice_id[int(i)]
         top_slice = np.max(cluster_slices)
         bottom_slice = np.min(cluster_slices)
 
         # slice ids between core and cluster extremes, for which the cluster holds no box.
         upper_holes = [s for s in np.arange(core_slice, top_slice) if s not in cluster_slices]
         lower_holes = [s for s in np.arange(bottom_slice, core_slice) if s not in cluster_slices]
 
         # the connected range ends at the hole closest to the core slice (holes are listed in ascending order).
         max_valid_slice_id = upper_holes[0] if len(upper_holes) > 0 else top_slice
         min_valid_slice_id = lower_holes[-1] if len(lower_holes) > 0 else bottom_slice
         connected = hits[(cluster_slices <= max_valid_slice_id) & (cluster_slices >= min_valid_slice_id)]
 
         connected_slices = slice_id[order[connected]]
         keep.append(i)
         keep_z.append([np.min(connected_slices) - 1, np.max(connected_slices) + 1])
 
         # only the connected boxes are consumed by this cube.
         order = np.delete(order, connected, axis=0)
 
     return keep, keep_z
 
 
 
 def get_mirrored_patch_crops(patch_crops, org_img_shape):
     """
     apply the 3 mirror transformations used for test time augmentation to the given patch crop coordinates
     and return the transformed coordinates. Handles 2D (4 values) and 3D (6 values) crops.
     The first pair of values of a crop is mirrored w.r.t. org_img_shape[2], the second pair w.r.t.
     org_img_shape[3]; mirroring a pair (a, b) gives (extent - b, extent - a). A third pair is passed on unchanged.
     :param patch_crops: list of crops: each element is a list of 4 or 6 coordinates for the given crop.
     :param org_img_shape: shape of patient volume used as world coordinates.
     :return: list of mirrored patch crops: length=3, in the order (first pair mirrored, second pair mirrored,
              both pairs mirrored). each element is a list of transformed patch crops.
     """
     def mirror_crop(crop, flip_first, flip_second):
         if flip_first:
             first_pair = [org_img_shape[2] - crop[1], org_img_shape[2] - crop[0]]
         else:
             first_pair = [crop[0], crop[1]]
 
         if flip_second:
             second_pair = [org_img_shape[3] - crop[3], org_img_shape[3] - crop[2]]
         else:
             second_pair = [crop[2], crop[3]]
 
         # everything that is not a 4-value crop is treated as a 6-value crop.
         rest = [] if len(crop) == 4 else [crop[4], crop[5]]
         return first_pair + second_pair + rest
 
     flip_settings = ((True, False), (False, True), (True, True))
     return [[mirror_crop(crop, flip_first, flip_second) for crop in patch_crops]
             for flip_first, flip_second in flip_settings]
 
 
 
diff --git a/utils/exp_utils.py b/utils/exp_utils.py
index a06fa95..d0f5264 100644
--- a/utils/exp_utils.py
+++ b/utils/exp_utils.py
@@ -1,482 +1,488 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 from typing import Iterable, Tuple, Any
 import sys
 import subprocess
+from multiprocessing import Process
 import os
 
 import plotting
 import importlib.util
 import pickle
 
 import logging
 from torch.utils.tensorboard import SummaryWriter
 
 from collections import OrderedDict
 import numpy as np
 import torch
 import pandas as pd
 
+def split_off_process(target, *args, daemon=False, **kwargs):
+    """Start a process that won't block parent script.
+    No join() is called here; the started Process is returned so the caller may join() it. If daemon=False: before parent exits, it waits for this to finish.
+    """
+    p = Process(target=target, args=tuple(args), kwargs=kwargs, daemon=daemon)
+    p.start()
+    return p
 
 class CombinedLogger(object):
     """Combine console and tensorboard logger and record system metrics.
     """
 
     def __init__(self, name, log_dir, server_env=True, fold="all"):
         self.pylogger = logging.getLogger(name)
         self.tboard = SummaryWriter(log_dir=os.path.join(log_dir, "tboard"))
         self.log_dir = log_dir
         self.fold = str(fold)
         self.server_env = server_env
 
         self.pylogger.setLevel(logging.DEBUG)
         self.log_file = os.path.join(log_dir, "fold_"+self.fold, 'exec.log')
         os.makedirs(os.path.dirname(self.log_file), exist_ok=True)
         self.pylogger.addHandler(logging.FileHandler(self.log_file))
         if not server_env:
             self.pylogger.addHandler(ColorHandler())
         else:
             self.pylogger.addHandler(logging.StreamHandler())
         self.pylogger.propagate = False
 
     def __getattr__(self, attr):
         """delegate all undefined method requests to objects of
         this class in order pylogger, tboard (first find first serve).
         E.g., combinedlogger.add_scalars(...) should trigger self.tboard.add_scalars(...)
         """
         for obj in [self.pylogger, self.tboard]:
             if attr in dir(obj):
                 return getattr(obj, attr)
         print("logger attr not found")
 
     def set_logfile(self, fold=None, log_file=None):
         if fold is not None:
             self.fold = str(fold)
         if log_file is None:
             self.log_file = os.path.join(self.log_dir, "fold_"+self.fold, 'exec.log')
         else:
             self.log_file = log_file
         os.makedirs(os.path.dirname(self.log_file), exist_ok=True)
         for hdlr in self.pylogger.handlers:
             hdlr.close()
         self.pylogger.handlers = []
         self.pylogger.addHandler(logging.FileHandler(self.log_file))
         if not self.server_env:
             self.pylogger.addHandler(ColorHandler())
         else:
             self.pylogger.addHandler(logging.StreamHandler())
 
     def metrics2tboard(self, metrics, global_step=None, suptitle=None):
         """
         :param metrics: {'train': dataframe, 'val':df}, df as produced in
             evaluator.py.evaluate_predictions
         """
         # print("metrics", metrics)
         if global_step is None:
             global_step = len(metrics['train'][list(metrics['train'].keys())[0]]) - 1
         if suptitle is not None:
             suptitle = str(suptitle)
         else:
             suptitle = "Fold_" + str(self.fold)
 
         for key in ['train', 'val']:
             # series = {k:np.array(v[-1]) for (k,v) in metrics[key].items() if not np.isnan(v[-1]) and not 'Bin_Stats' in k}
             loss_series = {}
-            unc_series = {}
-            bin_stat_series = {}
             mon_met_series = {}
             for tag, val in metrics[key].items():
                 val = val[-1]  # maybe remove list wrapping, recording in evaluator?
                 if 'loss' in tag.lower() and not np.isnan(val):
                     loss_series["{}".format(tag)] = val
                 elif not np.isnan(val):
                     mon_met_series["{}".format(tag)] = val
 
             self.tboard.add_scalars(suptitle + "/Losses/{}".format(key), loss_series, global_step)
             self.tboard.add_scalars(suptitle + "/Monitor_Metrics/{}".format(key), mon_met_series, global_step)
         self.tboard.add_scalars(suptitle + "/Learning_Rate", metrics["lr"], global_step)
         return
 
     def __del__(self):  # otherwise might produce multiple prints e.g. in ipython console
         for hdlr in self.pylogger.handlers:
             hdlr.close()
         self.pylogger.handlers = []
         del self.pylogger
         self.tboard.flush()
         # close somehow prevents main script from exiting
         # maybe revise this issue in a later pytorch version
         #self.tboard.close()
 
 
 def get_logger(exp_dir, server_env=False):
     """
     creates logger instance. writing out info to file, to terminal and to tensorboard.
     :param exp_dir: experiment directory, where exec.log file is stored.
     :param server_env: True if operating in server environment (e.g., gpu cluster)
     :return: custom CombinedLogger instance.
     """
     log_dir = os.path.join(exp_dir, "logs")
     logger = CombinedLogger('medicaldetectiontoolkit', log_dir, server_env=server_env)
     print("Logging to {}".format(logger.log_file))
     return logger
 
 
 def prep_exp(dataset_path, exp_path, server_env, use_stored_settings=True, is_training=True):
     """
     I/O handling, creating of experiment folder structure. Also creates a snapshot of configs/model scripts and copies them to the exp_dir.
     This way the exp_dir contains all info needed to conduct an experiment, independent to changes in actual source code. Thus, training/inference of this experiment can be started at anytime. Therefore, the model script is copied back to the source code dir as tmp_model (tmp_backbone).
     Provides robust structure for cloud deployment.
     :param dataset_path: path to source code for specific data set. (e.g. medicaldetectiontoolkit/lidc_exp)
     :param exp_path: path to experiment directory.
     :param server_env: boolean flag. pass to configs script for cloud deployment.
     :param use_stored_settings: boolean flag. When starting training: If True, starts training from snapshot in existing experiment directory, else creates experiment directory on the fly using configs/model scripts from source code.
     :param is_training: boolean flag. distinguishes train vs. inference mode.
     :return:
     """
 
     if is_training:
         if use_stored_settings:
             cf_file = import_module('cf_file', os.path.join(exp_path, 'configs.py'))
             cf = cf_file.configs(server_env)
             # in this mode, previously saved model and backbone need to be found in exp dir.
             if not os.path.isfile(os.path.join(exp_path, 'model.py')) or \
                     not os.path.isfile(os.path.join(exp_path, 'backbone.py')):
                 raise Exception(
                     "Selected use_stored_settings option but no model and/or backbone source files exist in exp dir.")
             cf.model_path = os.path.join(exp_path, 'model.py')
             cf.backbone_path = os.path.join(exp_path, 'backbone.py')
         else:
             # this case overwrites settings files in exp dir, i.e., default_configs, configs, backbone, model
             os.makedirs(exp_path, exist_ok=True)
             # run training with source code info and copy snapshot of model to exp_dir for later testing (overwrite scripts if exp_dir already exists.)
             subprocess.call('cp {} {}'.format('default_configs.py', os.path.join(exp_path, 'default_configs.py')),
                             shell=True)
             subprocess.call(
                 'cp {} {}'.format(os.path.join(dataset_path, 'configs.py'), os.path.join(exp_path, 'configs.py')),
                 shell=True)
             cf_file = import_module('cf_file', os.path.join(dataset_path, 'configs.py'))
             cf = cf_file.configs(server_env)
             subprocess.call('cp {} {}'.format(cf.model_path, os.path.join(exp_path, 'model.py')), shell=True)
             subprocess.call('cp {} {}'.format(cf.backbone_path, os.path.join(exp_path, 'backbone.py')), shell=True)
             if os.path.isfile(os.path.join(exp_path, "fold_ids.pickle")):
                 subprocess.call('rm {}'.format(os.path.join(exp_path, "fold_ids.pickle")), shell=True)
 
     else:
         # testing, use model and backbone stored in exp dir.
         cf_file = import_module('cf_file', os.path.join(exp_path, 'configs.py'))
         cf = cf_file.configs(server_env)
         cf.model_path = os.path.join(exp_path, 'model.py')
         cf.backbone_path = os.path.join(exp_path, 'backbone.py')
 
 
     cf.exp_dir = exp_path
     cf.test_dir = os.path.join(cf.exp_dir, 'test')
     cf.plot_dir = os.path.join(cf.exp_dir, 'plots')
     if not os.path.exists(cf.test_dir):
         os.mkdir(cf.test_dir)
     if not os.path.exists(cf.plot_dir):
         os.mkdir(cf.plot_dir)
     cf.experiment_name = exp_path.split("/")[-1]
     cf.created_fold_id_pickle = False
 
     return cf
 
 
 
 def import_module(name, path):
     """
     correct way of importing a module dynamically in python 3.
     :param name: name given to module instance.
     :param path: path to module.
     :return: module: returned module instance.
     """
     spec = importlib.util.spec_from_file_location(name, path)
     module = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(module)
     return module
 
 
 def set_params_flag(module: torch.nn.Module, flag: Tuple[str, Any], check_overwrite: bool = True):
     """Set an attribute for all module parameters.
 
     :param flag: tuple (str attribute name : attr value)
     :param check_overwrite: if True, assert that attribute not already exists.
 
     """
     for param in module.parameters():
         if check_overwrite:
             assert not hasattr(param, flag[0]), \
                 "param {} already has attr {} (w/ val {})".format(param, flag[0], getattr(param, flag[0]))
         setattr(param, flag[0], flag[1])
     return module
 
 def parse_params_for_optim(net: torch.nn.Module, weight_decay: float = 0., exclude_from_wd: Iterable = ("norm",)):
     """Format network parameters for the optimizer.
     Convenience function to include options for group-specific settings like weight decay.
     :param net:
     :param weight_decay:
     :param exclude_from_wd: List of strings of parameter-group names to exclude from weight decay. Options: "norm", "bias".
     :return:
     """
     # pytorch implements parameter groups as dicts {'params': ...} and
     # weight decay as p.data.mul_(1 - group['lr'] * group['weight_decay'])
     norm_types = [torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d,
                   torch.nn.InstanceNorm1d, torch.nn.InstanceNorm2d, torch.nn.InstanceNorm3d,
                   torch.nn.LayerNorm, torch.nn.GroupNorm, torch.nn.SyncBatchNorm, torch.nn.LocalResponseNorm
                   ]
     level_map = {"bias": "weight",
                  "norm": "module"}
     type_map = {"norm": norm_types}
 
     exclude_from_wd = [str(name).lower() for name in exclude_from_wd]
     exclude_weight_names = [k for k, v in level_map.items() if k in exclude_from_wd and v == "weight"]
     exclude_module_types = tuple([type_ for k, v in level_map.items() if (k in exclude_from_wd and v == "module")
                                   for type_ in type_map[k]])
 
     if exclude_from_wd:
         print("excluding {} from weight decay.".format(exclude_from_wd))
 
     for module in net.modules():
         if isinstance(module, exclude_module_types):
             set_params_flag(module, ("no_wd", True))
     for param_name, param in net.named_parameters():
         if np.any([ename in param_name for ename in exclude_weight_names]):
             setattr(param, "no_wd", True)
 
     with_dec, no_dec = [], []
     for param in net.parameters():
         if hasattr(param, "no_wd") and param.no_wd == True:
             no_dec.append(param)
         else:
             with_dec.append(param)
     orig_ps = sum(p.numel() for p in net.parameters())
     with_ps = sum(p.numel() for p in with_dec)
     wo_ps = sum(p.numel() for p in no_dec)
     assert orig_ps == with_ps + wo_ps, "orig n parameters {} unequals sum of with wd {} and w/o wd {}."\
         .format(orig_ps, with_ps, wo_ps)
 
     groups = [{'params': gr, 'weight_decay': wd} for (gr, wd) in [(no_dec, 0.), (with_dec, weight_decay)] if len(gr)>0]
     return groups
 
 
 class ModelSelector:
     '''
     saves a checkpoint after each epoch as 'last_state' (can be loaded to continue interrupted training).
     saves the top-k (k=cf.save_n_models) ranked epochs. In inference, predictions of multiple epochs can be ensembled to improve performance.
     '''
 
     def __init__(self, cf, logger):
 
         self.cf = cf
         self.saved_epochs = [-1] * cf.save_n_models
         self.logger = logger
 
     def run_model_selection(self, net, optimizer, monitor_metrics, epoch):
 
         # take the mean over all selection criteria in each epoch
         non_nan_scores = np.mean(np.array([[0 if (ii is None or np.isnan(ii)) else ii for ii in monitor_metrics['val'][sc]] for sc in self.cf.model_selection_criteria]), 0)
         epochs_scores = [ii for ii in non_nan_scores[1:]]
         # ranking of epochs according to model_selection_criterion
         epoch_ranking = np.argsort(epochs_scores, kind="stable")[::-1] + 1 #epochs start at 1
         # if set in configs, epochs < min_save_thresh are discarded from saving process.
         epoch_ranking = epoch_ranking[epoch_ranking >= self.cf.min_save_thresh]
 
         # check if current epoch is among the top-k epochs.
         if epoch in epoch_ranking[:self.cf.save_n_models]:
 
             save_dir = os.path.join(self.cf.fold_dir, '{}_best_checkpoint'.format(epoch))
             if not os.path.exists(save_dir):
                 os.mkdir(save_dir)
 
             torch.save(net.state_dict(), os.path.join(save_dir, 'params.pth'))
             with open(os.path.join(save_dir, 'monitor_metrics.pickle'), 'wb') as handle:
                 pickle.dump(monitor_metrics, handle)
             # save epoch_ranking to keep info for inference.
             np.save(os.path.join(self.cf.fold_dir, 'epoch_ranking'), epoch_ranking[:self.cf.save_n_models])
             np.save(os.path.join(save_dir, 'epoch_ranking'), epoch_ranking[:self.cf.save_n_models])
 
             self.logger.info(
                 "saving current epoch {} at rank {}".format(epoch, np.argwhere(epoch_ranking == epoch)))
             # delete params of the epoch that just fell out of the top-k epochs.
             for se in [int(ii.split('_')[0]) for ii in os.listdir(self.cf.fold_dir) if 'best_checkpoint' in ii]:
                 if se in epoch_ranking[self.cf.save_n_models:]:
                     subprocess.call('rm -rf {}'.format(os.path.join(self.cf.fold_dir, '{}_best_checkpoint'.format(se))), shell=True)
                     self.logger.info('deleting epoch {} at rank {}'.format(se, np.argwhere(epoch_ranking == se)))
 
         state = {
             'epoch': epoch,
             'state_dict': net.state_dict(),
             'optimizer': optimizer.state_dict(),
         }
 
         # save checkpoint of current epoch.
         save_dir = os.path.join(self.cf.fold_dir, 'last_checkpoint'.format(epoch))
         if not os.path.exists(save_dir):
             os.mkdir(save_dir)
         torch.save(state, os.path.join(save_dir, 'params.pth'))
         np.save(os.path.join(save_dir, 'epoch_ranking'), epoch_ranking[:self.cf.save_n_models])
         with open(os.path.join(save_dir, 'monitor_metrics.pickle'), 'wb') as handle:
             pickle.dump(monitor_metrics, handle)
 
 
 
 def load_checkpoint(checkpoint_path, net, optimizer):
 
     checkpoint = torch.load(os.path.join(checkpoint_path, 'params.pth'))
     net.load_state_dict(checkpoint['state_dict'])
     optimizer.load_state_dict(checkpoint['optimizer'])
     with open(os.path.join(checkpoint_path, 'monitor_metrics.pickle'), 'rb') as handle:
         monitor_metrics = pickle.load(handle)
     starting_epoch = checkpoint['epoch'] + 1
     return starting_epoch, net, optimizer, monitor_metrics
 
 
 
 def prepare_monitoring(cf):
     """
     creates dictionaries, where train/val metrics are stored.
     """
     metrics = {}
     # first entry for loss dict accounts for epoch starting at 1.
     metrics['train'] = OrderedDict()
     metrics['val'] = OrderedDict()
     metric_classes = []
     if 'rois' in cf.report_score_level:
         metric_classes.extend([v for k, v in cf.class_dict.items()])
     if 'patient' in cf.report_score_level:
         metric_classes.extend(['patient'])
     for cl in metric_classes:
         metrics['train'][cl + '_ap'] = [np.nan]
         metrics['val'][cl + '_ap'] = [np.nan]
         if cl == 'patient':
             metrics['train'][cl + '_auc'] = [np.nan]
             metrics['val'][cl + '_auc'] = [np.nan]
 
     return metrics
 
 
 
 def create_csv_output(results_list, cf, logger):
     """
     Write out test set predictions to .csv file. output format is one line per prediction:
     PatientID | PredictionID | [y1 x1 y2 x2 (z1) (z2)] | score | pred_classID
     Note, that prediction coordinates correspond to images as loaded for training/testing and need to be adapted when
     plotted over raw data (before preprocessing/resampling).
     :param results_list: [[patient_results, patient_id], [patient_results, patient_id], ...]
     """
 
     logger.info('creating csv output file at {}'.format(os.path.join(cf.test_dir, 'results.csv')))
     predictions_df = pd.DataFrame(columns = ['patientID', 'predictionID', 'coords', 'score', 'pred_classID'])
     for r in results_list:
 
         pid = r[1]
 
         #optionally load resampling info from preprocessing to match output predictions with raw data.
         #with open(os.path.join(cf.exp_dir, 'test_resampling_info', pid), 'rb') as handle:
         #    resampling_info = pickle.load(handle)
 
         for bix, box in enumerate(r[0][0]):
             if box["box_type"] == "gt":
                 continue
             assert box['box_type'] == 'det', box['box_type']
             coords = box['box_coords']
             score = box['box_score']
             pred_class_id = box['box_pred_class_id']
             out_coords = []
             if score >= cf.min_det_thresh:
                 out_coords.append(coords[0]) #* resampling_info['scale'][0])
                 out_coords.append(coords[1]) #* resampling_info['scale'][1])
                 out_coords.append(coords[2]) #* resampling_info['scale'][0])
                 out_coords.append(coords[3]) #* resampling_info['scale'][1])
                 if len(coords) > 4:
                     out_coords.append(coords[4]) #* resampling_info['scale'][2] + resampling_info['z_crop'])
                     out_coords.append(coords[5]) #* resampling_info['scale'][2] + resampling_info['z_crop'])
 
                 predictions_df.loc[len(predictions_df)] = [pid, bix, out_coords, score, pred_class_id]
     try:
         fold = cf.fold
     except:
         fold = 'hold_out'
     predictions_df.to_csv(os.path.join(cf.exp_dir, 'results_{}.csv'.format(fold)), index=False)
 
 
 
 class _AnsiColorizer(object):
     """
     A colorizer is an object that loosely wraps around a stream, allowing
     callers to write text to the stream in a particular color.
 
     Colorizer classes must implement C{supported()} and C{write(text, color)}.
     """
     _colors = dict(black=30, red=31, green=32, yellow=33,
                    blue=34, magenta=35, cyan=36, white=37, default=39)
 
     def __init__(self, stream):
         self.stream = stream
 
     @classmethod
     def supported(cls, stream=sys.stdout):
         """
         A class method that returns True if the current platform supports
         coloring terminal output using this method. Returns False otherwise.
         """
         if not stream.isatty():
             return False  # auto color only on TTYs
         try:
             import curses
         except ImportError:
             return False
         else:
             try:
                 try:
                     return curses.tigetnum("colors") > 2
                 except curses.error:
                     curses.setupterm()
                     return curses.tigetnum("colors") > 2
             except:
                 raise
                 # guess false in case of error
                 return False
 
     def write(self, text, color):
         """
         Write the given text to the stream in the given color.
 
         @param text: Text to be written to the stream.
 
         @param color: A string label for a color. e.g. 'red', 'white'.
         """
         color = self._colors[color]
         self.stream.write('\x1b[%sm%s\x1b[0m' % (color, text))
 
 
 
 class ColorHandler(logging.StreamHandler):
 
 
     def __init__(self, stream=sys.stdout):
         super(ColorHandler, self).__init__(_AnsiColorizer(stream))
 
     def emit(self, record):
         msg_colors = {
             logging.DEBUG: "green",
             logging.INFO: "default",
             logging.WARNING: "red",
             logging.ERROR: "red"
         }
         color = msg_colors.get(record.levelno, "blue")
         self.stream.write(record.msg + "\n", color)