diff --git a/datasets/toy/configs.py b/datasets/toy/configs.py
index 94288ad..b30a08e 100644
--- a/datasets/toy/configs.py
+++ b/datasets/toy/configs.py
@@ -1,490 +1,490 @@
 #!/usr/bin/env python
 # Copyright 2019 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 import sys
 import os
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
 import numpy as np
 from default_configs import DefaultConfigs
 from collections import namedtuple
 
 boxLabel = namedtuple('boxLabel', ["name", "color"])
 Label = namedtuple("Label", ['id', 'name', 'shape', 'radius', 'color', 'regression', 'ambiguities', 'gt_distortion'])
 binLabel = namedtuple("binLabel", ['id', 'name', 'color', 'bin_vals'])
 
 class Configs(DefaultConfigs):
 
     def __init__(self, server_env=None):
         super(Configs, self).__init__(server_env)
 
         #########################
         #         Prepro        #
         #########################
 
         self.pp_rootdir = os.path.join('/home/gregor/datasets/toy', "cyl1ps_dev")
         self.pp_npz_dir = self.pp_rootdir+"_npz"
 
         self.pre_crop_size = [320,320,8] #y,x,z; determines pp data shape (2D easily implementable, but only 3D for now)
         self.min_2d_radius = 6 #in pixels
         self.n_train_samples, self.n_test_samples = 1200, 1000
 
         # not actually real one-hot encoding (ohe) but contains more info: roi-overlap only within classes.
         self.pp_create_ohe_seg = False
         self.pp_empty_samples_ratio = 0.1
 
         self.pp_place_radii_mid_bin = True
         self.pp_only_distort_2d = True
         # outer-most intensity of blurred radii, relative to inner-object intensity. <1 for decreasing, > 1 for increasing.
         # e.g.: setting 0.1 means blurred edge has min intensity 10% as large as inner-object intensity.
         self.pp_blur_min_intensity = 0.2
 
         self.max_instances_per_sample = 1 #how many max instances over all classes per sample (img if 2d, vol if 3d)
         self.max_instances_per_class = self.max_instances_per_sample  # how many max instances per image per class
         self.noise_scale = 0.  # std-dev of gaussian noise
 
         self.ambigs_sampling = "gaussian" #"gaussian" or "uniform"
         """ radius_calib: gt distort for calibrating uncertainty. Range of gt distortion is inferable from
             image by distinguishing it from the rest of the object.
             blurring width around edge will be shifted so that symmetric rel to orig radius.
             blurring scale: if self.ambigs_sampling is uniform, distribution's non-zero range (b-a) will be sqrt(12)*scale
             since uniform dist has variance (b-a)²/12. b,a will be placed symmetrically around unperturbed radius.
             if sampling is gaussian, then scale parameter sets one std dev, i.e., blurring width will be orig_radius * std_dev * 2.
         """
         self.ambiguities = {
              #set which classes to apply which ambs to below in class labels
              #choose out of: 'outer_radius', 'inner_radius', 'radii_relations'.
              #kind              #probability   #scale (gaussian std, relative to unperturbed value)
             #"outer_radius":     (1.,            0.5),
             #"outer_radius_xy":  (1.,            0.5),
             #"inner_radius":     (0.5,            0.1),
             #"radii_relations":  (0.5,            0.1),
             "radius_calib":     (1.,            1./6)
         }
 
         # shape choices: 'cylinder', 'block'
         #                        id,    name,       shape,      radius,                 color,              regression,     ambiguities,    gt_distortion
         self.pp_classes = [Label(1,     'cylinder', 'cylinder', ((6,6,1),(40,40,8)),    (*self.blue, 1.),   "radius_2d",    (),             ()),
                            #Label(2,      'block',      'block',        ((6,6,1),(40,40,8)),  (*self.aubergine,1.),  "radii_2d", (), ('radius_calib',))
             ]
 
 
         #########################
         #         I/O           #
         #########################
 
         self.data_sourcedir = '/home/gregor/datasets/toy/cyl1ps_dev'
 
         if server_env:
             self.data_sourcedir = '/datasets/data_ramien/toy/cyl1ps_dev_npz'
 
 
         self.test_data_sourcedir = os.path.join(self.data_sourcedir, 'test')
         self.data_sourcedir = os.path.join(self.data_sourcedir, "train")
 
         self.info_df_name = 'info_df.pickle'
 
         # one out of ['mrcnn', 'retina_net', 'retina_unet', 'detection_unet', 'ufrcnn', 'detection_fpn'].
         self.model = 'mrcnn'
         self.model_path = 'models/{}.py'.format(self.model if not 'retina' in self.model else 'retina_net')
         self.model_path = os.path.join(self.source_dir, self.model_path)
 
 
         #########################
         #      Architecture     #
         #########################
 
         # one out of [2, 3]. dimension the model operates in.
         self.dim = 2
 
         # 'class', 'regression', 'regression_bin', 'regression_ken_gal'
         # currently only tested mode is a single-task at a time (i.e., only one task in below list)
         # but, in principle, tasks could be combined (e.g., object classes and regression per class)
         self.prediction_tasks = ['class', ]
 
         self.start_filts = 48 if self.dim == 2 else 18
         self.end_filts = self.start_filts * 4 if self.dim == 2 else self.start_filts * 2
         self.res_architecture = 'resnet50' # 'resnet101' , 'resnet50'
         self.norm = 'instance_norm' # one of None, 'instance_norm', 'batch_norm'
         self.relu = 'relu'
         # one of 'xavier_uniform', 'xavier_normal', or 'kaiming_normal', None (=default = 'kaiming_uniform')
         self.weight_init = None
 
         self.regression_n_features = 1  # length of regressor target vector
 
 
         #########################
         #      Data Loader      #
         #########################
 
         self.num_epochs = 32
         self.num_train_batches = 120 if self.dim == 2 else 180
         self.batch_size = 8 if self.dim == 2 else 4
 
         self.n_cv_splits = 4
         # select modalities from preprocessed data
         self.channels = [0]
         self.n_channels = len(self.channels)
 
         # which channel (mod) to show as bg in plotting, will be extra added to batch if not in self.channels
         self.plot_bg_chan = 0
         self.crop_margin = [20, 20, 1]  # has to be smaller than respective patch_size//2
         self.patch_size_2D = self.pre_crop_size[:2]
         self.patch_size_3D = self.pre_crop_size[:2]+[8]
 
         # patch_size to be used for training. pre_crop_size is the patch_size before data augmentation.
         self.patch_size = self.patch_size_2D if self.dim == 2 else self.patch_size_3D
 
         # ratio of free sampled batch elements before class balancing is triggered
         # (>0 to include "empty"/background patches.)
         self.batch_random_ratio = 0.2
         self.balance_target = "class_targets" if 'class' in self.prediction_tasks else "rg_bin_targets"
 
         self.observables_patient = []
         self.observables_rois = []
 
         self.seed = 3 #for generating folds
 
         #############################
         # Colors, Classes, Legends  #
         #############################
         self.plot_frequency = 4
 
         binary_bin_labels = [binLabel(1,  'r<=25',      (*self.green, 1.),      (1,25)),
                              binLabel(2,  'r>25',       (*self.red, 1.),        (25,))]
         quintuple_bin_labels = [binLabel(1,  'r2-10',   (*self.green, 1.),      (2,10)),
                                 binLabel(2,  'r10-20',  (*self.yellow, 1.),     (10,20)),
                                 binLabel(3,  'r20-30',  (*self.orange, 1.),     (20,30)),
                                 binLabel(4,  'r30-40',  (*self.bright_red, 1.), (30,40)),
                                 binLabel(5,  'r>40',    (*self.red, 1.), (40,))]
 
         # choose here if to do 2-way or 5-way regression-bin classification
         task_spec_bin_labels = quintuple_bin_labels
 
         self.class_labels = [
             # regression: regression-task label, either value or "(x,y,z)_radius" or "radii".
             # ambiguities: name of above defined ambig to apply to image data (not gt); need to be iterables!
             # gt_distortion: name of ambig to apply to gt only; needs to be iterable!
             #      #id  #name   #shape  #radius     #color              #regression #ambiguities    #gt_distortion
             Label(  0,  'bg',   None,   (0, 0, 0),  (*self.white, 0.),  (0, 0, 0),  (),             ())]
         if "class" in self.prediction_tasks:
             self.class_labels += self.pp_classes
         else:
             self.class_labels += [Label(1, 'object', 'object', ('various',), (*self.orange, 1.), ('radius_2d',), ("various",), ('various',))]
 
 
         if any(['regression' in task for task in self.prediction_tasks]):
             self.bin_labels = [binLabel(0,  'bg',       (*self.white, 1.),      (0,))]
             self.bin_labels += task_spec_bin_labels
             self.bin_id2label = {label.id: label for label in self.bin_labels}
             bins = [(min(label.bin_vals), max(label.bin_vals)) for label in self.bin_labels]
             self.bin_id2rg_val = {ix: [np.mean(bin)] for ix, bin in enumerate(bins)}
             self.bin_edges = [(bins[i][1] + bins[i + 1][0]) / 2 for i in range(len(bins) - 1)]
             self.bin_dict = {label.id: label.name for label in self.bin_labels if label.id != 0}
 
         if self.class_specific_seg:
           self.seg_labels = self.class_labels
 
         self.box_type2label = {label.name: label for label in self.box_labels}
         self.class_id2label = {label.id: label for label in self.class_labels}
         self.class_dict = {label.id: label.name for label in self.class_labels if label.id != 0}
 
         self.seg_id2label = {label.id: label for label in self.seg_labels}
         self.cmap = {label.id: label.color for label in self.seg_labels}
 
         self.plot_prediction_histograms = True
         self.plot_stat_curves = False
         self.has_colorchannels = False
         self.plot_class_ids = True
 
         self.num_classes = len(self.class_dict)
         self.num_seg_classes = len(self.seg_labels)
 
         #########################
         #   Data Augmentation   #
         #########################
         self.do_aug = True
         self.da_kwargs = {
             'mirror': True,
             'mirror_axes': tuple(np.arange(0, self.dim, 1)),
             'do_elastic_deform': False,
             'alpha': (500., 1500.),
             'sigma': (40., 45.),
             'do_rotation': False,
             'angle_x': (0., 2 * np.pi),
             'angle_y': (0., 0),
             'angle_z': (0., 0),
             'do_scale': False,
             'scale': (0.8, 1.1),
             'random_crop': False,
             'rand_crop_dist': (self.patch_size[0] / 2. - 3, self.patch_size[1] / 2. - 3),
             'border_mode_data': 'constant',
             'border_cval_data': 0,
             'order_data': 1
         }
 
         if self.dim == 3:
             self.da_kwargs['do_elastic_deform'] = False
             self.da_kwargs['angle_x'] = (0, 0.0)
             self.da_kwargs['angle_y'] = (0, 0.0)  # must be 0!!
             self.da_kwargs['angle_z'] = (0., 2 * np.pi)
 
         #########################
         #  Schedule / Selection #
         #########################
 
         # decide whether to validate on entire patient volumes (like testing) or sampled patches (like training)
         # the former is morge accurate, while the latter is faster (depending on volume size)
         self.val_mode = 'val_sampling' # one of 'val_sampling' , 'val_patient'
         if self.val_mode == 'val_patient':
             self.max_val_patients = 220  # if 'all' iterates over entire val_set once.
         if self.val_mode == 'val_sampling':
             self.num_val_batches = 35 if self.dim==2 else 25
 
         self.save_n_models = 2
         self.min_save_thresh = 1 if self.dim == 2 else 1  # =wait time in epochs
         if "class" in self.prediction_tasks:
             self.model_selection_criteria = {name + "_ap": 1. for name in self.class_dict.values()}
         elif any("regression" in task for task in self.prediction_tasks):
             self.model_selection_criteria = {name + "_ap": 0.2 for name in self.class_dict.values()}
             self.model_selection_criteria.update({name + "_avp": 0.8 for name in self.class_dict.values()})
 
         self.lr_decay_factor = 0.5
-        self.scheduling_patience = int(self.num_epochs / 5)
+        self.scheduling_patience = int(np.ceil(1800 / (self.num_train_batches * self.batch_size)))  # epochs needed to process 1800 batch elements; int() because np.ceil returns a float
         self.weight_decay = 1e-5
         self.clip_norm = None  # number or None
 
         #########################
         #   Testing / Plotting  #
         #########################
 
         self.test_aug_axes = (0,1,(0,1)) # None or list: choices are 0,1,(0,1)
         self.held_out_test_set = True
         self.max_test_patients = "all"  # number or "all" for all
 
         self.test_against_exact_gt = True # only True implemented
         self.val_against_exact_gt = False # True is an unrealistic --> irrelevant scenario.
         self.report_score_level = ['rois']  # 'patient' or 'rois' (incl)
         self.patient_class_of_interest = 1
         self.patient_bin_of_interest = 2
 
         self.eval_bins_separately = False#"additionally" if not 'class' in self.prediction_tasks else False
         self.metrics = ['ap', 'auc', 'dice']
         if any(['regression' in task for task in self.prediction_tasks]):
             self.metrics += ['avp', 'rg_MAE_weighted', 'rg_MAE_weighted_tp',
                              'rg_bin_accuracy_weighted', 'rg_bin_accuracy_weighted_tp']
         if 'aleatoric' in self.model:
             self.metrics += ['rg_uncertainty', 'rg_uncertainty_tp', 'rg_uncertainty_tp_weighted']
         self.evaluate_fold_means = True
 
         self.ap_match_ious = [0.5]  # threshold(s) for considering a prediction as true positive
         self.min_det_thresh = 0.3
 
         self.model_max_iou_resolution = 0.2
 
         # aggregation method for test and val_patient predictions.
         # wbc = weighted box clustering as in https://arxiv.org/pdf/1811.08661.pdf,
         # nms = standard non-maximum suppression, or None = no clustering
         self.clustering = 'wbc'
         # iou thresh (exclusive!) for regarding two preds as concerning the same ROI
         self.clustering_iou = self.model_max_iou_resolution  # has to be larger than desired possible overlap iou of model predictions
 
         self.merge_2D_to_3D_preds = False
         self.merge_3D_iou = self.model_max_iou_resolution
         self.n_test_plots = 1  # per fold and rank
 
         self.test_n_epochs = self.save_n_models  # should be called n_test_ens, since is number of models to ensemble over during testing
         # is multiplied by (1 + nr of test augs)
 
         #########################
         #   Assertions          #
         #########################
         if not 'class' in self.prediction_tasks:
             assert self.num_classes == 1
 
         #########################
         #   Add model specifics #
         #########################
 
         {'mrcnn': self.add_mrcnn_configs, 'mrcnn_aleatoric': self.add_mrcnn_configs,
          'retina_net': self.add_mrcnn_configs, 'retina_unet': self.add_mrcnn_configs,
          'detection_unet': self.add_det_unet_configs, 'detection_fpn': self.add_det_fpn_configs
          }[self.model]()
 
     def rg_val_to_bin_id(self, rg_val):
         #only meant for isotropic radii!!
         # only 2D radii (x and y dims) or 1D (x or y) are expected
         return np.round(np.digitize(rg_val, self.bin_edges).mean())
 
 
     def add_det_fpn_configs(self):
 
       self.learning_rate = [1 * 1e-4] * self.num_epochs
       self.dynamic_lr_scheduling = True
       self.scheduling_criterion = 'torch_loss'
       self.scheduling_mode = 'min' if "loss" in self.scheduling_criterion else 'max'
 
       self.n_roi_candidates = 4 if self.dim == 2 else 6
       # max number of roi candidates to identify per image (slice in 2D, volume in 3D)
 
       # loss mode: either weighted cross entropy ('wce'), batch-wise dice loss ('dice), or the sum of both ('dice_wce')
       self.seg_loss_mode = 'wce'
       self.wce_weights = [1] * self.num_seg_classes if 'dice' in self.seg_loss_mode else [0.1, 1]
 
       self.fp_dice_weight = 1 if self.dim == 2 else 1
       # if <1, false positive predictions in foreground are penalized less.
 
       self.detection_min_confidence = 0.05
       # how to determine score of roi: 'max' or 'median'
       self.score_det = 'max'
 
     def add_det_unet_configs(self):
 
       self.learning_rate = [1 * 1e-4] * self.num_epochs
       self.dynamic_lr_scheduling = True
       self.scheduling_criterion = "torch_loss"
       self.scheduling_mode = 'min' if "loss" in self.scheduling_criterion else 'max'
 
       # max number of roi candidates to identify per image (slice in 2D, volume in 3D)
       self.n_roi_candidates = 4 if self.dim == 2 else 6
 
       # loss mode: either weighted cross entropy ('wce'), batch-wise dice loss ('dice), or the sum of both ('dice_wce')
       self.seg_loss_mode = 'wce'
       self.wce_weights = [1] * self.num_seg_classes if 'dice' in self.seg_loss_mode else [0.1, 1]
       # if <1, false positive predictions in foreground are penalized less.
       self.fp_dice_weight = 1 if self.dim == 2 else 1
 
       self.detection_min_confidence = 0.05
       # how to determine score of roi: 'max' or 'median'
       self.score_det = 'max'
 
       self.init_filts = 32
       self.kernel_size = 3  # ks for horizontal, normal convs
       self.kernel_size_m = 2  # ks for max pool
       self.pad = "same"  # "same" or integer, padding of horizontal convs
 
     def add_mrcnn_configs(self):
 
       self.learning_rate = [1e-4] * self.num_epochs
       self.dynamic_lr_scheduling = True  # with scheduler set in exec
       self.scheduling_criterion = max(self.model_selection_criteria, key=self.model_selection_criteria.get)
       self.scheduling_mode = 'min' if "loss" in self.scheduling_criterion else 'max'
 
       # number of classes for network heads: n_foreground_classes + 1 (background)
       self.head_classes = self.num_classes + 1 if 'class' in self.prediction_tasks else 2
 
       # feed +/- n neighbouring slices into channel dimension. set to None for no context.
       self.n_3D_context = None
       if self.n_3D_context is not None and self.dim == 2:
         self.n_channels *= (self.n_3D_context * 2 + 1)
 
       self.detect_while_training = True
       # disable the re-sampling of mask proposals to original size for speed-up.
       # since evaluation is detection-driven (box-matching) and not instance segmentation-driven (iou-matching),
       # mask outputs are optional.
       self.return_masks_in_train = True
       self.return_masks_in_val = True
       self.return_masks_in_test = True
 
       # feature map strides per pyramid level are inferred from architecture. anchor scales are set accordingly.
       self.backbone_strides = {'xy': [4, 8, 16, 32], 'z': [1, 2, 4, 8]}
       # anchor scales are chosen according to expected object sizes in data set. Default uses only one anchor scale
       # per pyramid level. (outer list are pyramid levels (corresponding to BACKBONE_STRIDES), inner list are scales per level.)
       self.rpn_anchor_scales = {'xy': [[4], [8], [16], [32]], 'z': [[1], [2], [4], [8]]}
       # choose which pyramid levels to extract features from: P2: 0, P3: 1, P4: 2, P5: 3.
       self.pyramid_levels = [0, 1, 2, 3]
       # number of feature maps in rpn. typically lowered in 3D to save gpu-memory.
       self.n_rpn_features = 512 if self.dim == 2 else 64
 
       # anchor ratios and strides per position in feature maps.
       self.rpn_anchor_ratios = [0.5, 1., 2.]
       self.rpn_anchor_stride = 1
       # Threshold for first stage (RPN) non-maximum suppression (NMS):  LOWER == HARDER SELECTION
       self.rpn_nms_threshold = max(0.8, self.model_max_iou_resolution)
 
       # loss sampling settings.
       self.rpn_train_anchors_per_image = 4
       self.train_rois_per_image = 6 # per batch_instance
       self.roi_positive_ratio = 0.5
       self.anchor_matching_iou = 0.8
 
       # k negative example candidates are drawn from a pool of size k*shem_poolsize (stochastic hard-example mining),
       # where k<=#positive examples.
       self.shem_poolsize = 2
 
       self.pool_size = (7, 7) if self.dim == 2 else (7, 7, 3)
       self.mask_pool_size = (14, 14) if self.dim == 2 else (14, 14, 5)
       self.mask_shape = (28, 28) if self.dim == 2 else (28, 28, 10)
 
       self.rpn_bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2])
       self.bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2])
       self.window = np.array([0, 0, self.patch_size[0], self.patch_size[1], 0, self.patch_size_3D[2]])
       self.scale = np.array([self.patch_size[0], self.patch_size[1], self.patch_size[0], self.patch_size[1],
                              self.patch_size_3D[2], self.patch_size_3D[2]])  # y1,x1,y2,x2,z1,z2
 
       if self.dim == 2:
         self.rpn_bbox_std_dev = self.rpn_bbox_std_dev[:4]
         self.bbox_std_dev = self.bbox_std_dev[:4]
         self.window = self.window[:4]
         self.scale = self.scale[:4]
 
       self.plot_y_max = 1.5
       self.n_plot_rpn_props = 5 if self.dim == 2 else 30  # per batch_instance (slice in 2D / patient in 3D)
 
       # pre-selection in proposal-layer (stage 1) for NMS-speedup. applied per batch element.
       self.pre_nms_limit = 2000 if self.dim == 2 else 4000
 
       # n_proposals to be selected after NMS per batch element. too high numbers blow up memory if "detect_while_training" is True,
       # since proposals of the entire batch are forwarded through second stage as one "batch".
       self.roi_chunk_size = 1300 if self.dim == 2 else 500
       self.post_nms_rois_training = 200 * (self.head_classes-1) if self.dim == 2 else 400
       self.post_nms_rois_inference = 200 * (self.head_classes-1)
 
       # Final selection of detections (refine_detections)
       self.model_max_instances_per_batch_element = 9 if self.dim == 2 else 18 # per batch element and class.
       self.detection_nms_threshold = self.model_max_iou_resolution  # needs to be > 0, otherwise all predictions are one cluster.
       self.model_min_confidence = 0.2  # iou for nms in box refining (directly after heads), should be >0 since ths>=x in mrcnn.py
 
       if self.dim == 2:
         self.backbone_shapes = np.array(
           [[int(np.ceil(self.patch_size[0] / stride)),
             int(np.ceil(self.patch_size[1] / stride))]
            for stride in self.backbone_strides['xy']])
       else:
         self.backbone_shapes = np.array(
           [[int(np.ceil(self.patch_size[0] / stride)),
             int(np.ceil(self.patch_size[1] / stride)),
             int(np.ceil(self.patch_size[2] / stride_z))]
            for stride, stride_z in zip(self.backbone_strides['xy'], self.backbone_strides['z']
                                        )])
 
       if self.model == 'retina_net' or self.model == 'retina_unet':
         # whether to use focal loss or SHEM for loss-sample selection
         self.focal_loss = False
         # implement extra anchor-scales according to https://arxiv.org/abs/1708.02002
         self.rpn_anchor_scales['xy'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in
                                         self.rpn_anchor_scales['xy']]
         self.rpn_anchor_scales['z'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in
                                        self.rpn_anchor_scales['z']]
         self.n_anchors_per_pos = len(self.rpn_anchor_ratios) * 3
 
         # pre-selection of detections for NMS-speedup. per entire batch.
         self.pre_nms_limit = (500 if self.dim == 2 else 6250) * self.batch_size
 
         # anchor matching iou is lower than in Mask R-CNN according to https://arxiv.org/abs/1708.02002
         self.anchor_matching_iou = 0.7
 
         if self.model == 'retina_unet':
           self.operate_stride1 = True
diff --git a/datasets/toy_mdt/check_data.py b/datasets/toy_mdt/check_data.py
new file mode 100644
index 0000000..3e00f08
--- /dev/null
+++ b/datasets/toy_mdt/check_data.py
@@ -0,0 +1,84 @@
+"""
+Created at 4/8/20 4:24 PM
+@author: gregor 
+"""
+
+import os, time
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import pickle
+import argparse
+from multiprocessing import Pool
+from collections import OrderedDict
+
+from matplotlib import pyplot as plt
+import matplotlib.gridspec as gridspec
+
+def load_data(cf):
+    """Build an ordered per-sample index (file paths and class target) from the info dataframe.
+
+    Reads the pickled dataframe ``cf.input_df_name`` located in ``cf.pp_data_path``. The 'data' and 'seg'
+    entries of a sample both point to the same ``<pid>.npy`` file. Returns an OrderedDict keyed by pid.
+    """
+    root = cf.pp_data_path
+    info_df = pd.read_pickle(os.path.join(root, cf.input_df_name))
+    class_ids, sample_ids = info_df['class_id'].tolist(), info_df.pid.tolist()
+    samples = OrderedDict()
+    for pid, class_id in zip(sample_ids, class_ids):
+        npy_path = os.path.join(root, '{}.npy'.format(pid))
+        samples[pid] = {'data': npy_path, 'seg': npy_path, 'pid': pid, 'class_target': [class_id]}
+
+    return samples
+
+
+def plot_data_and_gt(cf, data, n_samples=14, out_dir=None):
+    """Plot randomly drawn samples: image in the top row, segmentation (titled with its gt class) below.
+
+    :param cf: config object (not used inside this function).
+    :param data: dict as built by load_data; the 'data' and 'seg' entries are paths to .npy files.
+    :param n_samples: number of plotted columns; keys are drawn with np.random.choice (with replacement).
+    :param out_dir: pathlib.Path or None. If given, the figure is saved there as check_samples.png.
+    """
+    sample_keys = np.random.choice(list(data.keys()), size=n_samples)
+    fig = plt.figure(figsize=(n_samples*2, 4))
+    grid = gridspec.GridSpec(2,n_samples)
+
+    for s_ix, skey in enumerate(sample_keys):
+        sample = data[skey]
+        # index 0 of the loaded array is shown as the image, index 1 as the segmentation.
+        panels = (("img", np.load(sample["data"])[0]),
+                  ("seg. gt_class: {}".format(sample["class_target"]), np.load(sample["seg"])[1]))
+        for row, (title, arr) in enumerate(panels):
+            ax = fig.add_subplot(grid[row, s_ix])
+            ax.imshow(arr)
+            ax.set_title(title)
+            ax.axis("off")
+    if out_dir is not None:
+        out_dir.mkdir(parents=True, exist_ok=True)  # savefig does not create missing directories
+        plt.savefig(str(out_dir / "check_samples.png"), dpi=200, bbox_inches="tight")
+
+
+
+
+if __name__ == '__main__':
+    stime = time.time()
+    import sys
+    # make the repository root importable (assumes the script is started from its own directory).
+    sys.path.append("../..")
+    import utils.exp_utils as utils
+    # no options are defined; parsing still provides --help and rejects unexpected arguments.
+    parser = argparse.ArgumentParser()
+    args = parser.parse_args()
+
+    cf_file = utils.import_module("cf", "configs.py")
+    # the config class defined in this dataset's configs.py is named `Configs`.
+    cf = cf_file.Configs()
+
+    data = load_data(cf)
+    plot_data_and_gt(cf, data, out_dir=Path("/media/gregor/HDD1/experiments/mdt/toy_1x/data_check"))
+
+    mins, secs = divmod((time.time() - stime), 60)
+    h, mins = divmod(mins, 60)
+    t = "{:d}h:{:02d}m:{:02d}s".format(int(h), int(mins), int(secs))
+    print("{} total runtime: {}".format(os.path.split(__file__)[1], t))
\ No newline at end of file
diff --git a/datasets/toy_mdt/configs.py b/datasets/toy_mdt/configs.py
new file mode 100644
index 0000000..7ca0d8e
--- /dev/null
+++ b/datasets/toy_mdt/configs.py
@@ -0,0 +1,363 @@
+#!/usr/bin/env python
+# Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import sys
+import os
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+import numpy as np
+from collections import namedtuple
+from default_configs import DefaultConfigs
+
+Label = namedtuple("Label", ['id', 'name', 'color'])
+class Configs(DefaultConfigs):
+
+    def __init__(self, server_env=None):
+        """Set up the configuration of the toy_mdt experiments (paths, data loading, architecture, schedule, plots).
+        :param server_env: if truthy, the data paths are switched to the cluster locations; passed on to DefaultConfigs.
+        """
+        #########################
+        #    Preprocessing      #
+        #########################
+
+        self.root_dir = '/home/gregor/datasets/toy_mdt'
+
+        #########################
+        #         I/O           #
+        #########################
+
+        # one out of [2, 3]. dimension the model operates in.
+        self.dim = 2
+        DefaultConfigs.__init__(self, server_env, self.dim)
+        # one out of ['mrcnn', 'retina_net', 'retina_unet', 'detection_unet', 'ufrcnn'].
+        self.model = 'retina_net'
+        self.model_path = 'models/{}.py'.format(self.model if not 'retina' in self.model else 'retina_net')
+        self.model_path = os.path.join(self.source_dir, self.model_path)
+        # int [0 < dataset_size]. select n patients from dataset for prototyping.
+        self.select_prototype_subset = None
+        self.held_out_test_set = True
+        self.n_train_data = 2500
+
+        # choose one of the 3 toy experiments described in https://arxiv.org/pdf/1811.08661.pdf
+        # one of ['donuts_shape', 'donuts_pattern', 'circles_scale'] or a variant directory like 'donuts_shape_noise'.
+        toy_mode = 'donuts_shape_noise'
+
+        # path to preprocessed data.
+        self.info_df_name = 'info_df.pickle'
+        self.pp_name = os.path.join(toy_mode, 'train')
+        self.data_sourcedir = os.path.join(self.root_dir, self.pp_name)
+        self.pp_test_name = os.path.join(toy_mode, 'test')
+        self.test_data_sourcedir = os.path.join(self.root_dir, self.pp_test_name)
+
+        # settings for deployment in cloud.
+        if server_env:
+            # path to preprocessed data.
+            pp_root_dir = '/datasets/datasets_ramien/toy_exp/data'
+            self.pp_name = os.path.join(toy_mode, 'train')
+            self.pp_data_path = os.path.join(pp_root_dir, self.pp_name)
+            self.data_sourcedir = self.pp_data_path  # train source dir must follow the server path like the test dir.
+            self.pp_test_name = os.path.join(toy_mode, 'test')
+            self.test_data_sourcedir = os.path.join(pp_root_dir, self.pp_test_name)
+            self.select_prototype_subset = None
+
+        #########################
+        #      Data Loader      #
+        #########################
+
+        # select modalities from preprocessed data
+        self.channels = [0]
+        self.n_channels = len(self.channels)
+        self.plot_bg_chan = 0
+
+        # patch_size to be used for training. pre_crop_size is the patch_size before data augmentation.
+        self.pre_crop_size_2D = [320, 320]
+        self.patch_size_2D = [320, 320]
+
+        self.patch_size = self.patch_size_2D if self.dim == 2 else self.patch_size_3D
+        self.pre_crop_size = self.pre_crop_size_2D if self.dim == 2 else self.pre_crop_size_3D
+
+        # ratio of free sampled batch elements before class balancing is triggered
+        # (>0 to include "empty"/background patches.)
+        self.batch_random_ratio = 0.2
+
+        # set 2D network to operate in 3D images.
+        self.merge_2D_to_3D_preds = False
+
+        # feed +/- n neighbouring slices into channel dimension. set to None for no context.
+        self.n_3D_context = None
+        if self.n_3D_context is not None and self.dim == 2:
+            self.n_channels *= (self.n_3D_context * 2 + 1)
+
+        #########################
+        #      Architecture      #
+        #########################
+
+        self.start_filts = 48 if self.dim == 2 else 18
+        self.end_filts = self.start_filts * 4 if self.dim == 2 else self.start_filts * 2
+        self.res_architecture = 'resnet50' # 'resnet101' , 'resnet50'
+        self.norm = None # one of None, 'instance_norm', 'batch_norm'
+
+        # one of 'xavier_uniform', 'xavier_normal', or 'kaiming_normal', None (=default = 'kaiming_uniform')
+        self.weight_init = None
+
+        # compatibility
+        self.regression_n_features = 1
+
+        #########################
+        #  Schedule / Selection #
+        #########################
+
+        self.num_epochs = 23
+        self.num_train_batches = 100 if self.dim == 2 else 200
+        self.batch_size = 20 if self.dim == 2 else 8
+
+        self.do_validation = True
+        # decide whether to validate on entire patient volumes (like testing) or sampled patches (like training)
+        # the former is more accurate, while the latter is faster (depending on volume size)
+        self.val_mode = 'val_patient' # one of 'val_sampling' , 'val_patient'
+        if self.val_mode == 'val_patient':
+            self.max_val_patients = "all"  # "all" uses every validation patient, an int acts as upper bound.
+        if self.val_mode == 'val_sampling':
+            self.num_val_batches = 50
+
+        # set dynamic_lr_scheduling to True to apply LR scheduling with below settings.
+        self.dynamic_lr_scheduling = True
+        self.lr_decay_factor = 0.5
+        self.scheduling_patience = np.ceil(3600 / (self.num_train_batches * self.batch_size))
+        self.scheduling_criterion = 'donuts_ap'
+        self.scheduling_mode = 'min' if "loss" in self.scheduling_criterion else 'max'
+        self.weight_decay = 0
+        self.clip_norm = None
+
+        #########################
+        #   Testing / Plotting  #
+        #########################
+
+        # set the top-n-epochs to be saved for temporal averaging in testing.
+        self.save_n_models = 5
+        self.test_n_epochs = 5
+        self.test_aug_axes = (0, 1, (0, 1))
+        self.n_test_plots = 2
+        self.clustering = "wbc"
+        self.clustering_iou = 1e-5
+        # set a minimum epoch number for saving in case of instabilities in the first phase of training.
+        self.min_save_thresh = 0 if self.dim == 2 else 0
+
+        self.report_score_level = ['patient', 'rois']  # choose list from 'patient', 'rois'
+        self.class_labels = [Label(0, 'bg', (*self.white, 0.)),
+                             Label(1, 'circles', (*self.orange, .9)),
+                             Label(2, 'donuts', (*self.blue, .9)),]
+        if self.class_specific_seg:
+            self.seg_labels = self.class_labels
+
+        self.box_type2label = {label.name: label for label in self.box_labels}
+        self.class_id2label = {label.id: label for label in self.class_labels}
+        self.class_dict = {label.id: label.name for label in self.class_labels if label.id != 0}
+
+        self.seg_id2label = {label.id: label for label in self.seg_labels}
+        self.cmap = {label.id: label.color for label in self.seg_labels}
+
+        self.patient_class_of_interest = 2  # patient metrics are only plotted for one class.
+        self.ap_match_ious = [0.1]  # list of ious to be evaluated for ap-scoring.
+
+        self.model_selection_criteria = {name + "_ap": 1. for name in self.class_dict.values()}# criteria to average over for saving epochs.
+        self.min_det_thresh = 0.1  # minimum confidence value to select predictions for evaluation.
+
+        self.plot_prediction_histograms = True
+        self.plot_stat_curves = False
+        self.plot_class_ids = True
+
+        #########################
+        #   Data Augmentation   #
+        #########################
+        self.do_aug = False
+        self.da_kwargs={
+        'do_elastic_deform': True,
+        'alpha':(0., 1500.),
+        'sigma':(30., 50.),
+        'do_rotation':True,
+        'angle_x': (0., 2 * np.pi),
+        'angle_y': (0., 0),
+        'angle_z': (0., 0),
+        'do_scale': True,
+        'scale':(0.8, 1.1),
+        'random_crop':False,
+        'rand_crop_dist':  (self.patch_size[0] / 2. - 3, self.patch_size[1] / 2. - 3),
+        'border_mode_data': 'constant',
+        'border_cval_data': 0,
+        'order_data': 1
+        }
+
+        if self.dim == 3:
+            self.da_kwargs['do_elastic_deform'] = False
+            self.da_kwargs['angle_x'] = (0, 0.0)
+            self.da_kwargs['angle_y'] = (0, 0.0) #must be 0!!
+            self.da_kwargs['angle_z'] = (0., 2 * np.pi)
+
+        #########################
+        #   Add model specifics #
+        #########################
+
+        {'detection_unet': self.add_det_unet_configs,
+         'mrcnn': self.add_mrcnn_configs,
+         'retina_net': self.add_mrcnn_configs,
+         'retina_unet': self.add_mrcnn_configs,
+         'ufrcnn': self.add_mrcnn_configs,  # listed as model option above and handled in add_mrcnn_configs.
+        }[self.model]()
+
+
+    def add_det_unet_configs(self):
+        """Attach the settings that are specific to the detection U-Net model to this config."""
+        is_2d = self.dim == 2
+
+        # constant learning rate: the list holds one entry per epoch.
+        self.learning_rate = [1e-4] * self.num_epochs
+
+        # how pixel-wise predictions of a connected component are reduced to one object score: 'max' or 'median'.
+        self.aggregation_operation = 'max'
+
+        # upper bound of roi candidates to identify per image (a slice in 2D, a volume in 3D).
+        self.n_roi_candidates = 3 if is_2d else 8
+
+        # segmentation loss: weighted cross entropy ('wce'), batch-wise dice ('dice') or the sum of both ('dice_wce').
+        self.seg_loss_mode = 'wce'
+        self.wce_weights = [1, 1, 1]
+
+        # values < 1 penalize false positive predictions in foreground less; same value for 2D and 3D.
+        self.fp_dice_weight = 1
+        self.detection_min_confidence = self.min_det_thresh
+        # number of segmentation classes; the head works on the same number of classes.
+        self.head_classes = self.num_seg_classes = 3
+
+    def add_mrcnn_configs(self):
+        """Add the settings of the region-proposal based detectors; overrides depending on self.model follow at the end."""
+        # learning rate is a list with one entry per epoch.
+        self.learning_rate = [1e-3] * self.num_epochs
+
+        # disable mask head loss. (e.g. if no pixelwise annotations available)
+        self.frcnn_mode = False
+
+        # disable the re-sampling of mask proposals to original size for speed-up.
+        # since evaluation is detection-driven (box-matching) and not instance segmentation-driven (iou-matching),
+        # mask-outputs are optional.
+        self.return_masks_in_val = True
+        self.return_masks_in_test = False
+
+        # set number of proposal boxes to plot after each epoch.
+        self.n_plot_rpn_props = 0 if self.dim == 2 else 0
+
+        # number of classes for head networks: n_foreground_classes + 1 (background)
+        self.head_classes = 3
+
+        # seg_classes here refers to the first stage classifier (RPN)
+        self.num_seg_classes = 2  # foreground vs. background
+
+        # feature map strides per pyramid level are inferred from architecture.
+        self.backbone_strides = {'xy': [4, 8, 16, 32], 'z': [1, 2, 4, 8]}
+
+        # anchor scales are chosen according to expected object sizes in data set. Default uses only one anchor scale
+        # per pyramid level. (outer list are pyramid levels (corresponding to BACKBONE_STRIDES), inner list are scales per level.)
+        self.rpn_anchor_scales = {'xy': [[8], [16], [32], [64]], 'z': [[2], [4], [8], [16]]}
+
+        # choose which pyramid levels to extract features from: P2: 0, P3: 1, P4: 2, P5: 3.
+        self.pyramid_levels = [0, 1, 2, 3]
+
+        # number of feature maps in rpn. typically lowered in 3D to save gpu-memory.
+        self.n_rpn_features = 512 if self.dim == 2 else 128
+
+        # anchor ratios and strides per position in feature maps.
+        self.rpn_anchor_ratios = [0.5, 1., 2.]
+        self.rpn_anchor_stride = 1
+
+        # Threshold for first stage (RPN) non-maximum suppression (NMS):  LOWER == HARDER SELECTION
+        self.rpn_nms_threshold = 0.7 if self.dim == 2 else 0.7
+
+        # loss sampling settings.
+        self.rpn_train_anchors_per_image = 64 #per batch element
+        self.train_rois_per_image = 2 #per batch element
+        self.roi_positive_ratio = 0.5
+        self.anchor_matching_iou = 0.7
+
+        # factor of top-k candidates to draw from  per negative sample (stochastic-hard-example-mining).
+        # poolsize to draw top-k candidates from will be shem_poolsize * n_negative_samples.
+        self.shem_poolsize = 10
+
+        self.pool_size = (7, 7) if self.dim == 2 else (7, 7, 3)
+        self.mask_pool_size = (14, 14) if self.dim == 2 else (14, 14, 5)
+        self.mask_shape = (28, 28) if self.dim == 2 else (28, 28, 10)
+
+        self.rpn_bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2])
+        self.bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2])
+        self.window = np.array([0, 0, self.patch_size[0], self.patch_size[1]])
+        self.scale = np.array([self.patch_size[0], self.patch_size[1], self.patch_size[0], self.patch_size[1]])
+
+        if self.dim == 2:  # keep the first 4 entries; window and scale are built with 4 entries above already.
+            self.rpn_bbox_std_dev = self.rpn_bbox_std_dev[:4]
+            self.bbox_std_dev = self.bbox_std_dev[:4]
+            self.window = self.window[:4]
+            self.scale = self.scale[:4]
+
+        # pre-selection in proposal-layer (stage 1) for NMS-speedup. applied per batch element.
+        self.pre_nms_limit = 3000 if self.dim == 2 else 6000
+
+        # n_proposals to be selected after NMS per batch element. too high numbers blow up memory if "detect_while_training" is True,
+        # since proposals of the entire batch are forwarded through second stage in as one "batch".
+        self.roi_chunk_size = 800 if self.dim == 2 else 600
+        self.post_nms_rois_training = 500 if self.dim == 2 else 75
+        self.post_nms_rois_inference = 500
+
+        # Final selection of detections (refine_detections)
+        self.model_max_instances_per_batch_element = 10 if self.dim == 2 else 30  # per batch element and class.
+        self.detection_nms_threshold = 1e-5  # needs to be > 0, otherwise all predictions are one cluster.
+        self.model_min_confidence = 0.1
+
+        if self.dim == 2:
+            self.backbone_shapes = np.array(
+                [[int(np.ceil(self.patch_size[0] / stride)),
+                  int(np.ceil(self.patch_size[1] / stride))]
+                 for stride in self.backbone_strides['xy']])
+        else:
+            self.backbone_shapes = np.array(
+                [[int(np.ceil(self.patch_size[0] / stride)),
+                  int(np.ceil(self.patch_size[1] / stride)),
+                  int(np.ceil(self.patch_size[2] / stride_z))]
+                 for stride, stride_z in zip(self.backbone_strides['xy'], self.backbone_strides['z']
+                                             )])
+        if self.model == 'ufrcnn':
+            self.operate_stride1 = True
+            self.num_seg_classes = 3  # overrides the value set above.
+            self.frcnn_mode = True  # overrides the value set above.
+
+        if self.model == 'retina_net' or self.model == 'retina_unet' or self.model == 'prob_detector':
+            # implement extra anchor-scales according to retina-net publication.
+            self.rpn_anchor_scales['xy'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in
+                                            self.rpn_anchor_scales['xy']]
+            self.rpn_anchor_scales['z'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in
+                                           self.rpn_anchor_scales['z']]
+            self.n_anchors_per_pos = len(self.rpn_anchor_ratios) * 3
+
+            self.n_rpn_features = 256 if self.dim == 2 else 64  # overrides the value set above.
+
+            # pre-selection of detections for NMS-speedup. per entire batch.
+            self.pre_nms_limit = 10000 if self.dim == 2 else 50000  # overrides the value set above.
+
+            # anchor matching iou is lower than in Mask R-CNN according to https://arxiv.org/abs/1708.02002
+            self.anchor_matching_iou = 0.5  # overrides the value set above.
+
+            # if 'True', seg loss distinguishes all classes, else only foreground vs. background (class agnostic).
+            self.num_seg_classes = 3  # overrides the value set above.
+
+            if self.model == 'retina_unet':
+                self.operate_stride1 = True
diff --git a/datasets/toy_mdt/data_loader.py b/datasets/toy_mdt/data_loader.py
new file mode 100644
index 0000000..08f36e3
--- /dev/null
+++ b/datasets/toy_mdt/data_loader.py
@@ -0,0 +1,376 @@
+#!/usr/bin/env python
+# Copyright 2019 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import sys
+sys.path.append('../') # works on cluster indep from where sbatch job is started
+import plotting as plg
+
+import numpy as np
+import os
+from multiprocessing import Lock
+from collections import OrderedDict
+import pandas as pd
+import pickle
+import time
+
+# batch generator tools from https://github.com/MIC-DKFZ/batchgenerators
+from batchgenerators.transforms.spatial_transforms import MirrorTransform as Mirror
+from batchgenerators.transforms.abstract_transforms import Compose
+from batchgenerators.dataloading.multi_threaded_augmenter import MultiThreadedAugmenter
+from batchgenerators.transforms.spatial_transforms import SpatialTransform
+from batchgenerators.transforms.crop_and_pad_transforms import CenterCropTransform
+
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+import utils.dataloader_utils as dutils
+from utils.dataloader_utils import ConvertSegToBoundingBoxCoordinates
+
+
+def load_obj(file_path):  # unpickle and return the object stored at file_path (use on trusted files only).
+    with open(file_path, mode='rb') as pkl_file:
+        return pickle.load(pkl_file)
+
+class Dataset(dutils.Dataset):
+    r"""Index of the preprocessed toy data: one entry per patient id (pid) listed in the info dataframe.
+
+    Entries hold file paths (image and segmentation point to the same '<pid>.npy' file) plus the class target;
+    no array is loaded here. Side effect: cf.roi_items is set to ['class_targets'].
+    :param cf: config object; cf.info_df_name and cf.server_env are read.
+    :param logger: logger used to report the size of a selected subset.
+    :param subset_ids: optional collection of pids to restrict the data set to.
+    :param data_sourcedir: optional source directory, forwarded to the parent class.
+    :param mode: not used in this class.
+    """
+
+    def __init__(self, cf, logger, subset_ids=None, data_sourcedir=None, mode='train'):
+        super().__init__(cf, data_sourcedir=data_sourcedir)
+
+        info_df = pd.read_pickle(os.path.join(self.data_dir, cf.info_df_name))
+        if subset_ids is not None:
+            info_df = info_df[info_df.pid.isin(subset_ids)]
+            logger.info('subset: selected {} instances from df'.format(len(info_df)))
+        pids = info_df.pid.tolist()
+
+        # server run without a preset cf.data_dir: hand the needed file patterns (and the info df) to copy_data.
+        if cf.server_env and not hasattr(cf, "data_dir"):
+            file_subset = [os.path.join(self.data_dir, pattern.format(pid))
+                           for pattern in ('{}.*', '{}_seg.*') for pid in pids]
+            file_subset.append(cf.info_df_name)
+            self.copy_data(cf, file_subset=file_subset)
+
+        def npy_path(pid):
+            return os.path.join(self.data_dir, '{}.npy'.format(pid))
+
+        class_ids = info_df['class_id'].tolist()
+        entries = OrderedDict()
+        for pid, class_id in zip(pids, class_ids):
+            # class ids from the df are shifted by one before being stored as targets.
+            entries[pid] = {'data': npy_path(pid), 'seg': npy_path(pid), 'pid': pid,
+                            'class_targets': np.array([class_id], dtype='uint8') + 1}
+        self.data = entries
+
+        cf.roi_items = ['class_targets']
+        self.set_ids = np.array(list(self.data.keys()))
+        self.df = None
+
+class BatchGenerator(dutils.BatchGenerator):
+    """
+    creates the training/validation batch generator. Loads one stored array per drawn patient id; drawing the ids
+    and balancing the targets use methods not defined in this class (presumably inherited from the parent - confirm).
+    Arrays smaller than cf.pre_crop_size are padded. Actual patch_size is obtained after data augmentation.
+    :param data: data dictionary as provided by Dataset.data.
+    :param cf: config object; channels, patch_size, pre_crop_size and roi_items are read.
+    :return dictionary with data (b, 1, *spatial) / seg (b, 1, *spatial) / pid / roi_counts / empty_counts / roi items
+    """
+    def __init__(self, cf, data, sample_pids_w_replace=True, max_batches=None, raise_stop_iteration=False, seed=0):
+        super(BatchGenerator, self).__init__(cf, data, sample_pids_w_replace=sample_pids_w_replace,
+                                             max_batches=max_batches, raise_stop_iteration=raise_stop_iteration,
+                                             seed=seed)
+
+        self.chans = cf.channels if cf.channels is not None else np.index_exp[:]  # NOTE(review): not applied below.
+        assert hasattr(self.chans, "__iter__"), "self.chans has to be list-like to maintain dims when slicing"
+
+        self.crop_margin = np.array(self.cf.patch_size) / 8.  # min distance of ROI center to edge of cropped_patch.
+        self.p_fg = 0.5
+        self.empty_samples_max_ratio = 0.6
+
+        self.balance_target_distribution(plot=sample_pids_w_replace)
+
+    def generate_train_batch(self):
+        # everything done in here is per batch
+        # print statements in here get confusing due to multithreading
+
+        batch_pids = self.get_batch_pids()
+
+        batch_data, batch_segs, batch_patient_targets = [], [], []
+        batch_roi_items = {name: [] for name in self.cf.roi_items}
+        # record roi count and empty count of classes in batch
+        # empty count for no presence of resp. class in whole sample (empty slices in 2D/patients in 3D)
+        batch_roi_counts = np.zeros((len(self.unique_ts),), dtype='uint32')
+        batch_empty_counts = np.zeros((len(self.unique_ts),), dtype='uint32')
+
+        for b in range(len(batch_pids)):
+            patient = self._data[batch_pids[b]]
+
+            all_data = np.load(patient['data'], mmap_mode='r')  # memory-mapped, read-only.
+            data = all_data[0].astype('float16')[np.newaxis]  # index 0: image, gets a leading channel axis.
+            seg = all_data[1].astype('uint8')  # index 1: segmentation.
+
+            spatial_shp = data[0].shape
+            assert spatial_shp == seg.shape, "spatial shape incongruence betw. data and seg"
+            if np.any([spatial_shp[ix] < self.cf.pre_crop_size[ix] for ix in range(len(spatial_shp))]):  # pad if smaller.
+                new_shape = [np.max([spatial_shp[ix], self.cf.pre_crop_size[ix]]) for ix in range(len(spatial_shp))]
+                data = dutils.pad_nd_image(data, (len(data), *new_shape))
+                seg = dutils.pad_nd_image(seg, new_shape)
+
+            batch_data.append(data)
+            batch_segs.append(seg[np.newaxis])
+
+            for o in batch_roi_items: #after loop, holds every entry of every batchpatient per observable
+                    batch_roi_items[o].append(patient[o])
+
+            for tix in range(len(self.unique_ts)):  # presumably seg holds 1-based roi ids indexing the targets - TODO confirm.
+                non_zero = np.count_nonzero(patient[self.balance_target][np.unique(seg[seg>0]) - 1] == self.unique_ts[tix])
+                batch_roi_counts[tix] += non_zero
+                batch_empty_counts[tix] += int(non_zero == 0)
+                # todo remove assert when checked
+                if not np.any(seg):
+                    assert non_zero==0
+
+        batch = {'data': np.array(batch_data), 'seg': np.array(batch_segs).astype('uint8'),
+                 'pid': batch_pids,
+                 'roi_counts': batch_roi_counts, 'empty_counts': batch_empty_counts}
+        for key,val in batch_roi_items.items(): #extend batch dic by entries of observables dic
+            batch[key] = np.array(val)
+
+        return batch
+
+class PatientBatchIterator(dutils.PatientBatchIterator):
+    """
+    Test/validation generator: iterates over the given data set and returns exactly one patient per batch.
+    Used for testing, and for monitoring if cf.val_mode == 'val_patient', which is closer to the actual evaluation
+    than sampled batches at the price of speed during training.
+    Each patient array is read as: index 0 = image, index 1 = segmentation. Bounding-box targets are derived
+    from the segmentation by ConvertSegToBoundingBoxCoordinates (called with dim 2 here).
+
+    :param cf: config object; patch sizes, channels, roi_items and class_specific_seg are read.
+    :param data: dict mapping pid -> patient dict holding at least 'data' (array path), 'pid' and the roi items.
+    :return: (generate_train_batch) dictionary holding one patient, i.e. a batch of size 1.
+    """
+
+    def __init__(self, cf, data, mode='test'):
+        super().__init__(cf, data)
+
+        if cf.dim == 2:
+            self.patch_size = cf.patch_size_2D + [1]
+        else:
+            self.patch_size = cf.patch_size_3D
+        self.chans = np.index_exp[:] if cf.channels is None else cf.channels
+        assert hasattr(self.chans, "__iter__"), "self.chans has to be list-like to maintain dims when slicing"
+        self.patient_ix = 0  # position of the next patient in self.dataset_pids
+
+    def generate_train_batch(self, pid=None):
+        """Return the batch dict of patient `pid`; if pid is None the running index selects the patient."""
+        if pid is None:
+            pid = self.dataset_pids[self.patient_ix]
+        patient = self._data[pid]
+
+        # stored array: index 0 is the image, index 1 the segmentation; both get a leading channel axis.
+        stored = np.load(patient['data'], mmap_mode='r')
+        img = stored[0].astype('float16')[np.newaxis][self.chans]
+        seg = stored[1].astype('uint8')[np.newaxis]
+        assert img[0].shape == seg[0].shape, "spatial shape incongruence betw. data and seg"
+
+        # prepend the batch axis (one element per batch).
+        out_data = img[None]
+        out_seg = seg[None]
+        n_elements = len(out_data)
+
+        roi_items = self.cf.roi_items
+        batch = {'data': out_data, 'seg': out_seg}
+        for name in roi_items:
+            batch[name] = np.repeat(np.array([patient[name]]), n_elements, axis=0)
+        seg_to_boxes = ConvertSegToBoundingBoxCoordinates(2, roi_items, False, self.cf.class_specific_seg)
+        batch = seg_to_boxes(**batch)
+
+        # the patient-level entries reference the very same objects as the batch-level ones.
+        batch['patient_bb_target'] = batch['bb_target']
+        batch['original_img_shape'] = out_data.shape
+        for name in roi_items:
+            batch["patient_" + name] = batch[name]
+        batch['pid'] = np.array([patient['pid']] * n_elements)
+
+        # advance the running index and wrap around at the end of the data set.
+        next_ix = self.patient_ix + 1
+        self.patient_ix = 0 if next_ix == len(self.dataset_pids) else next_ix
+
+        return batch
+
+
+def create_data_gen_pipeline(cf, patient_data, do_aug=True, **kwargs):
+    """
+    create multi-threaded train/val batch generation and augmentation pipeline.
+    :param cf: config object; da_kwargs, patch_size, dim, roi_items and class_specific_seg are read.
+    :param patient_data: dictionary containing one dictionary per patient in the train/val subset.
+    :param do_aug: whether to apply data augmentation (training) or only a center crop (validation).
+    :param kwargs: forwarded to BatchGenerator.
+    :return: multithreaded_generator
+    """
+    # create instance of batch generator as first element in pipeline.
+    data_gen = BatchGenerator(cf, patient_data, **kwargs)
+
+    my_transforms = []
+    if do_aug:
+        # mirroring is optional: configs without a "mirror" entry in da_kwargs skip it instead of raising KeyError.
+        if cf.da_kwargs.get("mirror", False):
+            my_transforms.append(Mirror(axes=cf.da_kwargs['mirror_axes']))
+
+        spatial_transform = SpatialTransform(patch_size=cf.patch_size[:cf.dim],
+                                             patch_center_dist_from_border=cf.da_kwargs['rand_crop_dist'],
+                                             do_elastic_deform=cf.da_kwargs['do_elastic_deform'],
+                                             alpha=cf.da_kwargs['alpha'], sigma=cf.da_kwargs['sigma'],
+                                             do_rotation=cf.da_kwargs['do_rotation'], angle_x=cf.da_kwargs['angle_x'],
+                                             angle_y=cf.da_kwargs['angle_y'], angle_z=cf.da_kwargs['angle_z'],
+                                             do_scale=cf.da_kwargs['do_scale'], scale=cf.da_kwargs['scale'],
+                                             random_crop=cf.da_kwargs['random_crop'])
+
+        my_transforms.append(spatial_transform)
+    else:
+        my_transforms.append(CenterCropTransform(crop_size=cf.patch_size[:cf.dim]))
+
+    my_transforms.append(ConvertSegToBoundingBoxCoordinates(cf.dim, cf.roi_items, False, cf.class_specific_seg))
+    all_transforms = Compose(my_transforms)
+    multithreaded_generator = MultiThreadedAugmenter(data_gen, all_transforms, num_processes=data_gen.n_filled_threads,
+                                                     seeds=range(data_gen.n_filled_threads))
+    return multithreaded_generator
+
+def get_train_generators(cf, logger, data_statistics=False):
+    """
+    wrapper function for creating the training batch generator pipeline. returns the train/val generators.
+    the first cf.n_train_data ids of the data set are used for training, the fixed id range [1000:1500] for validation.
+    :param data_statistics: if True, let the data set compute and plot statistics of both subsets.
+    :return: dict with 'train' plus, depending on cf.val_mode, 'val_patient' or 'val_sampling' together with 'n_val'.
+    """
+    dataset = Dataset(cf, logger)
+    # NOTE(review): for cf.n_train_data > 1000 the validation ids are also part of the training ids - confirm intended.
+    train_ids = dataset.set_ids[:cf.n_train_data]
+    val_ids = dataset.set_ids[1000:1500]
+
+    # compare ids as strings via sets: constant-time lookups that do not depend on the dtype of the id array.
+    train_keys, val_keys = {str(pid) for pid in train_ids}, {str(pid) for pid in val_ids}
+    train_data = {k: v for (k, v) in dataset.data.items() if str(k) in train_keys}
+    val_data = {k: v for (k, v) in dataset.data.items() if str(k) in val_keys}
+
+    logger.info("data set loaded with: {} train / {} val patients".format(len(train_ids), len(val_ids)))
+    if data_statistics:
+        dataset.calc_statistics(subsets={"train": train_ids, "val": val_ids}, plot_dir=
+        os.path.join(cf.plot_dir,"dataset"))
+
+    batch_gen = {}
+    batch_gen['train'] = create_data_gen_pipeline(cf, train_data, do_aug=cf.do_aug, sample_pids_w_replace=True)
+    if cf.val_mode == 'val_patient':
+        batch_gen['val_patient'] = PatientBatchIterator(cf, val_data, mode='validation')
+        batch_gen['n_val'] = len(val_ids) if cf.max_val_patients=="all" else min(len(val_ids), cf.max_val_patients)
+    elif cf.val_mode == 'val_sampling':
+        batch_gen['n_val'] = int(np.ceil(len(val_data)/cf.batch_size)) if cf.num_val_batches == "all" else cf.num_val_batches
+        # in current setup, val loader is used like generator. with max_batches being applied in train routine.
+        batch_gen['val_sampling'] = create_data_gen_pipeline(cf, val_data, do_aug=False, sample_pids_w_replace=False,
+                                                             max_batches=None, raise_stop_iteration=False)
+
+    return batch_gen
+
+def get_test_generator(cf, logger):
+    """
+    Build the test-time patient iterator. With cf.held_out_test_set the data is read from cf.test_data_sourcedir,
+    otherwise the test ids of fold cf.fold are taken from 'fold_ids.pickle' in cf.exp_dir.
+    Each call builds a new Dataset (in a server environment this may trigger its data-copy step again).
+    """
+    sourcedir, test_ids = None, None
+    if cf.held_out_test_set:
+        sourcedir = cf.test_data_sourcedir
+    else:
+        # ids of the current fold are read from the split file in the experiment directory.
+        with open(os.path.join(cf.exp_dir, 'fold_ids.pickle'), 'rb') as split_file:
+            test_ids = pickle.load(split_file)[cf.fold]
+
+    test_set = Dataset(cf, logger, subset_ids=test_ids, data_sourcedir=sourcedir, mode='test')
+    n_patients = len(test_set.set_ids)
+    logger.info("data set loaded with: {} test patients".format(n_patients))
+    # 'test': iterator returning one patient per batch, 'n_test': number of patients to evaluate.
+    test_iterator = PatientBatchIterator(cf, test_set.data)
+    if cf.max_test_patients == "all":
+        n_test = n_patients
+    else:
+        n_test = min(cf.max_test_patients, n_patients)
+    return {'test': test_iterator, 'n_test': n_test}
+
+
+if __name__=="__main__":
+    # dev script: builds the generators, draws example batches and saves plots of them while recording timings.
+    import utils.exp_utils as utils
+    from datasets.toy.configs import Configs  # NOTE(review): configs of datasets/toy, not of this toy_mdt directory - confirm.
+
+    cf = Configs()
+
+    total_stime = time.time()
+    times = {}
+
+    # cf.server_env = True
+    # cf.data_dir = "experiments/dev_data"
+
+    cf.exp_dir = "experiments/dev/"
+    cf.plot_dir = cf.exp_dir + "plots"
+    os.makedirs(cf.exp_dir, exist_ok=True)
+    cf.fold = 0
+    logger = utils.get_logger(cf.exp_dir)
+    gens = get_train_generators(cf, logger)
+    train_loader = gens['train']
+    for i in range(0):  # range(0): the training-batch example below is currently switched off.
+        stime = time.time()
+        print("producing training batch nr ", i)
+        ex_batch = next(train_loader)
+        times["train_batch"] = time.time() - stime
+        #experiments/dev/dev_exbatch_{}.png".format(i)
+        plg.view_batch(cf, ex_batch, out_file="experiments/dev/dev_exbatch_{}.png".format(i), show_gt_labels=True, vmin=0, show_info=False)
+
+    # NOTE(review): get_train_generators only adds 'val_sampling' if cf.val_mode == 'val_sampling' (else KeyError here).
+    val_loader = gens['val_sampling']
+    stime = time.time()
+    for i in range(1):
+        ex_batch = next(val_loader)
+        times["val_batch"] = time.time() - stime
+        stime = time.time()
+        #"experiments/dev/dev_exvalbatch_{}.png"
+        plg.view_batch(cf, ex_batch, out_file="experiments/dev/dev_exvalbatch_{}.png".format(i), show_gt_labels=True, vmin=0, show_info=True)
+        times["val_plot"] = time.time() - stime
+    #
+    test_loader = get_test_generator(cf, logger)["test"]
+    stime = time.time()
+    ex_batch = test_loader.generate_train_batch(pid=None)  # pid=None: the iterator's running index picks the patient.
+    times["test_batch"] = time.time() - stime
+    stime = time.time()
+    plg.view_batch(cf, ex_batch, show_gt_labels=True, out_file="experiments/dev/dev_expatchbatch.png", vmin=0)
+    times["test_patchbatch_plot"] = time.time() - stime
+
+
+    # report the recorded timings (seconds, two decimals) and the total runtime of the script.
+    print("Times recorded throughout:")
+    for (k, v) in times.items():
+        print(k, "{:.2f}".format(v))
+
+    mins, secs = divmod((time.time() - total_stime), 60)
+    h, mins = divmod(mins, 60)
+    t = "{:d}h:{:02d}m:{:02d}s".format(int(h), int(mins), int(secs))
+    print("{} total runtime: {}".format(os.path.split(__file__)[1], t))
\ No newline at end of file
diff --git a/datasets/toy_mdt/generate_toys.py b/datasets/toy_mdt/generate_toys.py
new file mode 100644
index 0000000..8f0c35b
--- /dev/null
+++ b/datasets/toy_mdt/generate_toys.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+# Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import os, time
+import numpy as np
+import pandas as pd
+import pickle
+import argparse
+from multiprocessing import Pool
+
+def multi_processing_create_image(inputs):
+    """Create one toy sample (image + segmentation) and save it together with its meta info.
+
+    :param inputs: (out_dir, six, foreground_margin, class_diameters, mode, noisy_bg), packed for Pool.map.
+    """
+    out_dir, six, foreground_margin, class_diameters, mode, noisy_bg = inputs
+    print('processing {} {}'.format(out_dir, six))
+
+    # forked pool workers share the parent's global numpy RNG state -> draw from a freshly seeded RNG instead.
+    rng = np.random.RandomState()
+    img = rng.rand(320, 320) if noisy_bg else np.zeros((320, 320))
+    seg = np.zeros((320, 320)).astype('uint8')
+    center_x = rng.randint(foreground_margin, img.shape[1] - foreground_margin)
+    center_y = rng.randint(foreground_margin, img.shape[0] - foreground_margin)
+    class_id = rng.randint(0, 2)
+
+    # squared center distance per pixel (rows = y, cols = x); class_diameters entries act as radii here.
+    yy, xx = np.ogrid[:img.shape[0], :img.shape[1]]
+    sq_dist = (xx - center_x) ** 2 + (yy - center_y) ** 2
+    disc = sq_dist < class_diameters[class_id] ** 2
+    img[disc] += 0.2
+    seg[disc] = 1
+
+    if 'donuts' in mode and class_id == 1:
+        hole = sq_dist < 4 ** 2  # class 1 gets a hole; it is cut from the mask only in mode 'donuts_shape'.
+        img[hole] -= 0.2
+        if mode == 'donuts_shape':
+            seg[hole] = 0
+
+    out = np.concatenate((img[None], seg[None]))
+    out_path = os.path.join(out_dir, '{}.npy'.format(six))
+    np.save(out_path, out)
+    with open(os.path.join(out_dir, 'meta_info_{}.pickle'.format(six)), 'wb') as handle:
+        pickle.dump([out_path, class_id, str(six)], handle)
+
+
+def generate_experiment(cf, exp_name, n_train_images, n_test_images, mode, class_diameters=(20, 20), noisy_bg=False):
+    """Create the train and test samples of one toy experiment in parallel and aggregate their meta info.
+    :param cf: configs; only cf.root_dir is read, samples are written to <root_dir>/<exp_name>/{train,test}.
+    :param n_train_images, n_test_images: number of samples to create per split.
+    :param mode, class_diameters, noisy_bg: handed through to multi_processing_create_image.
+    """
+    train_dir, test_dir = [os.path.join(cf.root_dir, exp_name, split) for split in ('train', 'test')]
+    os.makedirs(train_dir, exist_ok=True)
+    os.makedirs(test_dir, exist_ok=True)
+    # enforced distance between object center and image edge.
+    foreground_margin = int(np.ceil(np.max(class_diameters) / 1.25))
+    info = [[out_dir, six, foreground_margin, class_diameters, mode, noisy_bg]
+            for out_dir, n_images in ((train_dir, n_train_images), (test_dir, n_test_images)) for six in range(n_images)]
+    print('starting creation of {} images'.format(len(info)))
+    # leave one core free, but never ask for fewer than one worker (cpu_count() may be 1 or None).
+    pool = Pool(processes=max(1, (os.cpu_count() or 1) - 1))
+    pool.map(multi_processing_create_image, info)
+    pool.close()
+    pool.join()
+
+    aggregate_meta_info(train_dir)
+    aggregate_meta_info(test_dir)
+
+
+def aggregate_meta_info(exp_dir):
+    """Merge all per-sample 'meta_info' pickles found in exp_dir into one df, saved there as 'info_df.pickle'."""
+    # sorted: os.listdir returns entries in arbitrary order, which made the row order differ between runs.
+    files = sorted(os.path.join(exp_dir, f) for f in os.listdir(exp_dir) if 'meta_info' in f)
+    df = pd.DataFrame(columns=['path', 'class_id', 'pid'])
+    for f in files:
+        with open(f, 'rb') as handle:
+            df.loc[len(df)] = pickle.load(handle)  # one row per sample: [path, class_id, pid]
+    df.to_pickle(os.path.join(exp_dir, 'info_df.pickle'))
+    print ("aggregated meta info to df with length", len(df))
+
+
+if __name__ == '__main__':
+    # script entry: creates one toy experiment per requested mode, then prints the total runtime.
+    stime = time.time()
+    import sys
+    sys.path.append("../..")  # relative path; presumably the repo root when run from this directory -- verify.
+    import utils.exp_utils as utils
+
+    mode_choices = ['donuts_shape', 'donuts_pattern', 'circles_scale']
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-m', '--modes', nargs='+', type=str, default=mode_choices, choices=mode_choices)
+    parser.add_argument('--noise', action='store_true', help="if given, add noise to the sample bg.")
+    parser.add_argument('--n_train', type=int, default=1500, help="Nr. of train images to generate.")
+    parser.add_argument('--n_test', type=int, default=1000, help="Nr. of test images to generate.")
+    args = parser.parse_args()
+
+    # configs.py is given as a relative path as well.
+    cf = utils.import_module("cf", "configs.py").configs()
+    # per mode: object size of class 0 and class 1.
+    class_diameters = {
+        'donuts_shape': (20, 20),
+        'donuts_pattern': (20, 20),
+        'circles_scale': (19, 20)
+    }
+    name_suffix = "_noise" if args.noise else ""
+
+    for mode in args.modes:
+        generate_experiment(cf, mode + name_suffix, n_train_images=args.n_train, n_test_images=args.n_test,
+                            mode=mode, class_diameters=class_diameters[mode], noisy_bg=args.noise)
+
+    total_mins, secs = divmod(time.time() - stime, 60)
+    hours, mins = divmod(total_mins, 60)
+    runtime = "{:d}h:{:02d}m:{:02d}s".format(int(hours), int(mins), int(secs))
+    print("{} total runtime: {}".format(os.path.split(__file__)[1], runtime))
+
+
diff --git a/exec.py b/exec.py
index 5d46dd6..7c7df4f 100644
--- a/exec.py
+++ b/exec.py
@@ -1,341 +1,341 @@
 #!/usr/bin/env python
 # Copyright 2019 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 """ execution script. this where all routines come together and the only script you need to call.
     refer to parse args below to see options for execution.
 """
 
 import plotting as plg
 
 import os
 import warnings
 import argparse
 import time
 
 import torch
 
 import utils.exp_utils as utils
 from evaluator import Evaluator
 from predictor import Predictor
 
 
 for msg in ["Attempting to set identical bottom==top results",
             "This figure includes Axes that are not compatible with tight_layout",
             "Data has no positive values, and therefore cannot be log-scaled.",
             ".*invalid value encountered in true_divide.*"]:
     warnings.filterwarnings("ignore", msg)
 
 
 def train(cf, logger):
     """
     performs the training routine for a given fold. saves plots and selected parameters to the experiment dir
     specified in the configs. logs to file and tensorboard.
     """
     logger.info('performing training in {}D over fold {} on experiment {} with model {}'.format(
         cf.dim, cf.fold, cf.exp_dir, cf.model))
     logger.time("train_val")
 
     # -------------- inits and settings -----------------
     net = model.net(cf, logger).cuda()
     if cf.optimizer == "ADAM":
         optimizer = torch.optim.Adam(net.parameters(), lr=cf.learning_rate[0], weight_decay=cf.weight_decay)
     elif cf.optimizer == "SGD":
         optimizer = torch.optim.SGD(net.parameters(), lr=cf.learning_rate[0], weight_decay=cf.weight_decay, momentum=0.3)
     if cf.dynamic_lr_scheduling:
         scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode=cf.scheduling_mode, factor=cf.lr_decay_factor,
                                                                     patience=cf.scheduling_patience)
     model_selector = utils.ModelSelector(cf, logger)
 
     starting_epoch = 1
     if cf.resume:
         checkpoint_path = os.path.join(cf.fold_dir, "last_state.pth")
         starting_epoch, net, optimizer, model_selector = \
             utils.load_checkpoint(checkpoint_path, net, optimizer, model_selector)
         logger.info('resumed from checkpoint {} to epoch {}'.format(checkpoint_path, starting_epoch))
 
     # prepare monitoring
     monitor_metrics = utils.prepare_monitoring(cf)
 
     logger.info('loading dataset and initializing batch generators...')
     batch_gen = data_loader.get_train_generators(cf, logger)
 
     # -------------- training -----------------
     for epoch in range(starting_epoch, cf.num_epochs + 1):
 
         logger.info('starting training epoch {}/{}'.format(epoch, cf.num_epochs))
         logger.time("train_epoch")
 
         net.train()
 
         train_results_list = []
         train_evaluator = Evaluator(cf, logger, mode='train')
 
         for i in range(cf.num_train_batches):
             logger.time("train_batch_loadfw")
             batch = next(batch_gen['train'])
             batch_gen['train'].generator.stats['roi_counts'] += batch['roi_counts']
             batch_gen['train'].generator.stats['empty_counts'] += batch['empty_counts']
 
             logger.time("train_batch_loadfw")
             logger.time("train_batch_netfw")
             results_dict = net.train_forward(batch)
             logger.time("train_batch_netfw")
             logger.time("train_batch_bw")
             optimizer.zero_grad()
             results_dict['torch_loss'].backward()
             if cf.clip_norm:
                 torch.nn.utils.clip_grad_norm_(net.parameters(), cf.clip_norm, norm_type=2) # gradient clipping
             optimizer.step()
             train_results_list.append(({k:v for k,v in results_dict.items() if k != "seg_preds"}, batch["pid"])) # slim res dict
             if not cf.server_env:
                 print("\rFinished training batch " +
                       "{}/{} in {:.1f}s ({:.2f}/{:.2f} forw load/net, {:.2f} backw).".format(i+1, cf.num_train_batches,
                                                                                              logger.get_time("train_batch_loadfw")+
                                                                                              logger.get_time("train_batch_netfw")
                                                                                              +logger.time("train_batch_bw"),
                                                                                              logger.get_time("train_batch_loadfw",reset=True),
                                                                                              logger.get_time("train_batch_netfw", reset=True),
                                                                                              logger.get_time("train_batch_bw", reset=True)), end="", flush=True)
         print()
 
         #--------------- train eval ----------------
         if (epoch-1)%cf.plot_frequency==0:
             # view an example batch
             utils.split_off_process(plg.view_batch, cf, batch, results_dict, has_colorchannels=cf.has_colorchannels,
                                     show_gt_labels=True, get_time="train-example plot",
                                     out_file=os.path.join(cf.plot_dir, 'batch_example_train_{}.png'.format(cf.fold)))
 
 
         logger.time("evals")
         _, monitor_metrics['train'] = train_evaluator.evaluate_predictions(train_results_list, monitor_metrics['train'])
         logger.time("evals")
         logger.time("train_epoch", toggle=False)
         del train_results_list
 
         #----------- validation ------------
         logger.info('starting validation in mode {}.'.format(cf.val_mode))
         logger.time("val_epoch")
         with torch.no_grad():
             net.eval()
             val_results_list = []
             val_evaluator = Evaluator(cf, logger, mode=cf.val_mode)
             val_predictor = Predictor(cf, net, logger, mode='val')
 
             for i in range(batch_gen['n_val']):
                 logger.time("val_batch")
                 batch = next(batch_gen[cf.val_mode])
                 if cf.val_mode == 'val_patient':
                     results_dict = val_predictor.predict_patient(batch)
                 elif cf.val_mode == 'val_sampling':
                     results_dict = net.train_forward(batch, is_validation=True)
                 val_results_list.append([results_dict, batch["pid"]])
                 if not cf.server_env:
                     print("\rFinished validation {} {}/{} in {:.1f}s.".format('patient' if cf.val_mode=='val_patient' else 'batch',
                                                                               i + 1, batch_gen['n_val'],
                                                                               logger.time("val_batch")), end="", flush=True)
             print()
 
             #------------ val eval -------------
             if (epoch - 1) % cf.plot_frequency == 0:
                 utils.split_off_process(plg.view_batch, cf, batch, results_dict, has_colorchannels=cf.has_colorchannels,
                                         show_gt_labels=True, get_time="val-example plot",
                                         out_file=os.path.join(cf.plot_dir, 'batch_example_val_{}.png'.format(cf.fold)))
 
             logger.time("evals")
             _, monitor_metrics['val'] = val_evaluator.evaluate_predictions(val_results_list, monitor_metrics['val'])
 
             model_selector.run_model_selection(net, optimizer, monitor_metrics, epoch)
             del val_results_list
             #----------- monitoring -------------
             monitor_metrics.update({"lr": 
                 {str(g) : group['lr'] for (g, group) in enumerate(optimizer.param_groups)}})
             logger.metrics2tboard(monitor_metrics, global_step=epoch)
             logger.time("evals")
 
             logger.info('finished epoch {}/{}, took {:.2f}s. train total: {:.2f}s, average: {:.2f}s. val total: {:.2f}s, average: {:.2f}s.'.format(
                 epoch, cf.num_epochs, logger.get_time("train_epoch")+logger.time("val_epoch"), logger.get_time("train_epoch"),
                 logger.get_time("train_epoch", reset=True)/cf.num_train_batches, logger.get_time("val_epoch"),
                 logger.get_time("val_epoch", reset=True)/batch_gen["n_val"]))
             logger.info("time for evals: {:.2f}s".format(logger.get_time("evals", reset=True)))
 
         #-------------- scheduling -----------------
         if cf.dynamic_lr_scheduling:
             scheduler.step(monitor_metrics["val"][cf.scheduling_criterion][-1])
         else:
             for param_group in optimizer.param_groups:
                 param_group['lr'] = cf.learning_rate[epoch-1]
 
     logger.time("train_val")
     logger.info("Training and validating over {} epochs took {}".format(cf.num_epochs, logger.get_time("train_val", format="hms", reset=True)))
     batch_gen['train'].generator.print_stats(logger, plot=True)
 
 def test(cf, logger, max_fold=None):
     """performs testing for a given fold (or held out set). saves stats in evaluator.
     """
     logger.time("test_fold")
     logger.info('starting testing model of fold {} in exp {}'.format(cf.fold, cf.exp_dir))
     net = model.net(cf, logger).cuda()
     batch_gen = data_loader.get_test_generator(cf, logger)
 
     test_predictor = Predictor(cf, net, logger, mode='test')
     test_results_list = test_predictor.predict_test_set(batch_gen, return_results = not hasattr(
         cf, "eval_test_separately") or not cf.eval_test_separately)
 
     if test_results_list is not None:
         test_evaluator = Evaluator(cf, logger, mode='test')
         test_evaluator.evaluate_predictions(test_results_list)
         test_evaluator.score_test_df(max_fold=max_fold)
 
     logger.info('Testing of fold {} took {}.\n'.format(cf.fold, logger.get_time("test_fold", reset=True, format="hms")))
 
 if __name__ == '__main__':
     stime = time.time()
 
     parser = argparse.ArgumentParser()
     parser.add_argument('--dataset_name', type=str, default='toy',
                         help="path to the dataset-specific code in source_dir/datasets")
     parser.add_argument('--exp_dir', type=str, default='/home/gregor/Documents/regrcnn/datasets/toy/experiments/dev',
                         help='path to experiment dir. will be created if non existent.')
     parser.add_argument('-m', '--mode', type=str,  default='train_test', help='one out of: create_exp, analysis, train, train_test, or test')
     parser.add_argument('-f', '--folds', nargs='+', type=int, default=None, help='None runs over all folds in CV. otherwise specify list of folds.')
     parser.add_argument('--server_env', default=False, action='store_true', help='change IO settings to deploy models on a cluster.')
     parser.add_argument('--data_dest', type=str, default=None, help="path to final data folder if different from config")
     parser.add_argument('--use_stored_settings', default=False, action='store_true',
                         help='load configs from existing exp_dir instead of source dir. always done for testing, '
                              'but can be set to true to do the same for training. useful in job scheduler environment, '
                              'where source code might change before the job actually runs.')
     parser.add_argument('--resume', action="store_true", default=False,
                         help='if given, resume from checkpoint(s) of the specified folds.')
     parser.add_argument('-d', '--dev', default=False, action='store_true', help="development mode: shorten everything")
 
     args = parser.parse_args()
     args.dataset_name = os.path.join("datasets", args.dataset_name) if not "datasets" in args.dataset_name else args.dataset_name
     folds = args.folds
     resume = None if args.resume in ['None', 'none'] else args.resume
 
     if args.mode == 'create_exp':
         cf = utils.prep_exp(args.dataset_name, args.exp_dir, args.server_env, use_stored_settings=False)
         logger = utils.get_logger(cf.exp_dir, cf.server_env, -1)
         logger.info('created experiment directory at {}'.format(args.exp_dir))
 
     elif args.mode == 'train' or args.mode == 'train_test':
         cf = utils.prep_exp(args.dataset_name, args.exp_dir, args.server_env, args.use_stored_settings)
         if args.dev:
             folds = [0,1]
-            cf.batch_size, cf.num_epochs, cf.min_save_thresh, cf.save_n_models = 3 if cf.dim==2 else 1, 2, 0, 1
+            cf.batch_size, cf.num_epochs, cf.min_save_thresh, cf.save_n_models = 3 if cf.dim==2 else 1, 2, 0, 2
             cf.num_train_batches, cf.num_val_batches, cf.max_val_patients = 5, 1, 1
-            cf.test_n_epochs =  cf.save_n_models
+            cf.test_n_epochs = cf.save_n_models
             cf.max_test_patients = 1
             torch.backends.cudnn.benchmark = cf.dim==3
         else:
             torch.backends.cudnn.benchmark = cf.cuda_benchmark
         if args.data_dest is not None:
             cf.data_dest = args.data_dest
             
         logger = utils.get_logger(cf.exp_dir, cf.server_env, cf.sysmetrics_interval)
         data_loader = utils.import_module('data_loader', os.path.join(args.dataset_name, 'data_loader.py'))
         model = utils.import_module('model', cf.model_path)
         logger.info("loaded model from {}".format(cf.model_path))
         if folds is None:
             folds = range(cf.n_cv_splits)
 
         for fold in folds:
             """k-fold cross-validation: the dataset is split into k equally-sized folds, one used for validation,
             one for testing, the rest for training. This loop iterates k-times over the dataset, cyclically moving the
             splits. k==folds, fold in [0,folds) says which split is used for testing.
             """
             cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(fold)); cf.fold = fold
             logger.set_logfile(fold=fold)
             cf.resume = resume
             if not os.path.exists(cf.fold_dir):
                 os.mkdir(cf.fold_dir)
             train(cf, logger)
             cf.resume = None
             if args.mode == 'train_test':
                 test(cf, logger)
 
     elif args.mode == 'test':
         cf = utils.prep_exp(args.dataset_name, args.exp_dir, args.server_env, use_stored_settings=True, is_training=False)
         if args.data_dest is not None:
             cf.data_dest = args.data_dest
         logger = utils.get_logger(cf.exp_dir, cf.server_env, cf.sysmetrics_interval)
         data_loader = utils.import_module('data_loader', os.path.join(args.dataset_name, 'data_loader.py'))
         model = utils.import_module('model', cf.model_path)
         logger.info("loaded model from {}".format(cf.model_path))
 
         fold_dirs = sorted([os.path.join(cf.exp_dir, f) for f in os.listdir(cf.exp_dir) if
                      os.path.isdir(os.path.join(cf.exp_dir, f)) and f.startswith("fold")])
         if folds is None:
             folds = range(cf.n_cv_splits)
         if args.dev:
             folds = folds[:2]
             cf.batch_size, cf.max_test_patients, cf.test_n_epochs = 1 if cf.dim==2 else 1, 2, 2
         else:
             torch.backends.cudnn.benchmark = cf.cuda_benchmark
         for fold in folds:
             cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(fold)); cf.fold = fold
             logger.set_logfile(fold=fold)
             if cf.fold_dir in fold_dirs:
                 test(cf, logger, max_fold=max([int(f[-1]) for f in fold_dirs]))
             else:
                 logger.info("Skipping fold {} since no model parameters found.".format(fold))
     # load raw predictions saved by predictor during testing, run aggregation algorithms and evaluation.
     elif args.mode == 'analysis':
         """ analyse already saved predictions.
         """
         cf = utils.prep_exp(args.dataset_name, args.exp_dir, args.server_env, use_stored_settings=True, is_training=False)
         logger = utils.get_logger(cf.exp_dir, cf.server_env, cf.sysmetrics_interval)
 
         if cf.held_out_test_set and not cf.eval_test_fold_wise:
             predictor = Predictor(cf, net=None, logger=logger, mode='analysis')
             results_list = predictor.load_saved_predictions()
             logger.info('starting evaluation...')
             cf.fold = 0
             evaluator = Evaluator(cf, logger, mode='test')
             evaluator.evaluate_predictions(results_list)
             evaluator.score_test_df(max_fold=0)
         else:
             fold_dirs = sorted([os.path.join(cf.exp_dir, f) for f in os.listdir(cf.exp_dir) if
                          os.path.isdir(os.path.join(cf.exp_dir, f)) and f.startswith("fold")])
             if args.dev:
                 fold_dirs = fold_dirs[:1]
             if folds is None:
                 folds = range(cf.n_cv_splits)
             for fold in folds:
                 cf.fold = fold; cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(cf.fold))
                 logger.set_logfile(fold=fold)
                 if cf.fold_dir in fold_dirs:
                     predictor = Predictor(cf, net=None, logger=logger, mode='analysis')
                     results_list = predictor.load_saved_predictions()
                     # results_list[x][1] is pid, results_list[x][0] is list of len samples-per-patient, each entry hlds
                     # list of boxes per that sample, i.e., len(results_list[x][y][0]) would be nr of boxes in sample y of patient x
                     logger.info('starting evaluation...')
                     evaluator = Evaluator(cf, logger, mode='test')
                     evaluator.evaluate_predictions(results_list)
                     max_fold = max([int(f[-1]) for f in fold_dirs])
                     evaluator.score_test_df(max_fold=max_fold)
                 else:
                     logger.info("Skipping fold {} since no model parameters found.".format(fold))
     else:
         raise ValueError('mode "{}" specified in args is not implemented.'.format(args.mode))
         
     mins, secs = divmod((time.time() - stime), 60)
     h, mins = divmod(mins, 60)
     t = "{:d}h:{:02d}m:{:02d}s".format(int(h), int(mins), int(secs))
     logger.info("{} total runtime: {}".format(os.path.split(__file__)[1], t))
     del logger
     torch.cuda.empty_cache()
 
diff --git a/predictor.py b/predictor.py
index 99035bd..d8063f2 100644
--- a/predictor.py
+++ b/predictor.py
@@ -1,1005 +1,1006 @@
 #!/usr/bin/env python
 # Copyright 2019 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 import os
 from multiprocessing import Pool
 import pickle
 import time
 
 import numpy as np
 import torch
 from scipy.stats import norm
 from collections import OrderedDict
 
 import plotting as plg
 import utils.model_utils as mutils
 import utils.exp_utils as utils
 
 
 def get_mirrored_patch_crops(patch_crops, org_img_shape):
     mirrored_patch_crops = []
     mirrored_patch_crops.append([[org_img_shape[2] - ii[1], org_img_shape[2] - ii[0], ii[2], ii[3]]
                                  if len(ii) == 4 else [org_img_shape[2] - ii[1], org_img_shape[2] - ii[0], ii[2],
                                                        ii[3], ii[4], ii[5]]
                                  for ii in patch_crops])
 
     mirrored_patch_crops.append([[ii[0], ii[1], org_img_shape[3] - ii[3], org_img_shape[3] - ii[2]]
                                  if len(ii) == 4 else [ii[0], ii[1], org_img_shape[3] - ii[3],
                                                        org_img_shape[3] - ii[2], ii[4], ii[5]]
                                  for ii in patch_crops])
 
     mirrored_patch_crops.append([[org_img_shape[2] - ii[1],
                                   org_img_shape[2] - ii[0],
                                   org_img_shape[3] - ii[3],
                                   org_img_shape[3] - ii[2]]
                                  if len(ii) == 4 else
                                  [org_img_shape[2] - ii[1],
                                   org_img_shape[2] - ii[0],
                                   org_img_shape[3] - ii[3],
                                   org_img_shape[3] - ii[2], ii[4], ii[5]]
                                  for ii in patch_crops])
 
     return mirrored_patch_crops
 
 def get_mirrored_patch_crops_ax_dep(patch_crops, org_img_shape, mirror_axes):
     mirrored_patch_crops = []
     for ax_ix, axes in enumerate(mirror_axes):
         if isinstance(axes, (int, float)) and int(axes) == 0:
             mirrored_patch_crops.append([[org_img_shape[2] - ii[1], org_img_shape[2] - ii[0], ii[2], ii[3]]
                                          if len(ii) == 4 else [org_img_shape[2] - ii[1], org_img_shape[2] - ii[0],
                                                                ii[2], ii[3], ii[4], ii[5]]
                                          for ii in patch_crops])
         elif isinstance(axes, (int, float)) and int(axes) == 1:
             mirrored_patch_crops.append([[ii[0], ii[1], org_img_shape[3] - ii[3], org_img_shape[3] - ii[2]]
                                          if len(ii) == 4 else [ii[0], ii[1], org_img_shape[3] - ii[3],
                                                                org_img_shape[3] - ii[2], ii[4], ii[5]]
                                          for ii in patch_crops])
         elif hasattr(axes, "__iter__") and (tuple(axes) == (0, 1) or tuple(axes) == (1, 0)):
             mirrored_patch_crops.append([[org_img_shape[2] - ii[1],
                                           org_img_shape[2] - ii[0],
                                           org_img_shape[3] - ii[3],
                                           org_img_shape[3] - ii[2]]
                                          if len(ii) == 4 else
                                          [org_img_shape[2] - ii[1],
                                           org_img_shape[2] - ii[0],
                                           org_img_shape[3] - ii[3],
                                           org_img_shape[3] - ii[2], ii[4], ii[5]]
                                          for ii in patch_crops])
         else:
             raise Exception("invalid mirror axes {} in get mirrored patch crops".format(axes))
 
     return mirrored_patch_crops
 
 def apply_wbc_to_patient(inputs):
     """
     wrapper around prediction box consolidation: weighted box clustering (wbc). processes a single patient.
     loops over batch elements in patient results (1 in 3D, slices in 2D) and foreground classes,
     aggregates and stores results in new list.
     :return. patient_results_list: list over batch elements. each element is a list over boxes, where each box is
                                  one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D
                                  predictions, and a dummy batch dimension of 1 for 3D predictions.
     :return. pid: string. patient id.
     """
     regress_flag, in_patient_results_list, pid, class_dict, clustering_iou, n_ens = inputs
     out_patient_results_list = [[] for _ in range(len(in_patient_results_list))]
 
     for bix, b in enumerate(in_patient_results_list):
 
         for cl in list(class_dict.keys()):
 
             boxes = [(ix, box) for ix, box in enumerate(b) if
                      (box['box_type'] == 'det' and box['box_pred_class_id'] == cl)]
             box_coords = np.array([b[1]['box_coords'] for b in boxes])
             box_scores = np.array([b[1]['box_score'] for b in boxes])
             box_center_factor = np.array([b[1]['box_patch_center_factor'] for b in boxes])
             box_n_overlaps = np.array([b[1]['box_n_overlaps'] for b in boxes])
             try:
                 box_patch_id = np.array([b[1]['patch_id'] for b in boxes])
             except KeyError: #backward compatibility for already saved pred results ... omg
                 box_patch_id = np.array([b[1]['ens_ix'] for b in boxes])
             box_regressions = np.array([b[1]['regression'] for b in boxes]) if regress_flag else None
             box_rg_bins = np.array([b[1]['rg_bin'] if 'rg_bin' in b[1].keys() else float('NaN') for b in boxes])
             box_rg_uncs = np.array([b[1]['rg_uncertainty'] if 'rg_uncertainty' in b[1].keys() else float('NaN') for b in boxes])
 
             if 0 not in box_scores.shape:
                 keep_scores, keep_coords, keep_n_missing, keep_regressions, keep_rg_bins, keep_rg_uncs = \
                     weighted_box_clustering(box_coords, box_scores, box_center_factor, box_n_overlaps, box_rg_bins, box_rg_uncs,
                                              box_regressions, box_patch_id, clustering_iou, n_ens)
 
 
                 for boxix in range(len(keep_scores)):
                     clustered_box = {'box_type': 'det', 'box_coords': keep_coords[boxix],
                                      'box_score': keep_scores[boxix], 'cluster_n_missing': keep_n_missing[boxix],
                                      'box_pred_class_id': cl}
                     if regress_flag:
                         clustered_box.update({'regression': keep_regressions[boxix],
                                               'rg_uncertainty': keep_rg_uncs[boxix],
                                               'rg_bin': keep_rg_bins[boxix]})
 
                     out_patient_results_list[bix].append(clustered_box)
 
         # add gt boxes back to new output list.
         out_patient_results_list[bix].extend([box for box in b if box['box_type'] == 'gt'])
 
     return [out_patient_results_list, pid]
 
 
 def weighted_box_clustering(box_coords, scores, box_pc_facts, box_n_ovs, box_rg_bins, box_rg_uncs,
                              box_regress, box_patch_id, thresh, n_ens):
     """Consolidates overlapping predictions resulting from patch overlaps, test data augmentations and temporal
     ensembling.
 
     Predictions are clustered greedily (like in NMS): the highest scoring remaining box is the cluster seed and all
     remaining boxes with iou > thresh w.r.t. the seed join its cluster. Output score and coordinates of one cluster
     are averages weighted by iou with the seed, box size and the individual patch-center factors (how trustworthy
     is this candidate measured by how centered its position within the patch is).
     The number of expected predictions at a position is n_ens * mean(n_overlaps of cluster members)
     (1 prediction per unique patch). Missing predictions are the expected predictions minus the number of unique
     patch ids found in the cluster; they enter the score average with the cluster's mean weight and a score of 0.
 
     :param box_coords: (n_dets, 4 or 6) array: y1, x1, y2, x2, (z1), (z2).
     :param scores: (n_dets,) confidence scores.
     :param box_pc_facts: (n_dets,) patch-center factors from position on patch tiles.
     :param box_n_ovs: (n_dets,) number of patch overlaps at box position.
     :param box_rg_bins: (n_dets,) regression bin predictions (only read if box_regress is not None).
     :param box_rg_uncs: (n_dets,) regression uncertainties (only read if box_regress is not None).
     :param box_regress: (n_dets, n_regression_features) or None if no regression was predicted.
     :param box_patch_id: (n_dets,) id of the patch (incl. ensemble instance) each box originates from.
     :param thresh: threshold for iou_matching.
     :param n_ens: number of models, that are ensembled. (-> number of expected predictions per position).
     :return: keep_scores: list (n_keep) of new scores of boxes to be kept.
     :return: keep_coords: list (n_keep) of [y1, x1, y2, x2, (z1), (z2)] new coordinates of boxes to be kept.
     :return: keep_n_missing: list (n_keep) of missing predictions per cluster in percent of expected predictions.
     :return: keep_regress: list (n_keep) of averaged regression vectors (NaN if box_regress is None).
     :return: keep_rg_bins: list (n_keep) of rounded, averaged regression bins (NaN if box_regress is None).
     :return: keep_rg_uncs: list (n_keep) of averaged regression uncertainties (NaN if box_regress is None).
     """
 
     dim = 2 if box_coords.shape[1] == 4 else 3
     y1 = box_coords[:, 0]
     x1 = box_coords[:, 1]
     y2 = box_coords[:, 2]
     x2 = box_coords[:, 3]
     coord_columns = [y1, x1, y2, x2]
 
     areas = (y2 - y1 + 1) * (x2 - x1 + 1)
     if dim == 3:
         z1 = box_coords[:, 4]
         z2 = box_coords[:, 5]
         areas *= (z2 - z1 + 1)
         coord_columns += [z1, z2]
 
     # order is the sorted index.  maps order to index o[1] = 24 (rank1, ix 24)
     order = scores.argsort()[::-1]
 
     keep_scores = []
     keep_coords = []
     keep_n_missing = []
     keep_regress = []
     keep_rg_bins = []
     keep_rg_uncs = []
 
     while order.size > 0:
         i = order[0]  # highest scoring element
         yy1 = np.maximum(y1[i], y1[order])
         xx1 = np.maximum(x1[i], x1[order])
         yy2 = np.minimum(y2[i], y2[order])
         xx2 = np.minimum(x2[i], x2[order])
 
         w = np.maximum(0, xx2 - xx1 + 1)
         h = np.maximum(0, yy2 - yy1 + 1)
         inter = w * h
 
         if dim == 3:
             zz1 = np.maximum(z1[i], z1[order])
             zz2 = np.minimum(z2[i], z2[order])
             d = np.maximum(0, zz2 - zz1 + 1)
             inter *= d
 
         # overlap between currently highest scoring box and all boxes. `/` is true division, so no float cast
         # is required (a float64 re-computation would differ from a float32 result merely by rounding).
         ovr = inter / (areas[i] + areas[order] - inter)
 
         # get all the predictions that match the current box to build one cluster. the seed box (position 0)
         # always belongs to its own cluster, which guarantees that `order` shrinks in every iteration
         # (otherwise e.g. thresh >= 1 would never terminate).
         is_match = ovr > thresh
         is_match[0] = True
         cluster = order[is_match]
 
         match_n_ovs = box_n_ovs[cluster]
         match_pc_facts = box_pc_facts[cluster]
         match_patch_id = box_patch_id[cluster]
         match_ov_facts = ovr[is_match]
         match_areas = areas[cluster]
         match_scores = scores[cluster]  # fancy indexing -> copy, `scores` itself stays untouched.
 
         # weight all scores in cluster by patch factors, and size.
         match_score_weights = match_ov_facts * match_areas * match_pc_facts
         match_scores *= match_score_weights
 
         # for the weighted average, scores have to be divided by the number of total expected preds at the position
         # of the current cluster. 1 Prediction per patch is expected. therefore, the number of ensembled models is
         # multiplied by the mean overlaps of  patches at this position (boxes of the cluster might partly be
         # in areas of different overlaps).
         n_expected_preds = n_ens * np.mean(match_n_ovs)
         # the number of missing predictions is obtained as the number of patches,
         # which did not contribute any prediction to the current cluster.
         n_missing_preds = np.max((0, n_expected_preds - np.unique(match_patch_id).shape[0]))
 
         # missing preds are given the mean weighting
         # (expected prediction is the mean over all predictions in cluster).
         denom = np.sum(match_score_weights) + n_missing_preds * np.mean(match_score_weights)
 
         # compute weighted average score for the cluster
         weighted_score_sum = np.sum(match_scores)
         avg_score = weighted_score_sum / denom
 
         # compute weighted average of coordinates for the cluster. now only take existing
         # predictions into account.
         avg_coords = [np.sum(column[cluster] * match_scores) / weighted_score_sum for column in coord_columns]
 
         if box_regress is not None:
             # compute wt. avg. of regression vectors (component-wise average)
             avg_regress = np.sum(box_regress[cluster] * match_scores[:, np.newaxis], axis=0) / weighted_score_sum
             avg_rg_bins = np.round(np.sum(box_rg_bins[cluster] * match_scores) / weighted_score_sum)
             avg_rg_uncs = np.sum(box_rg_uncs[cluster] * match_scores) / weighted_score_sum
         else:
             avg_regress = np.array(float('NaN'))
             avg_rg_bins = np.array(float('NaN'))
             avg_rg_uncs = np.array(float('NaN'))
 
         # some clusters might have very low scores due to high amounts of missing predictions.
         # filter out the with a conservative threshold, to speed up evaluation.
         # (a NaN score, e.g. from all-zero weights, compares False and is dropped as well.)
         if avg_score > 0.01:
             keep_scores.append(avg_score)
             keep_coords.append(avg_coords)
             keep_n_missing.append((n_missing_preds / n_expected_preds * 100))  # relative
             keep_regress.append(avg_regress)
             keep_rg_uncs.append(avg_rg_uncs)
             keep_rg_bins.append(avg_rg_bins)
 
         # get index of all elements that were not matched and discard all others (the seed is always discarded).
         remaining = ovr <= thresh
         remaining[0] = False
         order = order[remaining]
 
     return keep_scores, keep_coords, keep_n_missing, keep_regress, keep_rg_bins, keep_rg_uncs
 
 
 def apply_nms_to_patient(inputs):
     """Run class-wise non-maximum suppression on every batch element of one patient.
 
     :param inputs: sequence (in_patient_results_list, pid, class_dict, iou_thresh).
         in_patient_results_list is a list over batch elements, each a list of box dicts.
         Only the keys of class_dict are used (as class ids); iou_thresh is handed on to mutils.nms_numpy.
     :return: [out_patient_results_list, pid]. Per batch element: the detections surviving NMS
         (grouped by class in class_dict order), followed by all boxes of type 'gt'. Other box types are dropped.
     """
     patient_results, pid, class_dict, iou_thresh = inputs
     suppressed_results = []
 
     for batch_element in patient_results:
         survivors = []
         for class_id in class_dict:
             candidates = [box for box in batch_element
                           if box['box_type'] == 'det' and box['box_pred_class_id'] == class_id]
             coords = np.array([box['box_coords'] for box in candidates])
             confidences = np.array([box['box_score'] for box in candidates])
             if 0 in confidences.shape:
                 # no detections of this class in this batch element.
                 continue
             kept = mutils.nms_numpy(coords, confidences, iou_thresh)
             survivors.extend(candidates[ix] for ix in kept)
 
         # ground-truth boxes are passed through untouched.
         survivors.extend(box for box in batch_element if box['box_type'] == 'gt')
         suppressed_results.append(survivors)
 
     assert len(patient_results) == len(suppressed_results), "batch dim needs to be maintained, in: {}, out {}".format(len(patient_results), len(suppressed_results))
 
     return [suppressed_results, pid]
 
 def nms_2to3D(dets, thresh):
     """
     Merges 2D boxes of all slices to 3D cubes by an adaptation of non-maximum suppression.
 
     All boxes are treated as lying in one slice. Like in NMS, the highest scoring remaining box (the 'core' box)
     collects every box with xy-iou > thresh into a cluster. Other than in NMS, only those cluster members are
     suppressed whose slice ids are 'connected' to the core slice, i.e., reachable from the core slice without
     crossing a slice id for which the cluster holds no prediction (a 'hole').
 
     example: core slice is 50, other cluster members lie in slices 23, 46, 48, 49, 51, 52, 53, 56, 57.
     Merged into one cube are 48, 49, 50, 51, 52, 53 (47 and 54 are holes). All remaining boxes stay available
     for building further clusters.
 
     Works better with a certain min_confidence of predictions, since noisy low-confidence predictions can break
     the rather strong assumption that a cube's z-boundary lies at the first hole.
 
     :param dets: (n_detections, (y1, x1, y2, x2, scores, slice_id)
     :param thresh: iou matching threshold (like in NMS).
     :return: keep: list (n_keep) of indices (rows of dets) to be kept.
     :return: keep_z: list (n_keep) of [z1, z2], z-coordinates to be added to the kept boxes to form cubes.
     """
 
     y1, x1, y2, x2 = (dets[:, col] for col in range(4))
     assert np.all(y1 <= y2) and np.all(x1 <= x2), """"the definition of the coordinates is crucially important here: 
         where maximum is taken needs to be the lower coordinate"""
     scores, slice_id = dets[:, -2], dets[:, -1]
 
     areas = (x2 - x1 + 1) * (y2 - y1 + 1)
     order = scores.argsort()[::-1]  # indices into dets, sorted by descending score.
 
     keep, keep_z = [], []
 
     while order.size > 0:
         i = order[0]  # core box: highest scoring element still available (also compared to itself below).
 
         inter_h = np.maximum(0.0, np.minimum(y2[i], y2[order]) - np.maximum(y1[i], y1[order]) + 1)
         inter_w = np.maximum(0.0, np.minimum(x2[i], x2[order]) - np.maximum(x1[i], x1[order]) + 1)
         inter = inter_h * inter_w
         iou = inter / (areas[i] + areas[order] - inter)
 
         # positions within `order` of all boxes overlapping the core box in xy.
         xy_matches = np.nonzero(iou > thresh)[0]
         cluster_slices = slice_id[order[xy_matches]]
         core_slice = slice_id[int(i)]
         top_slice = np.max(cluster_slices)
         bottom_slice = np.min(cluster_slices)
 
         # slice ids between core slice and cluster extent without any prediction.
         holes_above = [s for s in np.arange(core_slice, top_slice) if s not in cluster_slices]
         holes_below = [s for s in np.arange(bottom_slice, core_slice) if s not in cluster_slices]
         upper_limit = np.min(holes_above) if holes_above else top_slice
         lower_limit = np.max(holes_below) if holes_below else bottom_slice
 
         connected = (cluster_slices >= lower_limit) & (cluster_slices <= upper_limit)
         z_matches = xy_matches[connected]
 
         # expand by one z voxel since box content is surrounded w/o overlap, i.e., z-content computed as z2-z1
         merged_slices = slice_id[order[z_matches]]
         cube_z = [np.min(merged_slices) - 1, np.max(merged_slices) + 1]
 
         keep.append(i)
         keep_z.append(cube_z)
         # only the connected boxes are consumed, the rest may seed or join later clusters.
         order = np.delete(order, z_matches, axis=0)
 
     return keep, keep_z
 
 def apply_2d_3d_merging_to_patient(inputs):
     """
     Wrapper around the 2D-to-3D merging operation for a single patient (methodology: see nms_2to3D).
     Takes 2D patient results (slices in batch dimension) and returns 3D patient results (dummy batch dimension of 1).
     Note: the kept detection dicts are the input dicts themselves; their 'box_coords' entry is replaced by a list
     extended with the z-coordinates.
     :param inputs: sequence (in_patient_results_list, pid, class_dict, merge_3D_iou). Only the keys of class_dict
                    are used (as class ids).
     :return. results_dict_boxes: list over batch elements (1 in 3D). each element is a list over boxes, where each box is
                                  one dictionary: [[box_0, ...], [box_n,...]].
     :return. pid: string. patient id.
     """
 
     slice_results, pid, class_dict, merge_3D_iou = inputs
     merged_boxes = []
 
     for class_id in class_dict:
         # gather this class' detections over all slices, remembering the slice each one came from.
         class_dets = []
         for slice_ix, slice_boxes in enumerate(slice_results):
             for box in slice_boxes:
                 if box['box_type'] == 'det' and box['box_pred_class_id'] == class_id:
                     class_dets.append((slice_ix, box))
 
         coords = np.array([box['box_coords'] for _, box in class_dets])
         confidences = np.array([box['box_score'] for _, box in class_dets])
         slice_ids = np.array([slice_ix for slice_ix, _ in class_dets])
 
         if 0 in confidences.shape:
             # nothing detected for this class.
             continue
 
         dets = np.concatenate((coords, confidences[:, None], slice_ids[:, None]), axis=1)
         keep_ix, keep_z = nms_2to3D(dets, merge_3D_iou)
 
         # store kept predictions in new results list and add corresponding z-dimension info to coordinates.
         for kix, kz in zip(keep_ix, keep_z):
             cube = class_dets[kix][1]
             cube['box_coords'] = list(cube['box_coords']) + kz
             merged_boxes.append(cube)
 
     gt_boxes = []
     for slice_boxes in slice_results:
         gt_boxes.extend(box for box in slice_boxes if box['box_type'] == 'gt')
     if len(gt_boxes) > 0:
         assert np.all([len(box["box_coords"]) == 6 for box in gt_boxes]), "expanded preds to 3D but GT is 2D."
     merged_boxes += gt_boxes
 
     return [[merged_boxes], pid]  # additional list wrapping is extra batch dim.
 
 
 class Predictor:
     """
 	    Prediction pipeline:
 	    - receives a patched patient image (n_patches, c, y, x, (z)) from patient data loader.
 	    - forwards patches through model in chunks of batch_size. (method: batch_tiling_forward)
 	    - unmolds predictions (boxes and segmentations) to original patient coordinates. (method: spatial_tiling_forward)
 
 	    Ensembling (mode == 'test'):
 	    - for inference, forwards mirrored versions of the image (one per entry in cf.test_aug_axes) through the
 	      model and unmolds predictions afterwards accordingly (method: data_aug_forward)
 	    - for inference, loads multiple parameter-sets of the trained model corresponding to different epochs. for each
 	      parameter-set loops over entire test set, runs prediction pipeline for each patient. (method: predict_test_set)
 
 	    Consolidation of predictions:
 	    - consolidates a patient's predictions (boxes, segmentations) collected over patches, data_aug- and temporal ensembling,
 	      performs clustering and weighted averaging (external function: apply_wbc_to_patient) to obtain consistent outputs.
 	    - for 2D networks, consolidates box predictions to 3D cubes via clustering (adaption of non-maximum suppression).
 	      (external function: apply_2d_3d_merging_to_patient)
 
 	    Ground truth handling:
 	    - dismisses any ground truth boxes returned by the model (happens in validation mode, patch-based ground truth)
 	    - if provided by data loader, adds patient-wise ground truth to the final predictions to be passed to the evaluator.
     """
     def __init__(self, cf, net, logger, mode):
 
         self.cf = cf
         self.batch_size = cf.batch_size
         self.logger = logger
         self.mode = mode
         self.net = net
         self.n_ens = 1
         self.rank_ix = '0'
         self.regress_flag = any(['regression' in task for task in self.cf.prediction_tasks])
 
         if self.cf.merge_2D_to_3D_preds:
             assert self.cf.dim == 2, "Merge 2Dto3D only valid for 2D preds, but current dim is {}.".format(self.cf.dim)
 
         if self.mode == 'test':
             last_state_path = os.path.join(self.cf.fold_dir, 'last_state.pth')
             try:
                 self.model_index = torch.load(last_state_path)["model_index"]
                 self.model_index = self.model_index[self.model_index["rank"] <= self.cf.test_n_epochs]
             except FileNotFoundError:
                 raise FileNotFoundError('no last_state/model_index file in fold directory. '
                                    'seems like you are trying to run testing without prior training...')
             self.n_ens = cf.test_n_epochs
             if self.cf.test_aug_axes is not None:
                 self.n_ens *= (len(self.cf.test_aug_axes)+1)
             self.example_plot_dir = os.path.join(cf.test_dir, "example_plots")
             os.makedirs(self.example_plot_dir, exist_ok=True)
 
     def batch_tiling_forward(self, batch):
         """
         calls the actual network forward method. in patch-based prediction, the batch dimension might be overladed
         with n_patches >> batch_size, which would exceed gpu memory. In this case, batches are processed in chunks of
         batch_size. validation mode calls the train method to monitor losses (returned ground truth objects are discarded).
         test mode calls the test forward method, no ground truth required / involved.
         :return. results_dict: stores the results for one patient. dictionary with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions,
                             and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z))
                  - loss / class_loss (only in validation mode)
         """
 
         img = batch['data']
 
         if img.shape[0] <= self.batch_size:
 
             if self.mode == 'val':
                 # call training method to monitor losses
                 results_dict = self.net.train_forward(batch, is_validation=True)
                 # discard returned ground-truth boxes (also training info boxes).
                 results_dict['boxes'] = [[box for box in b if box['box_type'] == 'det'] for b in results_dict['boxes']]
             elif self.mode == 'test':
                 results_dict = self.net.test_forward(batch, return_masks=self.cf.return_masks_in_test)
 
         else: # needs batch tiling
             split_ixs = np.split(np.arange(img.shape[0]), np.arange(img.shape[0])[::self.batch_size])
             chunk_dicts = []
             for chunk_ixs in split_ixs[1:]:  # first split is elements before 0, so empty
                 b = {k: batch[k][chunk_ixs] for k in batch.keys()
                      if (isinstance(batch[k], np.ndarray) and batch[k].shape[0] == img.shape[0])}
                 if self.mode == 'val':
                     chunk_dicts += [self.net.train_forward(b, is_validation=True)]
                 else:
                     chunk_dicts += [self.net.test_forward(b, return_masks=self.cf.return_masks_in_test)]
 
             results_dict = {}
             # flatten out batch elements from chunks ([chunk, chunk] -> [b, b, b, b, ...])
             results_dict['boxes'] = [item for d in chunk_dicts for item in d['boxes']]
             results_dict['seg_preds'] = np.array([item for d in chunk_dicts for item in d['seg_preds']])
 
             if self.mode == 'val':
                 # if hasattr(self.cf, "losses_to_monitor"):
                 #     loss_names = self.cf.losses_to_monitor
                 # else:
                 #     loss_names = {name for dic in chunk_dicts for name in dic if 'loss' in name}
                 # estimate patient loss by mean over batch_chunks. Most similar to training loss.
                 results_dict['torch_loss'] = torch.mean(torch.cat([d['torch_loss'] for d in chunk_dicts]))
                 results_dict['class_loss'] = np.mean([d['class_loss'] for d in chunk_dicts])
                 # discard returned ground-truth boxes (also training info boxes).
                 results_dict['boxes'] = [[box for box in b if box['box_type'] == 'det'] for b in results_dict['boxes']]
 
         return results_dict
 
     def spatial_tiling_forward(self, batch, patch_crops = None, n_aug='0'):
         """
         forwards batch to batch_tiling_forward method and receives and returns a dictionary with results.
         if patch-based prediction, the results received from batch_tiling_forward will be on a per-patch-basis.
         this method uses the provided patch_crops to re-transform all predictions to whole-image coordinates.
         Patch-origin information of all box-predictions will be needed for consolidation, hence it is stored as
         'patch_id', which is a unique string for each patch (also takes current data aug and temporal epoch instances
         into account). all box predictions get additional information about the amount overlapping patches at the
         respective position (used for consolidation).
         :param batch: dictionary handed on to batch_tiling_forward. with patch_crops, 'original_img_shape' is read
                       to allocate the whole-image outputs.
         :param patch_crops: None for non-patched prediction, otherwise one crop per patch. each crop pc is used as
                       pc[0]:pc[1] and pc[2]:pc[3] for the two in-plane axes and pc[4]:pc[5] for z (3D) or for the
                       slice/batch axis (2D).
         :param n_aug: string identifying the current data-augmentation instance; becomes part of 'patch_id'.
         :return. results_dict: stores the results for one patient. dictionary with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions,
                             and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z))
                  - monitor_values (only in validation mode)
         returned dict is a flattened version with 1 batch instance (3D) or slices (2D)
         """
 
         if patch_crops is not None:
             #print("patch_crops not None, applying patch center factor")
 
             patches_dict = self.batch_tiling_forward(batch)
             results_dict = {'boxes': [[] for _ in range(batch['original_img_shape'][0])]}
             #bc of ohe--> channel dim of seg has size num_classes
             out_seg_shape = list(batch['original_img_shape'])
             out_seg_shape[1] = patches_dict["seg_preds"].shape[1]
             # accumulate patch predictions in whole-image space and count how many patches cover each position.
             out_seg_preds = np.zeros(out_seg_shape, dtype=np.float16)
             patch_overlap_map = np.zeros_like(out_seg_preds, dtype='uint8')
             for pix, pc in enumerate(patch_crops):
                 if self.cf.dim == 3:
                     out_seg_preds[:, :, pc[0]:pc[1], pc[2]:pc[3], pc[4]:pc[5]] += patches_dict['seg_preds'][pix]
                     patch_overlap_map[:, :, pc[0]:pc[1], pc[2]:pc[3], pc[4]:pc[5]] += 1
                 elif self.cf.dim == 2:
                     out_seg_preds[pc[4]:pc[5], :, pc[0]:pc[1], pc[2]:pc[3], ] += patches_dict['seg_preds'][pix]
                     patch_overlap_map[pc[4]:pc[5], :, pc[0]:pc[1], pc[2]:pc[3], ] += 1
 
             # average the summed predictions wherever at least one patch contributed.
             out_seg_preds[patch_overlap_map > 0] /= patch_overlap_map[patch_overlap_map > 0]
             results_dict['seg_preds'] = out_seg_preds
 
             for pix, pc in enumerate(patch_crops):
                 patch_boxes = patches_dict['boxes'][pix]
                 for box in patch_boxes:
 
                     # add unique patch id for consolidation of predictions.
                     box['patch_id'] = self.rank_ix + '_' + n_aug + '_' + str(pix)
                     # boxes from the edges of a patch have a lower prediction quality, than the ones at patch-centers.
                     # hence they will be down-weighted for consolidation, using the 'box_patch_center_factor', which is
                     # obtained by a gaussian distribution over positions in the patch and average over spatial dimensions.
                     # Also the info 'box_n_overlaps' is stored for consolidation, which represents the amount of
                     # overlapping patches at the box's position.
 
                     c = box['box_coords']
                     #box_centers = np.array([(c[ii] + c[ii+2])/2 for ii in range(len(c)//2)])
                     box_centers = [(c[ii] + c[ii + 2]) / 2 for ii in range(2)]
                     if self.cf.dim == 3:
                         box_centers.append((c[4] + c[5]) / 2)
                     # gaussian centered at the patch center (sigma = 0.8 * half patch size), rescaled by
                     # sqrt(2*pi)*sigma so that a perfectly centered box gets factor 1. `pc` inside the
                     # comprehension is comprehension-local (half patch size) and does not touch the crop `pc`.
                     box['box_patch_center_factor'] = np.mean(
                         [norm.pdf(bc, loc=pc, scale=pc * 0.8) * np.sqrt(2 * np.pi) * pc * 0.8 for bc, pc in
                          zip(box_centers, np.array(self.cf.patch_size) / 2)])
                     # shift patch coordinates to whole-image coordinates. `c` aliases box['box_coords'], so the
                     # in-place += updates the box itself -- assumes box_coords is a numpy array, TODO confirm.
                     # NOTE(review): the overlap lookups below slice the first in-plane axis of patch_overlap_map
                     # with int_c[1]:int_c[3] and the second with int_c[0]:int_c[2], whereas the offsets just added
                     # pair c[0]/c[2] with pc[0] (first in-plane axis) and c[1]/c[3] with pc[2]. the floor/ceil
                     # pattern also alternates by index rather than by lower/upper coordinate. looks swapped --
                     # confirm against the intended coordinate convention.
                     if self.cf.dim == 3:
                         c += np.array([pc[0], pc[2], pc[0], pc[2], pc[4], pc[4]])
                         int_c = [int(np.floor(ii)) if ix%2 == 0 else int(np.ceil(ii))  for ix, ii in enumerate(c)]
                         box['box_n_overlaps'] = np.mean(patch_overlap_map[:, :, int_c[1]:int_c[3], int_c[0]:int_c[2], int_c[4]:int_c[5]])
                         results_dict['boxes'][0].append(box)
                     else:
                         c += np.array([pc[0], pc[2], pc[0], pc[2]])
                         int_c = [int(np.floor(ii)) if ix % 2 == 0 else int(np.ceil(ii)) for ix, ii in enumerate(c)]
                         box['box_n_overlaps'] = np.mean(
                             patch_overlap_map[pc[4], :, int_c[1]:int_c[3], int_c[0]:int_c[2]])
                         # in 2D, pc[4] is the slice the patch was cropped from.
                         results_dict['boxes'][pc[4]].append(box)
 
             if self.mode == 'val':
                 results_dict['torch_loss'] = patches_dict['torch_loss']
                 results_dict['class_loss'] = patches_dict['class_loss']
 
         else:
             # no patching: predictions already live in whole-image coordinates, use neutral consolidation info.
             results_dict = self.batch_tiling_forward(batch)
             for b in results_dict['boxes']:
                 for box in b:
                     box['box_patch_center_factor'] = 1
                     box['box_n_overlaps'] = 1
                     box['patch_id'] = self.rank_ix + '_' + n_aug
 
         return results_dict
 
     def data_aug_forward(self, batch):
         """
         in val_mode: passes batch through to spatial_tiling method without data_aug.
         in test_mode: if cf.test_aug is set in configs, createst 4 mirrored versions of the input image,
         passes all of them to the next processing step (spatial_tiling method) and re-transforms returned predictions
         to original image version.
         :return. results_dict: stores the results for one patient. dictionary with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions,
                             and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z))
                  - loss / class_loss (only in validation mode)
         """
         patch_crops = batch['patch_crop_coords'] if self.patched_patient else None
         results_list = [self.spatial_tiling_forward(batch, patch_crops)]
         org_img_shape = batch['original_img_shape']
 
         if self.mode == 'test' and self.cf.test_aug_axes is not None:
             if isinstance(self.cf.test_aug_axes, (int, float)):
                 self.cf.test_aug_axes = (self.cf.test_aug_axes,)
             #assert np.all(np.array(self.cf.test_aug_axes)<self.cf.dim), "test axes {} need to be spatial axes".format(self.cf.test_aug_axes)
 
             if self.patched_patient:
                 # apply mirror transformations to patch-crop coordinates, for correct tiling in spatial_tiling method.
                 mirrored_patch_crops = get_mirrored_patch_crops_ax_dep(patch_crops, batch['original_img_shape'],
                                                                        self.cf.test_aug_axes)
                 self.logger.info("mirrored patch crop coords for patched patient in test augs!")
             else:
                 mirrored_patch_crops = [None] * 3
 
             img = np.copy(batch['data'])
 
             for n_aug, sp_axis in enumerate(self.cf.test_aug_axes):
                 #sp_axis = np.array(axis) #-2 #spatial axis index
                 axis = np.array(sp_axis)+2
                 if isinstance(sp_axis, (int, float)):
                     # mirroring along one axis at a time
                     batch['data'] = np.flip(img, axis=axis).copy()
                     chunk_dict = self.spatial_tiling_forward(batch, mirrored_patch_crops[n_aug], n_aug=str(n_aug))
                     # re-transform coordinates.
                     for ix in range(len(chunk_dict['boxes'])):
                         for boxix in range(len(chunk_dict['boxes'][ix])):
                             coords = chunk_dict['boxes'][ix][boxix]['box_coords'].copy()
                             coords[sp_axis] = org_img_shape[axis] - chunk_dict['boxes'][ix][boxix]['box_coords'][sp_axis+2]
                             coords[sp_axis+2] = org_img_shape[axis] - chunk_dict['boxes'][ix][boxix]['box_coords'][sp_axis]
                             assert coords[2] >= coords[0], [coords, chunk_dict['boxes'][ix][boxix]['box_coords']]
                             assert coords[3] >= coords[1], [coords, chunk_dict['boxes'][ix][boxix]['box_coords']]
                             chunk_dict['boxes'][ix][boxix]['box_coords'] = coords
                     # re-transform segmentation predictions.
                     chunk_dict['seg_preds'] = np.flip(chunk_dict['seg_preds'], axis=axis)
 
                 elif hasattr(sp_axis, "__iter__") and tuple(sp_axis)==(0,1) or tuple(sp_axis)==(1,0):
                     #NEED: mirrored patch crops are given as [(y-axis), (x-axis), (y-,x-axis)], obey this order!
                     # mirroring along two axes at same time
                     batch['data'] = np.flip(np.flip(img, axis=axis[0]), axis=axis[1]).copy()
                     chunk_dict = self.spatial_tiling_forward(batch, mirrored_patch_crops[n_aug], n_aug=str(n_aug))
                     # re-transform coordinates.
                     for ix in range(len(chunk_dict['boxes'])):
                         for boxix in range(len(chunk_dict['boxes'][ix])):
                             coords = chunk_dict['boxes'][ix][boxix]['box_coords'].copy()
                             coords[sp_axis[0]] = org_img_shape[axis[0]] - chunk_dict['boxes'][ix][boxix]['box_coords'][sp_axis[0]+2]
                             coords[sp_axis[0]+2] = org_img_shape[axis[0]] - chunk_dict['boxes'][ix][boxix]['box_coords'][sp_axis[0]]
                             coords[sp_axis[1]] = org_img_shape[axis[1]] - chunk_dict['boxes'][ix][boxix]['box_coords'][sp_axis[1]+2]
                             coords[sp_axis[1]+2] = org_img_shape[axis[1]] - chunk_dict['boxes'][ix][boxix]['box_coords'][sp_axis[1]]
                             assert coords[2] >= coords[0], [coords, chunk_dict['boxes'][ix][boxix]['box_coords']]
                             assert coords[3] >= coords[1], [coords, chunk_dict['boxes'][ix][boxix]['box_coords']]
                             chunk_dict['boxes'][ix][boxix]['box_coords'] = coords
                     # re-transform segmentation predictions.
                     chunk_dict['seg_preds'] = np.flip(np.flip(chunk_dict['seg_preds'], axis=axis[0]), axis=axis[1]).copy()
 
                 else:
                     raise Exception("Invalid axis type {} in test augs".format(type(axis)))
                 results_list.append(chunk_dict)
 
             batch['data'] = img
 
         # aggregate all boxes/seg_preds per batch element from data_aug predictions.
         results_dict = {}
         results_dict['boxes'] = [[item for d in results_list for item in d['boxes'][batch_instance]]
                                  for batch_instance in range(org_img_shape[0])]
         # results_dict['seg_preds'] = np.array([[item for d in results_list for item in d['seg_preds'][batch_instance]]
         #                                       for batch_instance in range(org_img_shape[0])])
         results_dict['seg_preds'] = np.stack([dic['seg_preds'] for dic in results_list], axis=1)
         # needs segs probs in seg_preds entry:
         results_dict['seg_preds'] = np.sum(results_dict['seg_preds'], axis=1) #add up seg probs from different augs per class
 
         if self.mode == 'val':
             results_dict['torch_loss'] = results_list[0]['torch_loss']
             results_dict['class_loss'] = results_list[0]['class_loss']
 
         return results_dict
 
     def load_saved_predictions(self):
         """loads raw predictions saved by self.predict_test_set. aggregates and/or merges 2D boxes to 3D cubes for
             evaluation (if model predicts 2D but evaluation is run in 3D), according to settings config.
             loads either the results of the current fold or, for a held-out test set that is not evaluated
             fold-wise, the results of all folds of the experiment for which a results file exists.
             sets self.n_ens to the number of ensembled predictions per patient
             (test epochs * test-aug factor * number of folds loaded).
         :return: results_list: list over patients. each entry is a pair (results_dict, pid), where results_dict
             is a dict with keys:
             - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                        one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions
                        (if not merged to 3D), and a dummy batch dimension of 1 for 3D predictions.
             - 'batch_dices': dice scores as recorded in raw prediction results.
             - 'seg_preds': not implemented yet. could replace dices by seg preds to have raw seg info available, however
                 would consume critically large memory amount. todo evaluation of instance/semantic segmentation.
         """
 
         def map_over_patients(func, inputs):
             # run func on every patient in a worker pool. the workers are released even if a task raises.
             pool = Pool(processes=self.cf.n_workers)
             try:
                 return pool.map(func, inputs, chunksize=1)
             finally:
                 pool.close()
                 pool.join()
 
         # ensemble factor from test-time augmentation: one prediction per aug axis plus one.
         da_factor = len(self.cf.test_aug_axes)+1 if self.cf.test_aug_axes is not None else 1
 
         # NOTE: the result files are unpickled. only load files written by this code base, pickle is unsafe
         # for untrusted data.
         results_file = 'pred_results.pkl' if not self.cf.held_out_test_set else 'pred_results_held_out.pkl'
         if not self.cf.held_out_test_set or self.cf.eval_test_fold_wise:
             self.logger.info("loading saved predictions of fold {}".format(self.cf.fold))
             with open(os.path.join(self.cf.fold_dir, results_file), 'rb') as handle:
                 results_list = pickle.load(handle)
             self.n_ens = self.cf.test_n_epochs * da_factor
         else:
             self.logger.info("loading saved predictions of hold-out test set")
             results_list = []
             folds_loaded = 0
             for fold in range(self.cf.n_cv_splits):
                 fold_file = os.path.join(self.cf.exp_dir, 'fold_{}'.format(fold), results_file)
                 # check for the results file itself: a fold dir may exist without saved predictions.
                 if os.path.isfile(fold_file):
                     with open(fold_file, 'rb') as handle:
                         results_list += pickle.load(handle)
                     folds_loaded += 1
                 else:
                     self.logger.info("Skipping fold {} since no saved predictions found.".format(fold))
             self.n_ens = self.cf.test_n_epochs * da_factor * folds_loaded
 
         self.logger.info('loaded raw test set predictions with n_patients = {} and n_ens = {}'.format(
             len(results_list), self.n_ens))
 
         # boxes are taken as saved, i.e., ground-truth boxes are not filtered out here.
         box_results_list = [(res_dict["boxes"], pid) for res_dict, pid in results_list]
 
         # -------------- aggregation of boxes via clustering -----------------
 
         if self.cf.clustering == "wbc":
             self.logger.info('applying WBC to test-set predictions with iou {} and n_ens {} over {} patients'.format(
                 self.cf.clustering_iou, self.n_ens, len(box_results_list)))
 
             mp_inputs = [[self.regress_flag, ii[0], ii[1], self.cf.class_dict, self.cf.clustering_iou, self.n_ens] for ii
                          in box_results_list]
             del box_results_list
             box_results_list = map_over_patients(apply_wbc_to_patient, mp_inputs)
             del mp_inputs
         elif self.cf.clustering == "nms":
             self.logger.info('applying standard NMS to test-set predictions with iou {} over {} patients.'.format(
                 self.cf.clustering_iou, len(box_results_list)))
             mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.clustering_iou] for ii in box_results_list]
             del box_results_list
             box_results_list = map_over_patients(apply_nms_to_patient, mp_inputs)
             del mp_inputs
 
         if self.cf.merge_2D_to_3D_preds:
             self.logger.info('applying 2Dto3D merging to test-set predictions with iou = {}.'.format(self.cf.merge_3D_iou))
             mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.merge_3D_iou] for ii in box_results_list]
             box_results_list = map_over_patients(apply_2d_3d_merging_to_patient, mp_inputs)
             del mp_inputs
 
         # write the consolidated boxes back into the loaded per-patient results.
         for ix in range(len(results_list)):
             assert np.all(results_list[ix][1] == box_results_list[ix][1]), "pid mismatch between loaded and aggregated results"
             results_list[ix][0]["boxes"] = box_results_list[ix][0]
 
         return results_list # holds (results_dict, pid)
 
     def predict_patient(self, batch):
         """
         runs the prediction pipeline for a single patient.
         invoked either directly from the loop over the validation set in exec.py (mode=='val')
         or via self.predict_test_set (mode=='test').
         in val mode:  3D ground truth info is attached to the predictions, then consolidation and 2Dto3D merging
                       of the predictions are run.
         in test mode: raw predictions are returned (ground truth addition, consolidation, 2D to 3D merging are
                       left to self.predict_test_set, since predictions of a patient across several epochs might
                       have to be collected first, in case of temporal ensembling).
         :param batch: dict with the patient's data. a patient counts as patched if the key 'patch_crop_coords'
                       is present.
         :return: results_dict: the results for one patient. dictionary with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions
                             (if not merged to 3D), and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z))
                  - 'batch_dices': only in val mode and if 'dice' is among cf.metrics.
                  - '2D_boxes': boxes before 2Dto3D merging, only in val mode and if cf.merge_2D_to_3D_preds.
                  - loss / class_loss (only in validation mode)
         """
         # a patient delivered in patches carries its crop coordinates; its predictions need to be tiled.
         self.patched_patient = 'patch_crop_coords' in batch.keys()
 
         # send the batch through the prediction pipeline. entry 'seg_preds' holds seg probs.
         results_dict = self.data_aug_forward(batch)
 
         if self.mode != 'val':
             return results_dict
 
         # attach the ground-truth boxes of each batch element to its predicted boxes.
         n_elements = batch['patient_bb_target'].shape[0]
         for b in range(n_elements):
             n_targets = len(batch['patient_bb_target'][b])
             for t in range(n_targets):
                 gt_box = dict(box_type='gt', box_coords=batch['patient_bb_target'][b][t],
                               class_targets=batch['patient_class_targets'][b][t])
                 gt_box.update({name: batch['patient_'+name][b][t] for name in self.cf.roi_items})
                 results_dict['boxes'][b].append(gt_box)
 
         if 'dice' in self.cf.metrics:
             if self.patched_patient:
                 assert 'patient_seg' in batch.keys(), "Results_dict preds are in original patient shape."
                 seg_target = batch["patient_seg"]
             else:
                 seg_target = batch['seg']
             results_dict['batch_dices'] = mutils.dice_per_batch_and_class(
                 results_dict['seg_preds'], seg_target, self.cf.num_seg_classes, convert_to_ohe=True)
 
         # consolidate the boxes of patched patients: WBC if configured, standard NMS otherwise.
         if self.patched_patient:
             if self.cf.clustering == "wbc":
                 wbc_input = [self.regress_flag, results_dict['boxes'], 'dummy_pid', self.cf.class_dict,
                              self.cf.clustering_iou, self.n_ens]
                 results_dict['boxes'] = apply_wbc_to_patient(wbc_input)[0]
             else:
                 nms_inputs = [results_dict['boxes'], 'dummy_pid', self.cf.class_dict, self.cf.clustering_iou]
                 results_dict['boxes'] = apply_nms_to_patient(nms_inputs)[0]
 
         if self.cf.merge_2D_to_3D_preds:
             # keep the 2D boxes available next to the merged 3D boxes.
             boxes_2d = results_dict['boxes']
             results_dict['2D_boxes'] = boxes_2d
             merge_dims_inputs = [boxes_2d, 'dummy_pid', self.cf.class_dict, self.cf.merge_3D_iou]
             results_dict['boxes'] = apply_2d_3d_merging_to_patient(merge_dims_inputs)[0]
 
         return results_dict
 
     def predict_test_set(self, batch_gen, return_results=True):
         """
         wrapper around test method, which loads multiple (or one) epoch parameters (temporal ensembling), loops through
         the test set and collects predictions per patient. Also flattens the results per patient and epoch
         and adds optional ground truth boxes for evaluation. Saves out the raw result list for later analysis and
         optionally consolidates and returns predictions immediately.
         the raw results are pickled to cf.fold_dir as 'pred_results.pkl', or as 'pred_results_held_out.pkl' if
         cf.held_out_test_set is set.
         :param batch_gen: dict with entries 'test' (iterator that yields one batch per next() call) and
                  'n_test' (number of batches drawn per set of weights). each batch must hold exactly one pid.
         :param return_results: if True, boxes are consolidated (clustering and/or 2D-to-3D merging according to cf)
                  and the results are returned. otherwise the raw results are only saved.
         :return: (optionally) list_of_results_per_patient: list over patient results. each entry is a pair
                  [results_dict, pid], where results_dict is a dict with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions
                             (if not merged to 3D), and a dummy batch dimension of 1 for 3D predictions.
                  - 'batch_dices': mean dice scores over the collected predictions (only if 'dice' in cf.metrics).
                  - 'seg_preds': not implemented yet. todo evaluation of instance/semantic segmentation.
         """
 
         # -------------- raw predicting -----------------
         dict_of_patients_results = OrderedDict()
         set_of_result_types = set()
 
         self.model_index = self.model_index.sort_values(by="rank")
         # get paths of all parameter sets to be loaded for temporal ensembling. (or just one for no temp. ensembling).
         weight_paths = [os.path.join(self.cf.fold_dir, file_name) for file_name in self.model_index["file_name"]]
 
 
         for rank_ix, weight_path in enumerate(weight_paths):
             self.logger.info(('tmp ensembling over rank_ix:{} epoch:{}'.format(rank_ix, weight_path)))
             self.net.load_state_dict(torch.load(weight_path))
             self.net.eval()
             self.rank_ix = str(rank_ix)
             with torch.no_grad():
-                plot_batches = np.random.choice(np.arange(batch_gen['n_test']), size=self.cf.n_test_plots, replace=False)
+                plot_batches = np.random.choice(np.arange(batch_gen['n_test']),
+                                                size=min(batch_gen['n_test'], self.cf.n_test_plots), replace=False)
                 for i in range(batch_gen['n_test']):
                     batch = next(batch_gen['test'])
                     pid = np.unique(batch['pid'])
                     assert len(pid)==1
                     pid = pid[0]
 
                     if not pid in dict_of_patients_results.keys():  # store batch info in patient entry of results dict.
                         dict_of_patients_results[pid] = {}
                         dict_of_patients_results[pid]['results_dicts'] = []
                         dict_of_patients_results[pid]['patient_bb_target'] = batch['patient_bb_target']
 
                         for name in self.cf.roi_items:
                             dict_of_patients_results[pid]["patient_"+name] = batch["patient_"+name]
                     stime = time.time()
                     results_dict = self.predict_patient(batch) #only holds "boxes", "seg_preds"
                     # needs ohe seg probs in seg_preds entry:
                     # collapse the class axis (axis 1) to the index of the highest-scoring class, keep a singleton axis.
                     results_dict['seg_preds'] = np.argmax(results_dict['seg_preds'], axis=1)[:,np.newaxis]
                     self.logger.info("predicting patient {} with weight rank {} (progress: {}/{}) took {:.2f}s".format(
                         str(pid), rank_ix, (rank_ix)*batch_gen['n_test']+(i+1), len(weight_paths)*batch_gen['n_test'], time.time()-stime))
 
                     if i in plot_batches and (not self.patched_patient or 'patient_data' in batch.keys()):
                         try:
                             # view qualitative results of random test case
                             self.logger.time("test_plot")
                             out_file = os.path.join(self.example_plot_dir,
                                                     'batch_example_test_{}_rank_{}.png'.format(self.cf.fold, rank_ix))
                             utils.split_off_process(plg.view_batch, self.cf, batch, results_dict,
                                                     has_colorchannels=self.cf.has_colorchannels,
                                                     show_gt_labels=True, show_seg_ids='dice' in self.cf.metrics,
                                                     get_time="test-example plot", out_file=out_file)
                         except Exception as e:
                             # plotting is best effort: a failing plot is logged and must not abort the prediction.
                             self.logger.info("WARNING: error in view_batch: {}".format(e))
 
                     if 'dice' in self.cf.metrics:
                         if self.patched_patient:
                             assert 'patient_seg' in batch.keys(), "Results_dict preds are in original patient shape."
                         results_dict['batch_dices'] = mutils.dice_per_batch_and_class( results_dict['seg_preds'],
                                 batch["patient_seg"] if self.patched_patient else batch['seg'],
                                 self.cf.num_seg_classes, convert_to_ohe=True)
 
                     # only boxes and dices are kept per patient, seg_preds are dropped here.
                     dict_of_patients_results[pid]['results_dicts'].append({k:v for k,v in results_dict.items()
                                                                            if k in ["boxes", "batch_dices"]})
                     # collect result types to know which ones to look for when saving
                     set_of_result_types.update(dict_of_patients_results[pid]['results_dicts'][-1].keys())
 
 
 
         # -------------- re-order, save raw results -----------------
         self.logger.info('finished predicting test set. starting aggregation of predictions.')
         results_per_patient = []
         for pid, p_dict in dict_of_patients_results.items():
             # p_dict['results_dicts'] holds one entry per prediction of this patient, i.e., one per batch with
             # this pid over all weight ranks.
 
             results_dict = {}
             # collect all boxes/seg_preds of same batch_instance over temporal instances.
             b_size = len(p_dict['results_dicts'][0]["boxes"])
             for res_type in [rtype for rtype in set_of_result_types if rtype in ["boxes", "batch_dices"]]:#, "seg_preds"]]:
                 if not 'batch' in res_type: #assume it's results on batch-element basis
                     results_dict[res_type] = [[item for rank_dict in p_dict['results_dicts'] for item in rank_dict[res_type][batch_instance]]
                                              for batch_instance in range(b_size)]
                 else:
                     results_dict[res_type] = []
                     # NOTE(review): the loop variable `dict` shadows the builtin of the same name in this method.
                     for dict in p_dict['results_dicts']:
                         if 'dice' in res_type:
                             item = dict[res_type] #dict['batch_dices'] has shape (num_seg_classes,)
                             assert len(item) == self.cf.num_seg_classes, \
                                 "{}, {}".format(len(item), self.cf.num_seg_classes)
                         else:
                             raise NotImplementedError
                         results_dict[res_type].append(item)
                     # rdict[dice] shape (n_rank_epochs (n_saved_ranks), nsegclasses)
                     # calc mean over test epochs so inline with shape from sampling
                     results_dict[res_type] = np.mean(results_dict[res_type], axis=0) #maybe error type with other than dice
 
             if not hasattr(self.cf, "eval_test_separately") or not self.cf.eval_test_separately:
                 # add unpatched 2D or 3D (if dim==3 or merge_2D_to_3D) ground truth boxes for evaluation.
                 for b in range(p_dict['patient_bb_target'].shape[0]):
                     for targ in range(len(p_dict['patient_bb_target'][b])):
                         gt_box = {'box_type': 'gt', 'box_coords':p_dict['patient_bb_target'][b][targ],
                                   'class_targets': p_dict['patient_class_targets'][b][targ]}
                         for name in self.cf.roi_items:
                             gt_box.update({name: p_dict["patient_"+name][b][targ]})
                         results_dict['boxes'][b].append(gt_box)
 
             results_per_patient.append([results_dict, pid])
 
         # the raw (not yet consolidated) results are saved, independently of return_results.
         out_string = 'pred_results_held_out' if self.cf.held_out_test_set else 'pred_results'
         with open(os.path.join(self.cf.fold_dir, '{}.pkl'.format(out_string)), 'wb') as handle:
             pickle.dump(results_per_patient, handle)
 
         if return_results:
             # -------------- results processing, clustering, etc. -----------------
             final_patient_box_results = [ (res_dict["boxes"], pid) for res_dict,pid in results_per_patient ]
             if self.cf.clustering == "wbc":
                 self.logger.info('applying WBC to test-set predictions with iou = {} and n_ens = {}.'.format(
                     self.cf.clustering_iou, self.n_ens))
                 mp_inputs = [[self.regress_flag, ii[0], ii[1], self.cf.class_dict, self.cf.clustering_iou, self.n_ens] for ii in final_patient_box_results]
                 del final_patient_box_results
                 pool = Pool(processes=self.cf.n_workers)
                 final_patient_box_results = pool.map(apply_wbc_to_patient, mp_inputs, chunksize=1)
                 pool.close()
                 pool.join()
                 del mp_inputs
             elif self.cf.clustering == "nms":
                 self.logger.info('applying standard NMS to test-set predictions with iou = {}.'.format(self.cf.clustering_iou))
                 pool = Pool(processes=self.cf.n_workers)
                 mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.clustering_iou] for ii in final_patient_box_results]
                 del final_patient_box_results
                 final_patient_box_results = pool.map(apply_nms_to_patient, mp_inputs, chunksize=1)
                 pool.close()
                 pool.join()
                 del mp_inputs
 
             if self.cf.merge_2D_to_3D_preds:
                 self.logger.info('applying 2D-to-3D merging to test-set predictions with iou = {}.'.format(self.cf.merge_3D_iou))
                 mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.merge_3D_iou] for ii in final_patient_box_results]
                 del final_patient_box_results
                 pool = Pool(processes=self.cf.n_workers)
                 final_patient_box_results = pool.map(apply_2d_3d_merging_to_patient, mp_inputs, chunksize=1)
                 pool.close()
                 pool.join()
                 del mp_inputs
             # final_patient_box_results holds [avg_boxes, pid] if wbc
             # write the consolidated boxes back into the per-patient results.
             for ix in range(len(results_per_patient)):
                 assert results_per_patient[ix][1] == final_patient_box_results[ix][1], "should be same pid"
                 results_per_patient[ix][0]["boxes"] = final_patient_box_results[ix][0]
             # results_per_patient = [(res_dict["boxes"] = boxes, pid) for (boxes,pid) in final_patient_box_results]
 
             return results_per_patient # holds list of (results_dict, pid)