diff --git a/evaluator.py b/evaluator.py
index 68db83a..9bc4de3 100644
--- a/evaluator.py
+++ b/evaluator.py
@@ -1,437 +1,485 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 import os
 import numpy as np
 import pandas as pd
+import torch
+
 from sklearn.metrics import roc_auc_score, average_precision_score
 from sklearn.metrics import roc_curve, precision_recall_curve
 import utils.model_utils as mutils
 import plotting
 from multiprocessing import Pool
 
 
 class Evaluator():
 
 
     def __init__(self, cf, logger, mode='test'):
         """
         :param mode: either 'val_sampling', 'val_patient' or 'test'. handles prediction lists of different forms.
         """
         self.cf = cf
         self.logger = logger
         self.mode = mode
 
 
-    def evaluate_predictions(self, results_list, monitor_metrics=None):
-        """
-        Performs the matching of predicted boxes and ground truth boxes. Loops over list of matching IoUs and foreground classes.
-        Resulting info of each prediction is stored as one line in an internal dataframe, with the keys:
-        det_type: 'tp' (true positive), 'fp' (false positive), 'fn' (false negative), 'tn' (true negative)
-        pred_class: foreground class which the object predicts.
-        pid: corresponding patient-id.
-        pred_score: confidence score [0, 1]
-        fold: corresponding fold of CV.
-        match_iou: utilized IoU for matching.
-        :param results_list: list of model predictions. Either from train/val_sampling (patch processing) for monitoring with form:
-        [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
-        Or from val_patient/testing (patient processing), with form: [[results_0, pid_0], [results_1, pid_1], ...])
-        :param monitor_metrics (optional):  dict of dicts with all metrics of previous epochs.
-        :return monitor_metrics: if provided (during training), return monitor_metrics now including results of current epoch.
-        """
-        # gets results_list = [[batch_instances_box_lists], [batch_instances_pids]]*n_batches
-        # we want to evaluate one batch_instance (= 2D or 3D image) at a time.
+    def eval_losses(self, batch_res_dicts):
+        """Store the mean over all batch result dicts of each monitored loss in self.epoch_losses (name -> float)."""
+        names = self.cf.losses_to_monitor if hasattr(self.cf, "losses_to_monitor") else \
+            {key for res_dict in batch_res_dicts for key in res_dict if 'loss' in key}
+        # a loss missing from some batch dicts is averaged over the batches that report it.
+        self.epoch_losses = {name: torch.tensor([res_dict[name] for res_dict in batch_res_dicts
+                                                 if name in res_dict]).mean().item() for name in names}
+
+    def eval_boxes(self, batch_res_dicts, pid_list):
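+        """Matches detections with ground truth boxes per match IoU and class; results are stored row-wise in self.test_df."""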
 
         df_list_preds = []
         df_list_labels = []
         df_list_class_preds = []
         df_list_pids = []
         df_list_type = []
         df_list_match_iou = []
 
-        self.logger.info('evaluating in mode {}'.format(self.mode))
-
 
         if self.mode == 'train' or self.mode=='val_sampling':
+            # one pid per batch element
             # batch_size > 1, with varying patients across batch:
             # [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
-            # -> [results_0, results_1, ..] , [pid_0, pid_1, ...]
-            batch_elements_list = [[b_box_list] for item in results_list for b_box_list in item[0]]
-            pid_list = [pid for item in results_list for pid in item[1]]
+            # -> [results_0, results_1, ..]
+            batch_inst_boxes = [b_res_dict['boxes'] for b_res_dict in batch_res_dicts]  # len: nr of batches in epoch
+            batch_inst_boxes = [[b_inst_boxes] for whole_batch_boxes in batch_inst_boxes for b_inst_boxes in
+                                whole_batch_boxes]
         else:
             # patient processing, one element per batch = one patient.
-            # [[results_0, pid_0], [results_1, pid_1], ...] -> [results_0, results_1, ..] , [pid_0, pid_1, ...]
-            batch_elements_list = [item[0] for item in results_list]
-            pid_list = [item[1] for item in results_list]
+            # [[results_0, pid_0], [results_1, pid_1], ...] -> [results_0, results_1, ..]
+            batch_inst_boxes = [b_res_dict['boxes'] for b_res_dict in batch_res_dicts]
+
+        assert len(batch_inst_boxes) == len(pid_list)
 
         for match_iou in self.cf.ap_match_ious:
             self.logger.info('evaluating with match_iou: {}'.format(match_iou))
             for cl in list(self.cf.class_dict.keys()):
                 for pix, pid in enumerate(pid_list):
 
                     len_df_list_before_patient = len(df_list_pids)
 
                     # input of each batch element is a list of boxes, where each box is a dictionary.
-                    for bix, b_boxes_list in enumerate(batch_elements_list[pix]):
+                    for bix, b_boxes_list in enumerate(batch_inst_boxes[pix]):
 
                         b_tar_boxes = np.array([box['box_coords'] for box in b_boxes_list if
                                                 (box['box_type'] == 'gt' and box['box_label'] == cl)])
                         b_cand_boxes = np.array([box['box_coords'] for box in b_boxes_list if
                                                  (box['box_type'] == 'det' and
                                                   box['box_pred_class_id'] == cl)])
                         b_cand_scores = np.array([box['box_score'] for box in b_boxes_list if
                                                   (box['box_type'] == 'det' and
                                                    box['box_pred_class_id'] == cl)])
 
                         # check if predictions and ground truth boxes exist and match them according to match_iou.
                         if not 0 in b_cand_boxes.shape and not 0 in b_tar_boxes.shape:
                             overlaps = mutils.compute_overlaps(b_cand_boxes, b_tar_boxes)
                             match_cand_ixs = np.argwhere(np.max(overlaps, 1) > match_iou)[:, 0]
                             non_match_cand_ixs = np.argwhere(np.max(overlaps, 1) <= match_iou)[:, 0]
                             match_gt_ixs = np.argmax(overlaps[match_cand_ixs, :],
                                                      1) if not 0 in match_cand_ixs.shape else np.array([])
                             non_match_gt_ixs = np.array(
                                 [ii for ii in np.arange(b_tar_boxes.shape[0]) if ii not in match_gt_ixs])
                             unique, counts = np.unique(match_gt_ixs, return_counts=True)
 
                             # check for double assignments, i.e. two predictions having been assigned to the same gt.
                             # according to the COCO-metrics, only one prediction counts as true positive, the rest counts as
                             # false positive. This case is supposed to be avoided by the model itself by,
                             #  e.g. using a low enough NMS threshold.
                             if np.any(counts > 1):
                                 double_match_gt_ixs = unique[np.argwhere(counts > 1)[:, 0]]
                                 keep_max = []
                                 double_match_list = []
                                 for dg in double_match_gt_ixs:
                                     double_match_cand_ixs = match_cand_ixs[np.argwhere(match_gt_ixs == dg)]
                                     keep_max.append(double_match_cand_ixs[np.argmax(b_cand_scores[double_match_cand_ixs])])
                                     double_match_list += [ii for ii in double_match_cand_ixs]
 
                                 fp_ixs = np.array([ii for ii in match_cand_ixs if
                                                      (ii in double_match_list and ii not in keep_max)])
 
                                 match_cand_ixs = np.array([ii for ii in match_cand_ixs if ii not in fp_ixs])
 
                                 df_list_preds += [ii for ii in b_cand_scores[fp_ixs]]
                                 df_list_labels += [0] * fp_ixs.shape[0]
                                 df_list_class_preds += [cl] * fp_ixs.shape[0]
                                 df_list_pids += [pid] * fp_ixs.shape[0]
                                 df_list_type += ['det_fp'] * fp_ixs.shape[0]
 
                             # matched:
                             if not 0 in match_cand_ixs.shape:
                                 df_list_preds += [ii for ii in b_cand_scores[match_cand_ixs]]
                                 df_list_labels += [1] * match_cand_ixs.shape[0]
                                 df_list_class_preds += [cl] * match_cand_ixs.shape[0]
                                 df_list_pids += [pid] * match_cand_ixs.shape[0]
                                 df_list_type += ['det_tp'] * match_cand_ixs.shape[0]
                             # rest fp:
                             if not 0 in non_match_cand_ixs.shape:
                                 df_list_preds += [ii for ii in b_cand_scores[non_match_cand_ixs]]
                                 df_list_labels += [0] * non_match_cand_ixs.shape[0]
                                 df_list_class_preds += [cl] * non_match_cand_ixs.shape[0]
                                 df_list_pids += [pid] * non_match_cand_ixs.shape[0]
                                 df_list_type += ['det_fp'] * non_match_cand_ixs.shape[0]
                             # rest fn:
                             if not 0 in non_match_gt_ixs.shape:
                                 df_list_preds += [0] * non_match_gt_ixs.shape[0]
                                 df_list_labels += [1] * non_match_gt_ixs.shape[0]
                                 df_list_class_preds += [cl] * non_match_gt_ixs.shape[0]
                                 df_list_pids += [pid]  * non_match_gt_ixs.shape[0]
                                 df_list_type += ['det_fn']  * non_match_gt_ixs.shape[0]
                         # only fp:
                         if not 0 in b_cand_boxes.shape and 0 in b_tar_boxes.shape:
                             df_list_preds += [ii for ii in b_cand_scores]
                             df_list_labels += [0] * b_cand_scores.shape[0]
                             df_list_class_preds += [cl] * b_cand_scores.shape[0]
                             df_list_pids += [pid] * b_cand_scores.shape[0]
                             df_list_type += ['det_fp'] * b_cand_scores.shape[0]
                         # only fn:
                         if 0 in b_cand_boxes.shape and not 0 in b_tar_boxes.shape:
                             df_list_preds += [0] * b_tar_boxes.shape[0]
                             df_list_labels += [1] * b_tar_boxes.shape[0]
                             df_list_class_preds += [cl] * b_tar_boxes.shape[0]
                             df_list_pids += [pid] * b_tar_boxes.shape[0]
                             df_list_type += ['det_fn'] * b_tar_boxes.shape[0]
 
                     # empty patient with 0 detections needs patient dummy score, in order to not disappear from stats.
                     # filtered out for roi-level evaluation later. During training (and val_sampling),
                     # tn are assigned per sample independently of associated patients.
                     if len(df_list_pids) == len_df_list_before_patient:
                         df_list_preds += [0] * 1
                         df_list_labels += [0] * 1
                         df_list_class_preds += [cl] * 1
                         df_list_pids += [pid] * 1
                         df_list_type += ['patient_tn'] * 1 # true negative: no ground truth boxes, no detections.
 
             df_list_match_iou += [match_iou] * (len(df_list_preds) - len(df_list_match_iou))
 
         self.test_df = pd.DataFrame()
         self.test_df['pred_score'] = df_list_preds
         self.test_df['class_label'] = df_list_labels
         self.test_df['pred_class'] = df_list_class_preds
         self.test_df['pid'] = df_list_pids
         self.test_df['det_type'] = df_list_type
         self.test_df['fold'] = self.cf.fold
         self.test_df['match_iou'] = df_list_match_iou
+
+
+    def evaluate_predictions(self, results_list, monitor_metrics=None):
+        """
+        Averages the epoch's losses (eval_losses) and matches predicted boxes with ground truth boxes over all matching IoUs and foreground classes (eval_boxes).
+        Resulting info of each prediction is stored as one line in an internal dataframe (self.test_df), with the keys:
+        det_type: 'det_tp' (true positive), 'det_fp' (false positive), 'det_fn' (false negative), 'patient_tn' (dummy row if nothing was recorded for a patient)
+        pred_class: foreground class which the object predicts.
+        pid: corresponding patient-id.
+        pred_score: confidence score [0, 1]; class_label: 1 for true positives and false negatives, else 0.
+        fold: corresponding fold of CV.
+        match_iou: utilized IoU for matching.
+        :param results_list: list with one entry [results_dict, pids] per batch, where results_dict holds the 'boxes' and the loss entries.
+        In train/val_sampling (patch processing), pids is a list with one pid per batch instance: [[results_dict_0, [pid_0, ...]], ...]
+        In val_patient/test (patient processing), each batch must belong to exactly one patient: [[results_dict_0, pid_0], ...]
+        :param monitor_metrics (optional):  dict of dicts with all metrics of previous epochs.
+        :return: None if monitor_metrics is None, else the tuple (all_stats, monitor_metrics) from return_metrics for the current epoch.
+        """
+
+        self.logger.info('evaluating in mode {}'.format(self.mode))
+
+        batch_res_dicts = [batch[0] for batch in results_list]  # len: nr of batches in epoch
+        if self.mode == 'train' or self.mode == 'val_sampling':
+            # one pid per batch element
+            # [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
+            # -> [pid_0, pid_1, ...]
+            # flatten to one pid per batch instance; eval_boxes wraps each instance's boxes in a list to match the per-patient form.
+            pid_list = [batch_instance_pid for batch in results_list for batch_instance_pid in batch[1]]
+        elif self.mode == "val_patient" or self.mode == "test":
+            # [[results_0, pid_0], [results_1, pid_1], ...] -> [pid_0, pid_1, ...]
+            # in patientbatchiterator there is only one pid per batch
+            pid_list = [np.unique(batch[1]) for batch in results_list]
+            assert np.all([len(pid) == 1 for pid in
+                           pid_list]), "pid list in patient-eval mode, should only contain a single scalar per patient: {}".format(
+                pid_list)
+            pid_list = [pid[0] for pid in pid_list]
+            # todo remove assert
+            pid_list_orig = [item[1] for item in results_list]
+            assert np.all(pid_list == pid_list_orig)
+        else:
+            raise Exception("undefined run mode encountered")
+
+        self.eval_losses(batch_res_dicts)
+        self.eval_boxes(batch_res_dicts, pid_list)
+
         if monitor_metrics is not None:
+            # return all_stats, updated monitor_metrics
             return self.return_metrics(monitor_metrics)
 
 
     def return_metrics(self, monitor_metrics=None):
         """
         calculates AP/AUC scores for internal dataframe. called directly from evaluate_predictions during training for monitoring,
         or from score_test_df during inference (for single folds or aggregated test set). Loops over foreground classes
         and score_levels (typically 'roi' and 'patient'), gets scores and stores them. Optionally creates plots of
         prediction histograms and roc/prc curves.
         :param monitor_metrics: dict of dicts with all metrics of previous epochs.
         this function adds metrics for current epoch and returns the same object.
         :return: all_stats: list. Contains dicts with resulting scores for each combination of foreground class and
         score_level.
         :return: monitor_metrics
         """
+
+        # -------------- monitoring independent of class, score level ------------
+        if monitor_metrics is not None:
+            for l_name in self.epoch_losses:
+                monitor_metrics[l_name] = [self.epoch_losses[l_name]]
+
+
         df = self.test_df
 
         all_stats = []
         for cl in list(self.cf.class_dict.keys()):
             cl_df = df[df.pred_class == cl]
 
             for score_level in self.cf.report_score_level:
                 stats_dict = {}
                 stats_dict['name'] = 'fold_{} {} cl_{}'.format(self.cf.fold, score_level, cl)
 
                 if score_level == 'rois':
                     # kick out dummy entries for true negative patients. not needed on roi-level.
                     spec_df = cl_df[cl_df.det_type != 'patient_tn']
                     stats_dict['ap'] = get_roi_ap_from_df([spec_df, self.cf.min_det_thresh, self.cf.per_patient_ap])
                     # AUC not sensible on roi-level, since true negative box predictions do not exist. Would reward
                     # higher amounts of low confidence false positives.
                     stats_dict['auc'] = np.nan
                     stats_dict['roc'] = np.nan
                     stats_dict['prc'] = np.nan
 
                     # for the aggregated test set case, additionally get the scores for averaging over fold results.
                     if len(df.fold.unique()) > 1:
                         aps = []
                         for fold in df.fold.unique():
                             fold_df = spec_df[spec_df.fold == fold]
                             aps.append(get_roi_ap_from_df([fold_df, self.cf.min_det_thresh, self.cf.per_patient_ap]))
                         stats_dict['mean_ap'] = np.mean(aps)
                         stats_dict['mean_auc'] = 0
 
                 # on patient level, aggregate predictions per patient (pid): The patient predicted score is the highest
                 # confidence prediction for this class. The patient class label is 1 if roi of this class exists in patient, else 0.
                 if score_level == 'patient':
                     spec_df = cl_df.groupby(['pid'], as_index=False).agg({'class_label': 'max', 'pred_score': 'max', 'fold': 'first'})
 
                     if len(spec_df.class_label.unique()) > 1:
                         stats_dict['auc'] = roc_auc_score(spec_df.class_label.tolist(), spec_df.pred_score.tolist())
                         stats_dict['roc'] = roc_curve(spec_df.class_label.tolist(), spec_df.pred_score.tolist())
                     else:
                         stats_dict['auc'] = np.nan
                         stats_dict['roc'] = np.nan
 
                     if (spec_df.class_label == 1).any():
                         stats_dict['ap'] = average_precision_score(spec_df.class_label.tolist(), spec_df.pred_score.tolist())
                         stats_dict['prc'] = precision_recall_curve(spec_df.class_label.tolist(), spec_df.pred_score.tolist())
                     else:
                         stats_dict['ap'] = np.nan
                         stats_dict['prc'] = np.nan
 
                     # for the aggregated test set case, additionally get the scores for averaging over fold results.
                     if len(df.fold.unique()) > 1:
                         aucs = []
                         aps = []
                         for fold in df.fold.unique():
                             fold_df = spec_df[spec_df.fold == fold]
                             if len(fold_df.class_label.unique()) > 1:
                                 aucs.append(roc_auc_score(fold_df.class_label.tolist(), fold_df.pred_score.tolist()))
                             if (fold_df.class_label == 1).any():
                                 aps.append(average_precision_score(fold_df.class_label.tolist(), fold_df.pred_score.tolist()))
                         stats_dict['mean_auc'] = np.mean(aucs)
                         stats_dict['mean_ap'] = np.mean(aps)
 
                 # fill new results into monitor_metrics dict. for simplicity, only one class (of interest) is monitored on patient level.
                 if monitor_metrics is not None and not (score_level == 'patient' and cl != self.cf.patient_class_of_interest):
                     score_level_name = 'patient' if score_level == 'patient' else self.cf.class_dict[cl]
                     monitor_metrics[score_level_name + '_ap'].append(stats_dict['ap'] if stats_dict['ap'] > 0 else np.nan)
                     if score_level == 'patient':
                         monitor_metrics[score_level_name + '_auc'].append(
                             stats_dict['auc'] if stats_dict['auc'] > 0 else np.nan)
 
                 if self.cf.plot_prediction_histograms:
                     out_filename = os.path.join(
                         self.cf.plot_dir, 'pred_hist_{}_{}_{}_cl{}'.format(
                             self.cf.fold, 'val' if 'val' in self.mode else self.mode, score_level, cl))
                     type_list = None if score_level == 'patient' else spec_df.det_type.tolist()
                     plotting.plot_prediction_hist(spec_df.class_label.tolist(), spec_df.pred_score.tolist(), type_list, out_filename)
 
                 all_stats.append(stats_dict)
 
                 # analysis of the  hyper-parameter cf.min_det_thresh, for optimization on validation set.
                 if self.cf.scan_det_thresh:
                     conf_threshs = list(np.arange(0.9, 1, 0.01))
                     pool = Pool(processes=10)
                     mp_inputs = [[spec_df, ii, self.cf.per_patient_ap] for ii in conf_threshs]
                     aps = pool.map(get_roi_ap_from_df, mp_inputs, chunksize=1)
                     pool.close()
                     pool.join()
                     self.logger.info('results from scanning over det_threshs:', [[i, j] for i, j in zip(conf_threshs, aps)])
 
         if self.cf.plot_stat_curves:
             out_filename = os.path.join(self.cf.plot_dir, '{}_{}_stat_curves'.format(self.cf.fold, self.mode))
             plotting.plot_stat_curves(all_stats, out_filename)
 
 
         # get average stats over foreground classes on roi level.
         avg_ap = np.mean([d['ap'] for d in all_stats if 'rois' in d['name']])
         all_stats.append({'name': 'average_foreground_roi', 'auc': 0, 'ap': avg_ap})
         if len(df.fold.unique()) > 1:
             avg_mean_ap = np.mean([d['mean_ap'] for d in all_stats if 'rois' in d['name']])
             all_stats[-1]['mean_ap'] = avg_mean_ap
             all_stats[-1]['mean_auc'] = 0
 
         # in small data sets, values of model_selection_criterion can be identical across epochs, wich breaks the
         # ranking of model_selector. Thus, pertube identical values by a neglectibale random term.
         for sc in self.cf.model_selection_criteria:
             if 'val' in self.mode and monitor_metrics[sc].count(monitor_metrics[sc][-1]) > 1 and monitor_metrics[sc][-1] is not None:
                 monitor_metrics[sc][-1] += 1e-6 * np.random.rand()
 
         return all_stats, monitor_metrics
 
 
     def score_test_df(self, internal_df=True):
         """
         Writes out resulting scores to text files: First checks for class-internal-df (typically current) fold,
         gets resulting scores, writes them to a text file and pickles data frame. Also checks if data-frame pickles of
         all folds of cross-validation exist in exp_dir. If true, loads all dataframes, aggregates test sets over folds,
         and calculates and writes out overall metrics.
         """
         if internal_df:
 
             self.test_df.to_pickle(os.path.join(self.cf.exp_dir, '{}_test_df.pickle'.format(self.cf.fold)))
             stats, _ = self.return_metrics()
 
             with open(os.path.join(self.cf.exp_dir, 'results.txt'), 'a') as handle:
                 handle.write('\n****************************\n')
                 handle.write('\nresults for fold {} \n'.format(self.cf.fold))
                 handle.write('\n****************************\n')
                 handle.write('\nfold df shape {}\n  \n'.format(self.test_df.shape))
                 for s in stats:
                     handle.write('AUC {:0.4f}  AP {:0.4f} {} \n'.format(s['auc'], s['ap'], s['name']))
 
         fold_df_paths = [ii for ii in os.listdir(self.cf.exp_dir) if 'test_df.pickle' in ii]
         if len(fold_df_paths) == self.cf.n_cv_splits:
             with open(os.path.join(self.cf.exp_dir, 'results.txt'), 'a') as handle:
                 self.cf.fold = 'overall'
                 dfs_list = [pd.read_pickle(os.path.join(self.cf.exp_dir, ii)) for ii in fold_df_paths]
                 for ix, df in enumerate(dfs_list):
                     df['fold'] = ix
                 self.test_df = pd.concat(dfs_list)
                 stats, _ = self.return_metrics()
                 handle.write('\n****************************\n')
                 handle.write('\nOVERALL RESULTS \n')
                 handle.write('\n****************************\n')
                 handle.write('\ndf shape \n  \n'.format(self.test_df.shape))
                 for s in stats:
                     handle.write('\nAUC {:0.4f} (mu {:0.4f})  AP {:0.4f} (mu {:0.4f})  {}\n '
                                  .format(s['auc'], s['mean_auc'], s['ap'], s['mean_ap'], s['name']))
                 results_table_path = os.path.join(("/").join(self.cf.exp_dir.split("/")[:-1]), 'results_table.txt')
                 with open(results_table_path, 'a') as handle2:
                     for s in stats:
                         handle2.write('\nAUC {:0.4f} (mu {:0.4f})  AP {:0.4f} (mu {:0.4f})  {} {}'
                                       .format(s['auc'], s['mean_auc'], s['ap'], s['mean_ap'], s['name'], self.cf.exp_dir.split('/')[-1]))
                     handle2.write('\n')
 
 
 
 def get_roi_ap_from_df(inputs):
     '''
     :param df: data frame.
     :param det_thresh: min_threshold for filtering out low confidence predictions.
     :param per_patient_ap: boolean flag. evaluate average precision per image and average over images,
     instead of computing one ap over data set.
     :return: average_precision (float)
     '''
     df, det_thresh, per_patient_ap = inputs
 
     if per_patient_ap:
         pids_list = df.pid.unique()
         aps = []
         for match_iou in df.match_iou.unique():
             iou_df = df[df.match_iou == match_iou]
             for pid in pids_list:
                 pid_df = iou_df[iou_df.pid == pid]
                 all_p = len(pid_df[pid_df.class_label == 1])
                 pid_df = pid_df[(pid_df.det_type == 'det_fp') | (pid_df.det_type == 'det_tp')].sort_values('pred_score', ascending=False)
                 pid_df = pid_df[pid_df.pred_score > det_thresh]
                 if (len(pid_df) ==0 and all_p == 0):
                    pass
                 elif (len(pid_df) > 0 and all_p == 0):
                     aps.append(0)
                 else:
                     aps.append(compute_roi_ap(pid_df, all_p))
         return np.mean(aps)
 
     else:
         aps = []
         for match_iou in df.match_iou.unique():
             iou_df = df[df.match_iou == match_iou]
             all_p = len(iou_df[iou_df.class_label == 1])
             iou_df = iou_df[(iou_df.det_type == 'det_fp') | (iou_df.det_type == 'det_tp')].sort_values('pred_score', ascending=False)
             iou_df = iou_df[iou_df.pred_score > det_thresh]
             if all_p > 0:
                 aps.append(compute_roi_ap(iou_df, all_p))
         return np.mean(aps)
 
 
 
 def compute_roi_ap(df, all_p):
     """
     adapted from: https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py
     :param df: dataframe containing class labels of predictions sorted in descending manner by their prediction score.
     :param all_p: number of all ground truth objects. (for denominator of recall.)
     :return:
     """
     tp = df.class_label.values
     fp = (tp == 0) * 1
     #recall thresholds, where precision will be measured
     R = np.linspace(.0, 1, 101, endpoint=True)
     tp_sum = np.cumsum(tp)
     fp_sum = np.cumsum(fp)
     nd = len(tp)
     rc = tp_sum / all_p
     pr = tp_sum / (fp_sum + tp_sum)
     # initialize precision array over recall steps.
     q = np.zeros((len(R),))
 
     # numpy is slow without cython optimization for accessing elements
     # use python array gets significant speed improvement
     pr = pr.tolist()
     q = q.tolist()
     for i in range(nd - 1, 0, -1):
         if pr[i] > pr[i - 1]:
             pr[i - 1] = pr[i]
 
     #discretize empiric recall steps with given bins.
     inds = np.searchsorted(rc, R, side='left')
     try:
         for ri, pi in enumerate(inds):
             q[ri] = pr[pi]
     except:
         pass
 
     return np.mean(q)
\ No newline at end of file
diff --git a/exec.py b/exec.py
index 932f9ff..2eb4337 100644
--- a/exec.py
+++ b/exec.py
@@ -1,254 +1,264 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 """execution script."""
 
 import argparse
-import os
+import os, warnings
 import time
+
 import torch
 
 import utils.exp_utils as utils
 from evaluator import Evaluator
 from predictor import Predictor
 from plotting import plot_batch_prediction
 
+for msg in ["Attempting to set identical bottom==top results",
+            "This figure includes Axes that are not compatible with tight_layout",
+            "Data has no positive values, and therefore cannot be log-scaled.",
+            ".*invalid value encountered in double_scalars.*",
+            ".*Mean of empty slice.*"]:
+    warnings.filterwarnings("ignore", msg)
+
 
 def train(logger):
     """
     perform the training routine for a given fold. saves plots and selected parameters to the experiment dir
     specified in the configs.
     """
     logger.info('performing training in {}D over fold {} on experiment {} with model {}'.format(
         cf.dim, cf.fold, cf.exp_dir, cf.model))
 
     net = model.net(cf, logger).cuda()
     optimizer = torch.optim.Adam(net.parameters(), lr=cf.learning_rate[0], weight_decay=cf.weight_decay)
     if cf.dynamic_lr_scheduling:
         scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode=cf.scheduling_mode, factor=cf.lr_decay_factor,
                                                                patience=cf.scheduling_patience)
 
     model_selector = utils.ModelSelector(cf, logger)
     train_evaluator = Evaluator(cf, logger, mode='train')
     val_evaluator = Evaluator(cf, logger, mode=cf.val_mode)
 
     starting_epoch = 1
 
     # prepare monitoring
     monitor_metrics = utils.prepare_monitoring(cf)
 
     if cf.resume_to_checkpoint:
         starting_epoch, monitor_metrics = utils.load_checkpoint(cf.resume_to_checkpoint, net, optimizer)
         logger.info('resumed to checkpoint {} at epoch {}'.format(cf.resume_to_checkpoint, starting_epoch))
 
     logger.info('loading dataset and initializing batch generators...')
     batch_gen = data_loader.get_train_generators(cf, logger)
 
     for epoch in range(starting_epoch, cf.num_epochs + 1):
 
         logger.info('starting training epoch {}'.format(epoch))
         start_time = time.time()
 
         net.train()
         train_results_list = []
 
         for bix in range(cf.num_train_batches):
             batch = next(batch_gen['train'])
             tic_fw = time.time()
             results_dict = net.train_forward(batch)
             tic_bw = time.time()
             optimizer.zero_grad()
             results_dict['torch_loss'].backward()
             optimizer.step()
             logger.info('tr. batch {0}/{1} (ep. {2}) fw {3:.2f}s / bw {4:.2f} s / total {5:.2f} s || '
                         .format(bix + 1, cf.num_train_batches, epoch, tic_bw - tic_fw,
                                 time.time() - tic_bw, time.time() - tic_fw) + results_dict['logger_string'])
-            train_results_list.append([results_dict['boxes'], batch['pid']])
+            #train_results_list.append([results_dict['boxes'], batch['pid']])
+            train_results_list.append(({k:v for k,v in results_dict.items() if k != "seg_preds"}, batch["pid"]))
 
         _, monitor_metrics['train'] = train_evaluator.evaluate_predictions(train_results_list, monitor_metrics['train'])
-        #import IPython; IPython.embed()
+
         train_time = time.time() - start_time
 
         logger.info('starting validation in mode {}.'.format(cf.val_mode))
         with torch.no_grad():
             net.eval()
             if cf.do_validation:
                 val_results_list = []
                 val_predictor = Predictor(cf, net, logger, mode='val')
                 for _ in range(batch_gen['n_val']):
                     batch = next(batch_gen[cf.val_mode])
                     if cf.val_mode == 'val_patient':
                         results_dict = val_predictor.predict_patient(batch)
                     elif cf.val_mode == 'val_sampling':
                         results_dict = net.train_forward(batch, is_validation=True)
-                    val_results_list.append([results_dict['boxes'], batch['pid']])
+                    #val_results_list.append([results_dict['boxes'], batch['pid']])
+                    val_results_list.append(({k:v for k,v in results_dict.items() if k != "seg_preds"}, batch["pid"]))
 
                 _, monitor_metrics['val'] = val_evaluator.evaluate_predictions(val_results_list, monitor_metrics['val'])
                 model_selector.run_model_selection(net, optimizer, monitor_metrics, epoch)
 
             # update monitoring and prediction plots
             monitor_metrics.update({"lr":
                                         {str(g): group['lr'] for (g, group) in enumerate(optimizer.param_groups)}})
             logger.metrics2tboard(monitor_metrics, global_step=epoch)
 
             epoch_time = time.time() - start_time
             logger.info('trained epoch {}: took {:.2f} s ({:.2f} s train / {:.2f} s val)'.format(
                 epoch, epoch_time, train_time, epoch_time-train_time))
             batch = next(batch_gen['val_sampling'])
             results_dict = net.train_forward(batch, is_validation=True)
             logger.info('plotting predictions from validation sampling.')
             plot_batch_prediction(batch, results_dict, cf)
 
         # -------------- scheduling -----------------
         if cf.dynamic_lr_scheduling:
             scheduler.step(monitor_metrics["val"][cf.scheduling_criterion][-1])
         else:
             for param_group in optimizer.param_groups:
                 param_group['lr'] = cf.learning_rate[epoch-1]
 
 def test(logger):
     """
     run prediction and evaluation on the test data of the current fold (or hold out set).
     the resulting stats are kept and scored by the evaluator.
     uses the module-level names cf, model and data_loader, which are set by the main script.
     :param logger: logger, handed on to the network, the predictor, the evaluator and the test generator.
     """
     start_msg = 'starting testing model of fold {} in exp {}'.format(cf.fold, cf.exp_dir)
     logger.info(start_msg)
     # set up network (on gpu), predictor, evaluator and data generator.
     network = model.net(cf, logger)
     network = network.cuda()
     predictor = Predictor(cf, network, logger, mode='test')
     evaluator = Evaluator(cf, logger, mode='test')
     test_generator = data_loader.get_test_generator(cf, logger)
     # predict the test set and pass the returned results on to the evaluator.
     predictions = predictor.predict_test_set(test_generator, return_results=True)
     evaluator.evaluate_predictions(predictions)
     evaluator.score_test_df()
 
 
 if __name__ == '__main__':
     stime = time.time()
 
     parser = argparse.ArgumentParser()
     parser.add_argument('-m', '--mode', type=str,  default='train_test',
                         help='one out of: train / test / train_test / analysis / create_exp')
     parser.add_argument('-f','--folds', nargs='+', type=int, default=None,
                         help='None runs over all folds in CV. otherwise specify list of folds.')
     parser.add_argument('--exp_dir', type=str, default='/path/to/experiment/directory',
                         help='path to experiment dir. will be created if non existent.')
     parser.add_argument('--server_env', default=False, action='store_true',
                         help='change IO settings to deploy models on a cluster.')
     parser.add_argument('--data_dest', type=str, default=None, help="path to final data folder if different from config.")
     parser.add_argument('--use_stored_settings', default=False, action='store_true',
                         help='load configs from existing exp_dir instead of source dir. always done for testing, '
                              'but can be set to true to do the same for training. useful in job scheduler environment, '
                              'where source code might change before the job actually runs.')
     parser.add_argument('--resume_to_checkpoint', type=str, default=None,
                         help='if resuming to checkpoint, the desired fold still needs to be parsed via --folds.')
     parser.add_argument('--exp_source', type=str, default='experiments/toy_exp',
                         help='specifies, from which source experiment to load configs and data_loader.')
     parser.add_argument('-d', '--dev', default=False, action='store_true', help="development mode: shorten everything")
 
     args = parser.parse_args()
     folds = args.folds
     resume_to_checkpoint = args.resume_to_checkpoint
 
     if args.mode == 'train' or args.mode == 'train_test':
 
         cf = utils.prep_exp(args.exp_source, args.exp_dir, args.server_env, args.use_stored_settings)
         if args.dev:
             folds = [0,1]
             cf.batch_size, cf.num_epochs, cf.min_save_thresh, cf.save_n_models = 3 if cf.dim==2 else 1, 1, 0, 1
             cf.num_train_batches, cf.num_val_batches, cf.max_val_patients = 5, 1, 1
             cf.test_n_epochs =  cf.save_n_models
             cf.max_test_patients = 1
 
         cf.data_dest = args.data_dest
         logger = utils.get_logger(cf.exp_dir, cf.server_env)
         data_loader = utils.import_module('dl', os.path.join(args.exp_source, 'data_loader.py'))
         model = utils.import_module('model', cf.model_path)
         logger.info("loaded model from {}".format(cf.model_path))
         if folds is None:
             folds = range(cf.n_cv_splits)
 
         for fold in folds:
             cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(fold))
             cf.fold = fold
             cf.resume_to_checkpoint = resume_to_checkpoint
             if not os.path.exists(cf.fold_dir):
                 os.mkdir(cf.fold_dir)
             logger.set_logfile(fold=fold)
             train(logger)
             cf.resume_to_checkpoint = None
             if args.mode == 'train_test':
                 test(logger)
 
     elif args.mode == 'test':
 
         cf = utils.prep_exp(args.exp_source, args.exp_dir, args.server_env, is_training=False, use_stored_settings=True)
         if args.dev:
             folds = [0,1]
             cf.test_n_epochs =  1; cf.max_test_patients = 1
 
         cf.data_dest = args.data_dest
         logger = utils.get_logger(cf.exp_dir, cf.server_env)
         data_loader = utils.import_module('dl', os.path.join(args.exp_source, 'data_loader.py'))
         model = utils.import_module('model', cf.model_path)
         logger.info("loaded model from {}".format(cf.model_path))
         if folds is None:
             folds = range(cf.n_cv_splits)
 
         for fold in folds:
             cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(fold))
             cf.fold = fold
             logger.set_logfile(fold=fold)
             test(logger)
 
 
     # load raw predictions saved by predictor during testing, run aggregation algorithms and evaluation.
     elif args.mode == 'analysis':
         cf = utils.prep_exp(args.exp_source, args.exp_dir, args.server_env, is_training=False, use_stored_settings=True)
         logger = utils.get_logger(cf.exp_dir, cf.server_env)
 
         if cf.hold_out_test_set:
             cf.folds = args.folds
             predictor = Predictor(cf, net=None, logger=logger, mode='analysis')
             results_list = predictor.load_saved_predictions(apply_wbc=True)
-            utils.create_csv_output(results_list, cf, logger)
+            utils.create_csv_output([(res_dict["boxes"], pid) for res_dict, pid in results_list], cf, logger)
 
         else:
             if folds is None:
                 folds = range(cf.n_cv_splits)
             for fold in folds:
                 cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(fold))
                 cf.fold = fold
                 logger.set_logfile(fold=fold)
                 predictor = Predictor(cf, net=None, logger=logger, mode='analysis')
                 results_list = predictor.load_saved_predictions(apply_wbc=True)
                 logger.info('starting evaluation...')
                 evaluator = Evaluator(cf, logger, mode='test')
                 evaluator.evaluate_predictions(results_list)
                 evaluator.score_test_df()
 
     # create experiment folder and copy scripts without starting job.
     # useful for cloud deployment where configs might change before job actually runs.
     elif args.mode == 'create_exp':
         cf = utils.prep_exp(args.exp_source, args.exp_dir, args.server_env, use_stored_settings=False)
         logger = utils.get_logger(cf.exp_dir)
         logger.info('created experiment directory at {}'.format(cf.exp_dir))
 
     else:
         raise RuntimeError('mode specified in args is not implemented...')
 
     mins, secs = divmod((time.time() - stime), 60)
     h, mins = divmod(mins, 60)
     t = "{:d}h:{:02d}m:{:02d}s".format(int(h), int(mins), int(secs))
     logger.info("{} total runtime: {}".format(os.path.split(__file__)[1], t))
     del logger
\ No newline at end of file
diff --git a/experiments/toy_exp/configs.py b/experiments/toy_exp/configs.py
index 6d4774a..f37f262 100644
--- a/experiments/toy_exp/configs.py
+++ b/experiments/toy_exp/configs.py
@@ -1,350 +1,350 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 import sys
 import os
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
 import numpy as np
 from default_configs import DefaultConfigs
 
 class configs(DefaultConfigs):
 
     def __init__(self, server_env=None):
 
         #########################
         #    Preprocessing      #
         #########################
 
         self.root_dir = '/home/gregor/datasets/toy_mdt'
 
         #########################
         #         I/O           #
         #########################
 
 
         # one out of [2, 3]. dimension the model operates in.
         self.dim = 2
 
         # one out of ['mrcnn', 'retina_net', 'retina_unet', 'detection_unet', 'ufrcnn'].
-        self.model = 'detection_unet'
+        self.model = 'retina_net'
 
         DefaultConfigs.__init__(self, self.model, server_env, self.dim)
 
         # int [0 < dataset_size]. select n patients from dataset for prototyping.
         self.select_prototype_subset = None
         self.hold_out_test_set = True
         self.n_train_data = 2500
 
         # choose one of the 3 toy experiments described in https://arxiv.org/pdf/1811.08661.pdf
         # one of ['donuts_shape', 'donuts_pattern', 'circles_scale'].
         toy_mode = 'donuts_shape_noise'
 
         # path to preprocessed data.
         self.input_df_name = 'info_df.pickle'
         self.pp_name = os.path.join(toy_mode, 'train')
         self.pp_data_path = os.path.join(self.root_dir, self.pp_name)
         self.pp_test_name = os.path.join(toy_mode, 'test')
         self.pp_test_data_path = os.path.join(self.root_dir, self.pp_test_name)
 
         # settings for deployment in cloud.
         if server_env:
             # path to preprocessed data.
             pp_root_dir = '/datasets/datasets_ramien/toy_exp/data'
             self.pp_name = os.path.join(toy_mode, 'train')
             self.pp_data_path = os.path.join(pp_root_dir, self.pp_name)
             self.pp_test_name = os.path.join(toy_mode, 'test')
             self.pp_test_data_path = os.path.join(pp_root_dir, self.pp_test_name)
             self.select_prototype_subset = None
 
         #########################
         #      Data Loader      #
         #########################
 
         # select modalities from preprocessed data
         self.channels = [0]
         self.n_channels = len(self.channels)
 
         # patch_size to be used for training. pre_crop_size is the patch_size before data augmentation.
         self.pre_crop_size_2D = [320, 320]
         self.patch_size_2D = [320, 320]
 
         self.patch_size = self.patch_size_2D if self.dim == 2 else self.patch_size_3D
         self.pre_crop_size = self.pre_crop_size_2D if self.dim == 2 else self.pre_crop_size_3D
 
         # ratio of free sampled batch elements before class balancing is triggered
         # (>0 to include "empty"/background patches.)
         self.batch_sample_slack = 0.2
 
         # set 2D network to operate in 3D images.
         self.merge_2D_to_3D_preds = False
 
         # feed +/- n neighbouring slices into channel dimension. set to None for no context.
         self.n_3D_context = None
         if self.n_3D_context is not None and self.dim == 2:
             self.n_channels *= (self.n_3D_context * 2 + 1)
 
 
         #########################
         #      Architecture      #
         #########################
 
         self.start_filts = 48 if self.dim == 2 else 18
         self.end_filts = self.start_filts * 4 if self.dim == 2 else self.start_filts * 2
         self.res_architecture = 'resnet50' # 'resnet101' , 'resnet50'
         self.norm = None # one of None, 'instance_norm', 'batch_norm'
         self.weight_decay = 0
 
         # one of 'xavier_uniform', 'xavier_normal', or 'kaiming_normal', None (=default = 'kaiming_uniform')
         self.weight_init = None
 
         #########################
         #  Schedule / Selection #
         #########################
 
-        self.num_epochs = 24
+        self.num_epochs = 22
         self.num_train_batches = 100 if self.dim == 2 else 200
         self.batch_size = 20 if self.dim == 2 else 8
 
         self.do_validation = True
         # decide whether to validate on entire patient volumes (like testing) or sampled patches (like training)
         # the former is more accurate, while the latter is faster (depending on volume size)
         self.val_mode = 'val_patient' # one of 'val_sampling' , 'val_patient'
         if self.val_mode == 'val_patient':
             self.max_val_patients = None  # if 'None' iterates over entire val_set once.
         if self.val_mode == 'val_sampling':
             self.num_val_batches = 50
 
         # set dynamic_lr_scheduling to True to apply LR scheduling with below settings.
         self.dynamic_lr_scheduling = True
         self.lr_decay_factor = 0.5
         self.scheduling_patience = int(self.num_train_batches * self.batch_size / 2400)
         self.scheduling_criterion = 'malignant_ap'
         self.scheduling_mode = 'min' if "loss" in self.scheduling_criterion else 'max'
 
         #########################
         #   Testing / Plotting  #
         #########################
 
         # set the top-n-epochs to be saved for temporal averaging in testing.
         self.save_n_models = 5
         self.test_n_epochs = 5
 
         # set a minimum epoch number for saving in case of instabilities in the first phase of training.
         self.min_save_thresh = 0 if self.dim == 2 else 0
 
         self.report_score_level = ['patient', 'rois']  # choose list from 'patient', 'rois'
         self.class_dict = {1: 'benign', 2: 'malignant'}  # 0 is background.
         self.patient_class_of_interest = 2  # patient metrics are only plotted for one class.
         self.ap_match_ious = [0.1]  # list of ious to be evaluated for ap-scoring.
 
         self.model_selection_criteria = ['benign_ap', 'malignant_ap'] # criteria to average over for saving epochs.
         self.min_det_thresh = 0.1  # minimum confidence value to select predictions for evaluation.
 
         # threshold for clustering predictions together (wcs = weighted cluster scoring).
         # needs to be >= the expected overlap of predictions coming from one model (typically NMS threshold).
         # if too high, preds of the same object are separate clusters.
         self.wcs_iou = 1e-5
 
         self.plot_prediction_histograms = True
         self.plot_stat_curves = False
 
         #########################
         #   Data Augmentation   #
         #########################
 
         self.da_kwargs={
         'do_elastic_deform': True,
         'alpha':(0., 1500.),
         'sigma':(30., 50.),
         'do_rotation':True,
         'angle_x': (0., 2 * np.pi),
         'angle_y': (0., 0),
         'angle_z': (0., 0),
         'do_scale': True,
         'scale':(0.8, 1.1),
         'random_crop':False,
         'rand_crop_dist':  (self.patch_size[0] / 2. - 3, self.patch_size[1] / 2. - 3),
         'border_mode_data': 'constant',
         'border_cval_data': 0,
         'order_data': 1
         }
 
         if self.dim == 3:
             self.da_kwargs['do_elastic_deform'] = False
             self.da_kwargs['angle_x'] = (0, 0.0)
             self.da_kwargs['angle_y'] = (0, 0.0) #must be 0!!
             self.da_kwargs['angle_z'] = (0., 2 * np.pi)
 
 
         #########################
         #   Add model specifics #
         #########################
 
         {'detection_unet': self.add_det_unet_configs,
          'mrcnn': self.add_mrcnn_configs,
          'ufrcnn': self.add_mrcnn_configs,
          'ufrcnn_surrounding': self.add_mrcnn_configs,
          'retina_net': self.add_mrcnn_configs,
          'retina_unet': self.add_mrcnn_configs,
          'prob_detector': self.add_mrcnn_configs,
         }[self.model]()
 
 
     def add_det_unet_configs(self):
         """
         add the configs which are specific to the 'detection_unet' model.
         reads self.num_epochs, self.dim and self.min_det_thresh, which need to be set before the call.
         """
         is_2d = self.dim == 2
         # learning rate is a list with one entry per epoch.
         self.learning_rate = [1e-4 for _ in range(self.num_epochs)]
         # aggregation from pixel prediction to object scores (connected component). One of ['max', 'median']
         self.aggregation_operation = 'max'
         # max number of roi candidates to identify per image (slice in 2D, volume in 3D)
         if is_2d:
             self.n_roi_candidates = 3
         else:
             self.n_roi_candidates = 8
         # loss mode: either weighted cross entropy ('wce'), batch-wise dice loss ('dice'), or the sum of both ('dice_wce')
         self.seg_loss_mode = 'dice_wce'
         # if <1, false positive predictions in foreground are penalized less. currently the same in 2D and 3D.
         self.fp_dice_weight = 1
         self.wce_weights = [1, 1, 1]
         self.detection_min_confidence = self.min_det_thresh
         # if 'True', loss distinguishes all classes, else only foreground vs. background (class agnostic).
         self.class_specific_seg_flag = True
         if self.class_specific_seg_flag:
             self.num_seg_classes = 3
         else:
             self.num_seg_classes = 2
         # the head predicts as many classes as the segmentation.
         self.head_classes = self.num_seg_classes
 
     def add_mrcnn_configs(self):
         """
         add the configs shared by the anchor based models. the dispatch in __init__ calls this method for
         'mrcnn', 'ufrcnn', 'ufrcnn_surrounding', 'retina_net', 'retina_unet' and 'prob_detector'.
         reads self.num_epochs, self.dim, self.patch_size and self.model, which need to be set before the call.
         general defaults are set first, model specific overrides follow at the end, so the order of
         the statements matters.
         """
 
         # learning rate is a list with one entry per epoch.
         self.learning_rate = [1e-4] * self.num_epochs
 
         # disable mask head loss. (e.g. if no pixelwise annotations available)
         self.frcnn_mode = False
 
         # disable the re-sampling of mask proposals to original size for speed-up.
         # since evaluation is detection-driven (box-matching) and not instance segmentation-driven (iou-matching),
         # mask-outputs are optional.
         self.return_masks_in_val = True
         self.return_masks_in_test = False
 
         # set number of proposal boxes to plot after each epoch. (currently the same value in 2D and 3D.)
         self.n_plot_rpn_props = 0 if self.dim == 2 else 0
 
         # number of classes for head networks: n_foreground_classes + 1 (background)
         self.head_classes = 3
 
         # seg_classes here refers to the first stage classifier (RPN)
         self.num_seg_classes = 2  # foreground vs. background
 
         # feature map strides per pyramid level are inferred from architecture.
         self.backbone_strides = {'xy': [4, 8, 16, 32], 'z': [1, 2, 4, 8]}
 
         # anchor scales are chosen according to expected object sizes in data set. Default uses only one anchor scale
         # per pyramid level. (outer list are pyramid levels (corresponding to BACKBONE_STRIDES), inner list are scales per level.)
         self.rpn_anchor_scales = {'xy': [[8], [16], [32], [64]], 'z': [[2], [4], [8], [16]]}
 
         # choose which pyramid levels to extract features from: P2: 0, P3: 1, P4: 2, P5: 3.
         self.pyramid_levels = [0, 1, 2, 3]
 
         # number of feature maps in rpn. typically lowered in 3D to save gpu-memory.
         self.n_rpn_features = 512 if self.dim == 2 else 128
 
         # anchor ratios and strides per position in feature maps.
         self.rpn_anchor_ratios = [0.5, 1., 2.]
         self.rpn_anchor_stride = 1
 
         # Threshold for first stage (RPN) non-maximum suppression (NMS):  LOWER == HARDER SELECTION
         # (currently the same value in 2D and 3D.)
         self.rpn_nms_threshold = 0.7 if self.dim == 2 else 0.7
 
         # loss sampling settings.
         self.rpn_train_anchors_per_image = 2  #per batch element
         self.train_rois_per_image = 2 #per batch element
         self.roi_positive_ratio = 0.5
         self.anchor_matching_iou = 0.7
 
         # factor of top-k candidates to draw from  per negative sample (stochastic-hard-example-mining).
         # poolsize to draw top-k candidates from will be shem_poolsize * n_negative_samples.
         self.shem_poolsize = 10
 
         self.pool_size = (7, 7) if self.dim == 2 else (7, 7, 3)
         self.mask_pool_size = (14, 14) if self.dim == 2 else (14, 14, 5)
         self.mask_shape = (28, 28) if self.dim == 2 else (28, 28, 10)
 
         # the std devs are defined with six entries and cut to the first four in 2D below.
         self.rpn_bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2])
         self.bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2])
         # window and scale are built from the first two patch_size entries only (four entries each).
         self.window = np.array([0, 0, self.patch_size[0], self.patch_size[1]])
         self.scale = np.array([self.patch_size[0], self.patch_size[1], self.patch_size[0], self.patch_size[1]])
 
         if self.dim == 2:
             self.rpn_bbox_std_dev = self.rpn_bbox_std_dev[:4]
             self.bbox_std_dev = self.bbox_std_dev[:4]
             # slicing window and scale does not change them, since both already have four entries.
             self.window = self.window[:4]
             self.scale = self.scale[:4]
 
         # pre-selection in proposal-layer (stage 1) for NMS-speedup. applied per batch element.
         self.pre_nms_limit = 3000 if self.dim == 2 else 6000
 
         # n_proposals to be selected after NMS per batch element. too high numbers blow up memory if "detect_while_training" is True,
         # since proposals of the entire batch are forwarded through second stage in as one "batch".
         self.roi_chunk_size = 800 if self.dim == 2 else 600
         self.post_nms_rois_training = 500 if self.dim == 2 else 75
         self.post_nms_rois_inference = 500
 
         # Final selection of detections (refine_detections)
         self.model_max_instances_per_batch_element = 10 if self.dim == 2 else 30  # per batch element and class.
         self.detection_nms_threshold = 1e-5  # needs to be > 0, otherwise all predictions are one cluster.
         self.model_min_confidence = 0.1
 
         # feature map shape per pyramid level: patch size divided by the level's stride, rounded up.
         if self.dim == 2:
             self.backbone_shapes = np.array(
                 [[int(np.ceil(self.patch_size[0] / stride)),
                   int(np.ceil(self.patch_size[1] / stride))]
                  for stride in self.backbone_strides['xy']])
         else:
             self.backbone_shapes = np.array(
                 [[int(np.ceil(self.patch_size[0] / stride)),
                   int(np.ceil(self.patch_size[1] / stride)),
                   int(np.ceil(self.patch_size[2] / stride_z))]
                  for stride, stride_z in zip(self.backbone_strides['xy'], self.backbone_strides['z']
                                              )])
         if self.model == 'ufrcnn':
             self.operate_stride1 = True
             self.class_specific_seg_flag = True
             self.num_seg_classes = 3 if self.class_specific_seg_flag else 2
             self.frcnn_mode = True
 
         if self.model == 'retina_net' or self.model == 'retina_unet' or self.model == 'prob_detector':
             # implement extra anchor-scales according to retina-net publication.
             # (each level gets its base scale plus the base scale times 2^(1/3) and 2^(2/3).)
             self.rpn_anchor_scales['xy'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in
                                             self.rpn_anchor_scales['xy']]
             self.rpn_anchor_scales['z'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in
                                            self.rpn_anchor_scales['z']]
             self.n_anchors_per_pos = len(self.rpn_anchor_ratios) * 3
 
             self.n_rpn_features = 256 if self.dim == 2 else 64
 
             # pre-selection of detections for NMS-speedup. per entire batch.
             self.pre_nms_limit = 10000 if self.dim == 2 else 50000
 
             # anchor matching iou is lower than in Mask R-CNN according to https://arxiv.org/abs/1708.02002
             self.anchor_matching_iou = 0.5
 
             # if 'True', seg loss distinguishes all classes, else only foreground vs. background (class agnostic).
             # NOTE(review): within this method class_specific_seg_flag is only assigned for 'ufrcnn'; for the
             # models of this branch it presumably comes from DefaultConfigs - confirm.
             self.num_seg_classes = 3 if self.class_specific_seg_flag else 2
 
             if self.model == 'retina_unet':
                 self.operate_stride1 = True
diff --git a/models/retina_net.py b/models/retina_net.py
index debff99..52435f5 100644
--- a/models/retina_net.py
+++ b/models/retina_net.py
@@ -1,504 +1,513 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 """
 Retina Net. According to https://arxiv.org/abs/1708.02002
 Retina U-Net. According to https://arxiv.org/abs/1811.08661
 """
 
 import utils.model_utils as mutils
 import utils.exp_utils as utils
 import sys
 
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.utils
 
 sys.path.append('..')
 from custom_extensions.nms import nms
 
 
 ############################################################
 #  Network Heads
 ############################################################
 
 class Classifier(nn.Module):
     """
     Classification head applied to one feature-pyramid level at a time.
     Consists of four intermediate conv layers and one output conv created with relu=None.
     """
 
     def __init__(self, cf, conv):
         """
         Builds the classifier sub-network.
         :param cf: config object. read here: head_classes, end_filts, n_rpn_features, n_anchors_per_pos,
         rpn_anchor_stride, relu.
         :param conv: conv-layer factory exposing a 'dim' attribute.
         """
         super(Classifier, self).__init__()
         self.dim = conv.dim
         self.n_classes = cf.head_classes
         width = cf.n_rpn_features
         stride = cf.rpn_anchor_stride
 
         # layers are created in forward order; attribute names define the parameter names of the module.
         self.conv_1 = conv(cf.end_filts, width, ks=3, stride=stride, pad=1, relu=cf.relu)
         self.conv_2 = conv(width, width, ks=3, stride=stride, pad=1, relu=cf.relu)
         self.conv_3 = conv(width, width, ks=3, stride=stride, pad=1, relu=cf.relu)
         self.conv_4 = conv(width, width, ks=3, stride=stride, pad=1, relu=cf.relu)
         self.conv_final = conv(width, cf.n_anchors_per_pos * cf.head_classes, ks=3, stride=stride, pad=1,
                                relu=None)
 
     def forward(self, x):
         """
         :param x: input feature map (b, in_c, y, x, (z))
         :return: list holding one tensor: class_logits (b, n_anchors, n_classes)
         """
         for layer in (self.conv_1, self.conv_2, self.conv_3, self.conv_4):
             x = layer(x)
         logits = self.conv_final(x)
 
         # move the channel axis to the end, so that the flat view groups the values of one anchor together.
         channels_last = (0, 2, 3, 1) if self.dim == 2 else (0, 2, 3, 4, 1)
         logits = logits.permute(*channels_last).contiguous()
 
         return [logits.view(x.size()[0], -1, self.n_classes)]
 
 
 
 class BBRegressor(nn.Module):
     """
     Box-regression head applied to one feature-pyramid level at a time.
     Consists of four intermediate conv layers and one output conv created with relu=None.
     """
 
     def __init__(self, cf, conv):
         """
         Builds the bb-regression sub-network.
         :param cf: config object. read here: end_filts, n_rpn_features, n_anchors_per_pos, rpn_anchor_stride, relu.
         :param conv: conv-layer factory exposing a 'dim' attribute.
         """
         super(BBRegressor, self).__init__()
         self.dim = conv.dim
         width = cf.n_rpn_features
         stride = cf.rpn_anchor_stride
         # one delta vector of length 2 * dim per anchor at each position.
         n_deltas = cf.n_anchors_per_pos * self.dim * 2
 
         self.conv_1 = conv(cf.end_filts, width, ks=3, stride=stride, pad=1, relu=cf.relu)
         self.conv_2 = conv(width, width, ks=3, stride=stride, pad=1, relu=cf.relu)
         self.conv_3 = conv(width, width, ks=3, stride=stride, pad=1, relu=cf.relu)
         self.conv_4 = conv(width, width, ks=3, stride=stride, pad=1, relu=cf.relu)
         self.conv_final = conv(width, n_deltas, ks=3, stride=stride, pad=1, relu=None)
 
     def forward(self, x):
         """
         :param x: input feature map (b, in_c, y, x, (z))
         :return: list holding one tensor: bb_logits (b, n_anchors, dim * 2)
         """
         for layer in (self.conv_1, self.conv_2, self.conv_3, self.conv_4):
             x = layer(x)
         deltas = self.conv_final(x)
 
         # move the channel axis to the end, so that the flat view groups the values of one anchor together.
         channels_last = (0, 2, 3, 1) if self.dim == 2 else (0, 2, 3, 4, 1)
         deltas = deltas.permute(*channels_last).contiguous()
 
         return [deltas.view(x.size()[0], -1, self.dim * 2)]
 
 
 ############################################################
 #  Loss Functions
 ############################################################
 
 def compute_class_loss(anchor_matches, class_pred_logits, shem_poolsize=20):
     """
     Classification loss of one batch element: mean of the cross-entropy over positive anchors and the
     cross-entropy over hard negatives selected by mutils.shem. Helper tensors are created on the device of
     class_pred_logits, so the function is not tied to a GPU being present.
     :param anchor_matches: (n_anchors). [-1, 0, class_id] for negative, neutral, and positive matched anchors.
     :param class_pred_logits: (n_anchors, n_classes). logits from classifier sub-network.
     :param shem_poolsize: int. factor of top-k candidates to draw from per negative sample (online-hard-example-mining).
     :return: loss: torch tensor.
     :return: np_neg_ix: 1D array containing indices of the neg_roi_logits, which have been sampled for training.
     """
     device = class_pred_logits.device
 
     # Positive and Negative anchors contribute to the loss,
     # but neutral anchors (match value = 0) don't.
     pos_indices = torch.nonzero(anchor_matches > 0)
     neg_indices = torch.nonzero(anchor_matches == -1)
 
     # get positive samples and calculate loss.
     if 0 not in pos_indices.size():
         pos_indices = pos_indices.squeeze(1)
         roi_logits_pos = class_pred_logits[pos_indices]
         targets_pos = anchor_matches[pos_indices]
         pos_loss = F.cross_entropy(roi_logits_pos, targets_pos.long())
     else:
         pos_loss = torch.zeros(1, dtype=torch.float32, device=device)
 
     # get negative samples, such that the amount matches the number of positive samples, but at least 1.
     # get high scoring negatives by applying online-hard-example-mining.
     if 0 not in neg_indices.size():
         neg_indices = neg_indices.squeeze(1)
         roi_logits_neg = class_pred_logits[neg_indices]
         # without positives, pos_indices keeps its (0, 1) shape, hence size()[0] is 0 and one negative is drawn.
         negative_count = np.max((1, pos_indices.size()[0]))
         roi_probs_neg = F.softmax(roi_logits_neg, dim=1)
         neg_ix = mutils.shem(roi_probs_neg, negative_count, shem_poolsize)
         # all sampled negatives are trained towards the background class (0).
         bg_targets = torch.zeros(neg_ix.shape[0], dtype=torch.long, device=device)
         neg_loss = F.cross_entropy(roi_logits_neg[neg_ix], bg_targets)
         # return the indices of negative samples, which contributed to the loss (for monitoring plots).
         np_neg_ix = neg_ix.cpu().data.numpy()
     else:
         neg_loss = torch.zeros(1, dtype=torch.float32, device=device)
         np_neg_ix = np.array([]).astype('int32')
 
     loss = (pos_loss + neg_loss) / 2
     return loss, np_neg_ix
 
 
 def compute_bbox_loss(target_deltas, pred_deltas, anchor_matches):
     """
     Smooth-L1 box regression loss over the positively matched anchors of one batch element.
     :param target_deltas: (n_positive_anchors, (dy, dx, (dz), log(dh), log(dw), (log(dd)))).
     Uses 0 padding to fill in unused bbox deltas. Only the first n_positive rows are read.
     :param pred_deltas: predicted deltas from bbox regression head. (n_anchors, (dy, dx, (dz), log(dh), log(dw), (log(dd))))
     :param anchor_matches: (n_anchors). [-1, 0, class_id] for negative, neutral, and positive matched anchors.
     :return: loss: torch tensor. zero tensor of shape (1,) on the device of pred_deltas if there is no positive anchor.
     """
     pos_indices = torch.nonzero(anchor_matches > 0)
     if 0 in pos_indices.size():
         # nothing to regress. target_deltas is not touched in this case.
         return torch.zeros(1, dtype=torch.float32, device=pred_deltas.device)
 
     pos_indices = pos_indices.squeeze(1)
     # Pick bbox deltas that contribute to the loss
     pred_deltas = pred_deltas[pos_indices]
     # Trim target bounding box deltas to the same length as pred_deltas.
     target_deltas = target_deltas[:pred_deltas.size()[0], :]
     # Smooth L1 loss
     return F.smooth_l1_loss(pred_deltas, target_deltas)
 
 
 ############################################################
 #  Output Handler
 ############################################################
 
 def refine_detections(anchors, probs, deltas, batch_ixs, cf):
     """
     Refine classified proposals, filter overlaps and return final
     detections. n_proposals here is typically a very large number: batch_size * n_anchors.
     This function is hence optimized on trimming down n_proposals.
     :param anchors: (n_anchors, 2 * dim)
     :param probs: (n_proposals, n_classes) softmax probabilities for all rois as predicted by classifier head.
     :param deltas: (n_proposals, 2 * dim) box refinement deltas as predicted by bbox regressor head.
     :param batch_ixs: (n_proposals) batch element assignment info for re-allocation.
     :param cf: config object. read here: pre_nms_limit, rpn_bbox_std_dev, dim, scale, window,
     detection_nms_threshold, model_max_instances_per_batch_element.
     :return: result: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score))
     """
     # helper tensors follow the device of the predictions instead of a hard-coded cuda device.
     device = probs.device
     anchors = anchors.repeat(batch_ixs.unique().shape[0], 1)
 
     # flatten foreground probabilities, sort and trim down to highest confidences by pre_nms limit.
     fg_probs = probs[:, 1:].contiguous()
     n_fg_classes = fg_probs.shape[1]
     flat_probs, flat_probs_order = fg_probs.view(-1).sort(descending=True)
     keep_ix = flat_probs_order[:cf.pre_nms_limit]
     # reshape indices to 2D index array with shape like fg_probs (column 0: proposal ix, column 1: fg class ix).
     # floor division is needed here: '/' on integer tensors is a true division in current pytorch releases
     # and returns float values, which cannot be used for indexing below.
     keep_arr = torch.cat(((keep_ix // n_fg_classes).unsqueeze(1), (keep_ix % n_fg_classes).unsqueeze(1)), 1)
 
     pre_nms_scores = flat_probs[:cf.pre_nms_limit]
     pre_nms_class_ids = keep_arr[:, 1] + 1  # add background again.
     pre_nms_batch_ixs = batch_ixs[keep_arr[:, 0]]
     pre_nms_anchors = anchors[keep_arr[:, 0]]
     pre_nms_deltas = deltas[keep_arr[:, 0]]
     keep = torch.arange(pre_nms_scores.size()[0], device=device).long()
 
     # apply bounding box deltas. re-scale to image coordinates.
     std_dev = torch.from_numpy(np.reshape(cf.rpn_bbox_std_dev, [1, cf.dim * 2])).float().to(device)
     scale = torch.from_numpy(cf.scale).float().to(device)
     refined_rois = mutils.apply_box_deltas_2D(pre_nms_anchors / scale, pre_nms_deltas * std_dev) * scale \
         if cf.dim == 2 else mutils.apply_box_deltas_3D(pre_nms_anchors / scale, pre_nms_deltas * std_dev) * scale
 
     # clip to the image window and round, since we're dealing with pixels now.
     refined_rois = mutils.clip_to_window(cf.window, refined_rois)
     pre_nms_rois = torch.round(refined_rois)
     for j, b in enumerate(mutils.unique1d(pre_nms_batch_ixs)):
 
         bixs = torch.nonzero(pre_nms_batch_ixs == b)[:, 0]
         bix_class_ids = pre_nms_class_ids[bixs]
         bix_rois = pre_nms_rois[bixs]
         bix_scores = pre_nms_scores[bixs]
 
         for i, class_id in enumerate(mutils.unique1d(bix_class_ids)):
 
             ixs = torch.nonzero(bix_class_ids == class_id)[:, 0]
             # nms expects boxes sorted by score.
             ix_rois = bix_rois[ixs]
             ix_scores = bix_scores[ixs]
             ix_scores, order = ix_scores.sort(descending=True)
             ix_rois = ix_rois[order, :]
 
             class_keep = nms.nms(ix_rois, ix_scores, cf.detection_nms_threshold)
 
             # map indices back: nms output -> score order -> class subset -> batch-element subset -> pre-nms list.
             class_keep = keep[bixs[ixs[order[class_keep]]]]
             # merge indices over classes for current batch element
             b_keep = class_keep if i == 0 else mutils.unique1d(torch.cat((b_keep, class_keep)))
 
         # only keep top-k boxes of current batch-element.
         top_ids = pre_nms_scores[b_keep].sort(descending=True)[1][:cf.model_max_instances_per_batch_element]
         b_keep = b_keep[top_ids]
         # merge indices over batch elements.
         batch_keep = b_keep if j == 0 else mutils.unique1d(torch.cat((batch_keep, b_keep)))
 
     keep = batch_keep
 
     # arrange output.
     result = torch.cat((pre_nms_rois[keep],
                         pre_nms_batch_ixs[keep].unsqueeze(1).float(),
                         pre_nms_class_ids[keep].unsqueeze(1).float(),
                         pre_nms_scores[keep].unsqueeze(1)), dim=1)
 
     return result
 
 
 
 def get_results(cf, img_shape, detections, seg_logits, box_results_list=None):
     """
     Restores batch dimension of merged detections, unmolds detections, creates and fills results dict.
     :param cf: config object. read here: dim, model_min_confidence.
     :param img_shape: shape of the input batch (b, c, y, x, (z)).
     :param detections: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score)
     :param seg_logits: None (retina_net) or segmentation logits with classes on axis 1 (retina_unet).
     :param box_results_list: None or list of output boxes for monitoring/plotting.
     each element is a list of boxes per batch element. detections are appended to the passed lists in place.
     :return: results_dict: dictionary with keys:
              'boxes': list over batch elements. each batch element is a list of boxes. each box is a dictionary:
                       [[{box_0}, ... {box_n}], [{box_0}, ... {box_n}], ...]
              'seg_preds': pixel-wise class predictions (b, 1, y, x, (z)) with values [0, ..., n_classes] for
                           retina_unet and dummy array for retina_net.
     """
     n_batch = img_shape[0]
     detections = detections.cpu().data.numpy()
     batch_ixs = detections[:, cf.dim * 2]
 
     # for test_forward, where no previous list exists.
     if box_results_list is None:
         box_results_list = [[] for _ in range(n_batch)]
 
     for ix in range(n_batch):
 
         b_dets = detections[batch_ixs == ix]
         if 0 in b_dets.shape:
             continue
 
         boxes = b_dets[:, :2 * cf.dim].astype(np.int32)
         class_ids = b_dets[:, 2 * cf.dim + 1].astype(np.int32)
         scores = b_dets[:, 2 * cf.dim + 2]
 
         # Filter out detections with zero area. Often only happens in early
         # stages of training when the network weights are still a bit random.
         extent = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
         if cf.dim != 2:
             extent = extent * (boxes[:, 5] - boxes[:, 4])
 
         # a detection is reported if it has a positive extent and reaches the minimum confidence.
         valid = (extent > 0) & (scores >= cf.model_min_confidence)
         for box, class_id, score in zip(boxes[valid], class_ids[valid], scores[valid]):
             box_results_list[ix].append({'box_coords': box,
                                          'box_score': score,
                                          'box_type': 'det',
                                          'box_pred_class_id': class_id})
 
     results_dict = {'boxes': box_results_list}
     if seg_logits is None:
         # output dummy segmentation for retina_net. allocated directly in its final single-channel shape
         # instead of creating a zero array of the full image shape and slicing one channel out of it.
         results_dict['seg_preds'] = np.zeros((n_batch, 1) + tuple(img_shape[2:]))
     else:
         # output label maps for retina_unet.
         results_dict['seg_preds'] = F.softmax(seg_logits, 1).argmax(1).cpu().data.numpy()[:, np.newaxis].astype('uint8')
 
     return results_dict
 
 
 ############################################################
 #  Retina (U-)Net Class
 ############################################################
 
 
 class net(nn.Module):
     """
     Retina Net detector: FPN backbone followed by a classification and a box-regression head, which are
     applied to every feature map selected via cf.pyramid_levels. forward() of this class always returns
     seg_logits = None.
     """
 
 
     def __init__(self, cf, logger):
         """
         :param cf: config object, kept as self.cf.
         :param logger: logger, kept as self.logger. reports which weight init is used.
         """
 
         super(net, self).__init__()
         self.cf = cf
         self.logger = logger
         self.build()
         if self.cf.weight_init is not None:
             logger.info("using pytorch weight init of type {}".format(self.cf.weight_init))
             mutils.initialize_weights(self)
         else:
             logger.info("using default pytorch weight init")
 
     def build(self):
         """
         Build Retina Net architecture.
         :raises Exception: if one of the first two patch dimensions is not dividable by 2 ** 5.
         """
 
         # Image size must be dividable by 2 multiple times.
         h, w = self.cf.patch_size[:2]
         if h / 2 ** 5 != int(h / 2 ** 5) or w / 2 ** 5 != int(w / 2 ** 5):
             raise Exception("Image size must be dividable by 2 at least 5 times "
                             "to avoid fractions when downscaling and upscaling."
                             "For example, use 256, 320, 384, 448, 512, ... etc. ")
 
         # instanciate abstract multi dimensional conv class and backbone model.
         conv = mutils.NDConvGenerator(self.cf.dim)
         backbone = utils.import_module('bbone', self.cf.backbone_path)
 
         # build Anchors, FPN, Classifier / Bbox-Regressor -head
         # anchors are kept twice: as numpy array (used for gt matching in train_forward) and as cuda tensor
         # (passed to refine_detections in forward).
         self.np_anchors = mutils.generate_pyramid_anchors(self.logger, self.cf)
         self.anchors = torch.from_numpy(self.np_anchors).float().cuda()
         self.Fpn = backbone.FPN(self.cf, conv, operate_stride1=self.cf.operate_stride1)
         self.Classifier = Classifier(self.cf, conv)
         self.BBRegressor = BBRegressor(self.cf, conv)
 
 
     def train_forward(self, batch, **kwargs):
         """
         train method (also used for validation monitoring). wrapper around forward pass of network. prepares input data
         for processing, computes losses, and stores outputs in a dictionary.
         :param batch: dictionary containing 'data', 'roi_labels', 'bb_target'.
         :return: results_dict: dictionary with keys:
                 'boxes': list over batch elements. each batch element is a list of boxes. each box is a dictionary:
                         [[{box_0}, ... {box_n}], [{box_0}, ... {box_n}], ...]
                 'seg_preds': pixelwise segmentation output (b, c, y, x, (z)) with values [0, .., n_classes].
                 'torch_loss': sum of the batch-averaged class and bbox loss (torch tensor).
                 'class_loss': batch-averaged class loss as float (this patch replaces the former
                               'monitor_values' dict by this key).
                 'logger_string': formatted summary of the loss values.
         """
         img = batch['data']
         gt_class_ids = batch['roi_labels']
         gt_boxes = batch['bb_target']
 
         img = torch.from_numpy(img).float().cuda()
         batch_class_loss = torch.FloatTensor([0]).cuda()
         batch_bbox_loss = torch.FloatTensor([0]).cuda()
 
         # list of output boxes for monitoring/plotting. each element is a list of boxes per batch element.
         box_results_list = [[] for _ in range(img.shape[0])]
         detections, class_logits, pred_deltas, seg_logits = self.forward(img)
 
+
+
         # loop over batch
         for b in range(img.shape[0]):
 
             # add gt boxes to results dict for monitoring.
             if len(gt_boxes[b]) > 0:
                 for ix in range(len(gt_boxes[b])):
                     box_results_list[b].append({'box_coords': batch['bb_target'][b][ix],
                                                 'box_label': batch['roi_labels'][b][ix], 'box_type': 'gt'})
 
                 # match gt boxes with anchors to generate targets.
                 anchor_class_match, anchor_target_deltas = mutils.gt_anchor_matching(
                     self.cf, self.np_anchors, gt_boxes[b], gt_class_ids[b])
 
                 # add positive anchors used for loss to results_dict for monitoring.
                 pos_anchors = mutils.clip_boxes_numpy(
                     self.np_anchors[np.argwhere(anchor_class_match > 0)][:, 0], img.shape[2:])
                 for p in pos_anchors:
                     box_results_list[b].append({'box_coords': p, 'box_type': 'pos_anchor'})
 
             else:
                 # no ground truth in this batch element: every anchor is a negative, deltas are a dummy.
                 anchor_class_match = np.array([-1]*self.np_anchors.shape[0])
                 anchor_target_deltas = np.array([0])
 
             anchor_class_match = torch.from_numpy(anchor_class_match).cuda()
             anchor_target_deltas = torch.from_numpy(anchor_target_deltas).float().cuda()
 
             # NOTE(review): the debug block added below prints one tensor per batch element on every call.
             # It squeezes dim 0 of the nonzero() result, whereas compute_class_loss squeezes dim 1 of the same
             # kind of result - looks unintended, confirm that the shapes passed to torch.cat match when the
             # number of positive anchors differs from 1. neg_indices is not read afterwards.
+            # todo debug print
+            pos_indices = torch.nonzero(anchor_class_match > 0).squeeze(0)
+            neg_indices = torch.nonzero(anchor_class_match == -1).squeeze(0)
+            softmax = F.softmax(class_logits[b][pos_indices].detach(), 1)
+            #ics = np.random.choice(range(softmax.shape[0]), size=min(softmax.shape[0], 6))
+            comb_view = torch.cat((anchor_class_match[pos_indices].detach().unsqueeze(1).float(), softmax), dim=1)
+            print(comb_view)
             # compute losses.
             class_loss, neg_anchor_ix = compute_class_loss(anchor_class_match, class_logits[b])
             bbox_loss = compute_bbox_loss(anchor_target_deltas, pred_deltas[b], anchor_class_match)
 
             # add negative anchors used for loss to results_dict for monitoring.
             neg_anchors = mutils.clip_boxes_numpy(
                 self.np_anchors[np.argwhere(anchor_class_match.cpu().numpy() == -1)][neg_anchor_ix, 0],
                 img.shape[2:])
             for n in neg_anchors:
                 box_results_list[b].append({'box_coords': n, 'box_type': 'neg_anchor'})
 
             # losses are averaged over the batch elements.
             batch_class_loss += class_loss / img.shape[0]
             batch_bbox_loss += bbox_loss / img.shape[0]
 
         results_dict = get_results(self.cf, img.shape, detections, seg_logits, box_results_list)
         loss = batch_class_loss + batch_bbox_loss
         results_dict['torch_loss'] = loss
-        results_dict['monitor_values'] = {'loss': loss.item(), 'class_loss': batch_class_loss.item()}
+        results_dict['class_loss'] = batch_class_loss.item()
         results_dict['logger_string'] = "loss: {0:.2f}, class: {1:.2f}, bbox: {2:.2f}"\
             .format(loss.item(), batch_class_loss.item(), batch_bbox_loss.item())
 
         return results_dict
 
 
     def test_forward(self, batch, **kwargs):
         """
         test method. wrapper around forward pass of network without usage of any ground truth information.
         prepares input data for processing and stores outputs in a dictionary.
         :param batch: dictionary containing 'data'
         :return: results_dict: dictionary with keys:
                'boxes': list over batch elements. each batch element is a list of boxes. each box is a dictionary:
                        [[{box_0}, ... {box_n}], [{box_0}, ... {box_n}], ...]
                'seg_preds': pixel-wise class predictions (b, 1, y, x, (z)) with values [0, ..., n_classes] for
                             retina_unet and dummy array for retina_net.
         """
         img = batch['data']
         img = torch.from_numpy(img).float().cuda()
         detections, _, _, seg_logits = self.forward(img)
         results_dict = get_results(self.cf, img.shape, detections, seg_logits)
         return results_dict
 
 
     def forward(self, img):
         """
         forward pass of the model.
         :param img: input img (b, c, y, x, (z)).
         :return: detections: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score)
         :return: class_logits: (b, n_anchors, n_classes) classifier outputs, concatenated over pyramid levels.
         :return: bb_outputs: (b, n_anchors, dim * 2) box regressor outputs, concatenated over pyramid levels.
         :return: seg_logits: always None in this class.
         """
         # Feature extraction
         fpn_outs = self.Fpn(img)
         seg_logits = None
         selected_fmaps = [fpn_outs[i] for i in self.cf.pyramid_levels]
 
         # Loop through pyramid layers
         class_layer_outputs, bb_reg_layer_outputs = [], []  # list of lists
         for p in selected_fmaps:
             class_layer_outputs.append(self.Classifier(p))
             bb_reg_layer_outputs.append(self.BBRegressor(p))
 
         # Concatenate layer outputs
         # Convert from list of lists of level outputs to list of lists
         # of outputs across levels.
         # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
         # both heads return a one-element list, hence only entry [0] is kept after concatenation.
         class_logits = list(zip(*class_layer_outputs))
         class_logits = [torch.cat(list(o), dim=1) for o in class_logits][0]
         bb_outputs = list(zip(*bb_reg_layer_outputs))
         bb_outputs = [torch.cat(list(o), dim=1) for o in bb_outputs][0]
 
         # merge batch_dimension and store info in batch_ixs for re-allocation.
         batch_ixs = torch.arange(class_logits.shape[0]).unsqueeze(1).repeat(1, class_logits.shape[1]).view(-1).cuda()
         flat_class_softmax = F.softmax(class_logits.view(-1, class_logits.shape[-1]), 1)
         flat_bb_outputs = bb_outputs.view(-1, bb_outputs.shape[-1])
         detections = refine_detections(self.anchors, flat_class_softmax, flat_bb_outputs, batch_ixs, self.cf)
 
         return detections, class_logits, bb_outputs, seg_logits
diff --git a/predictor.py b/predictor.py
index 96623ce..95fa872 100644
--- a/predictor.py
+++ b/predictor.py
@@ -1,819 +1,850 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 import os
 import numpy as np
 import torch
 from scipy.stats import norm
 from collections import OrderedDict
 from multiprocessing import Pool
 import pickle
 import pandas as pd
 
 
 class Predictor:
     """
     Prediction pipeline:
     - receives a patched patient image (n_patches, c, y, x, (z)) from patient data loader.
     - forwards patches through model in chunks of batch_size. (method: batch_tiling_forward)
     - unmolds predictions (boxes and segmentations) to original patient coordinates. (method: spatial_tiling_forward)
 
     Ensembling (mode == 'test'):
     - for inference, forwards 4 mirrored versions of image to through model and unmolds predictions afterwards
       accordingly (method: data_aug_forward)
     - for inference, loads multiple parameter-sets of the trained model corresponding to different epochs. for each
       parameter-set loops over entire test set, runs prediction pipeline for each patient. (method: predict_test_set)
 
     Consolidation of predictions:
     - consolidates a patient's predictions (boxes, segmentations) collected over patches, data_aug- and temporal ensembling,
       performs clustering and weighted averaging (external function: apply_wbc_to_patient) to obtain consistent outptus.
     - for 2D networks, consolidates box predictions to 3D cubes via clustering (adaption of non-maximum surpression).
       (external function: merge_2D_to_3D_preds_per_patient)
 
     Ground truth handling:
     - dissmisses any ground truth boxes returned by the model (happens in validation mode, patch-based groundtruth)
     - if provided by data loader, adds 3D ground truth to the final predictions to be passed to the evaluator.
     """
     def __init__(self, cf, net, logger, mode):
 
         self.cf = cf
         self.logger = logger
 
         # mode is 'val' for patient-based validation/monitoring and 'test' for inference.
         self.mode = mode
 
         # model instance. In validation mode, contains parameters of current epoch.
         self.net = net
 
         # rank of current epoch loaded (for temporal averaging). this info is added to each prediction,
         # for correct weighting during consolidation.
         self.rank_ix = '0'
 
         # number of ensembled models. used to calculate the number of expected predictions per position
         # during consolidation of predictions. Default is 1 (no ensembling, e.g. in validation).
         self.n_ens = 1
 
         if self.mode == 'test':
             try:
                 self.epoch_ranking = np.load(os.path.join(self.cf.fold_dir, 'epoch_ranking.npy'))[:cf.test_n_epochs]
             except:
                 raise RuntimeError('no epoch ranking file in fold directory. '
                                    'seems like you are trying to run testing without prior training...')
             self.n_ens = cf.test_n_epochs
             if self.cf.test_aug:
                 self.n_ens *= 4
 
 
     def predict_patient(self, batch):
         """
         predicts one patient.
         called either directly via loop over validation set in exec.py (mode=='val')
         or from self.predict_test_set (mode=='test').
         in val mode:  adds 3D ground truth info to predictions and runs consolidation and 2Dto3D merging of predictions.
         in test mode: returns raw predictions (ground truth addition, consolidation, 2D to 3D merging are
                       done in self.predict_test_set, because patient predictions across several epochs might be needed
                       to be collected first, in case of temporal ensembling).
         :param batch: dictionary of one patient. read here: 'pid', optionally 'patch_crop_coords', and in val mode
                       'patient_bb_target' and 'patient_roi_labels'.
         :return: results_dict: stores the results for one patient. dictionary with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions
                             (if not merged to 3D), and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z))
-                 - monitor_values (only in validation mode)
+                 - losses (only in validation mode)
         """
         self.logger.info('evaluating patient {} for fold {} '.format(batch['pid'], self.cf.fold))
 
         # True if patient is provided in patches and predictions need to be tiled.
         self.patched_patient = True if 'patch_crop_coords' in list(batch.keys()) else False
 
         # forward batch through prediction pipeline.
         results_dict = self.data_aug_forward(batch)
 
         if self.mode == 'val':
             # append the patient-level ground truth boxes to the box list of the matching batch element.
             for b in range(batch['patient_bb_target'].shape[0]):
                 for t in range(len(batch['patient_bb_target'][b])):
                     results_dict['boxes'][b].append({'box_coords': batch['patient_bb_target'][b][t],
                                                      'box_label': batch['patient_roi_labels'][b][t],
                                                      'box_type': 'gt'})
 
             # consolidation is only run for patched patients. no real pid is passed in val mode ('dummy_pid').
             if self.patched_patient:
                 wcs_input = [results_dict['boxes'], 'dummy_pid', self.cf.class_dict, self.cf.wcs_iou, self.n_ens]
                 results_dict['boxes'] = apply_wbc_to_patient(wcs_input)[0]
 
             # optional merging of 2D box predictions to 3D, controlled by cf.merge_2D_to_3D_preds.
             if self.cf.merge_2D_to_3D_preds:
                 merge_dims_inputs = [results_dict['boxes'], 'dummy_pid', self.cf.class_dict, self.cf.merge_3D_iou]
                 results_dict['boxes'] = merge_2D_to_3D_preds_per_patient(merge_dims_inputs)[0]
 
         return results_dict
 
 
     def predict_test_set(self, batch_gen, return_results=True):
         """
         wrapper around test method, which loads multiple (or one) epoch parameters (temporal ensembling), loops through
         the test set and collects predictions per patient. Also flattens the results per patient and epoch
         and adds optional ground truth boxes for evaluation. Saves out the raw result list for later analysis and
         optionally consolidates and returns predictions immediately.
         :return: (optionally) list_of_results_per_patient: list over patient results. each entry is a dict with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions
                             (if not merged to 3D), and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': not implemented yet. todo for evaluation of instance/semantic segmentation.
         """
         dict_of_patient_results = OrderedDict()
 
         # get paths of all parameter sets to be loaded for temporal ensembling. (or just one for no temp. ensembling).
         weight_paths = [os.path.join(self.cf.fold_dir, '{}_best_checkpoint'.format(epoch), 'params.pth') for epoch in
                         self.epoch_ranking]
 
         for rank_ix, weight_path in enumerate(weight_paths):
 
             self.logger.info(('tmp ensembling over rank_ix:{} epoch:{}'.format(rank_ix, weight_path)))
             self.net.load_state_dict(torch.load(weight_path))
             self.net.eval()
             self.rank_ix = str(rank_ix)  # get string of current rank for unique patch ids.
 
             with torch.no_grad():
                 for _ in range(batch_gen['n_test']):
 
                     batch = next(batch_gen['test'])
 
                     # store batch info in patient entry of results dict.
                     if rank_ix == 0:
                         dict_of_patient_results[batch['pid']] = {}
                         dict_of_patient_results[batch['pid']]['results_list'] = []
                         dict_of_patient_results[batch['pid']]['patient_bb_target'] = batch['patient_bb_target']
                         dict_of_patient_results[batch['pid']]['patient_roi_labels'] = batch['patient_roi_labels']
 
                     # call prediction pipeline and store results in dict.
                     results_dict = self.predict_patient(batch)
-                    dict_of_patient_results[batch['pid']]['results_list'].append(results_dict['boxes'])
+                    dict_of_patient_results[batch['pid']]['results_list'].append({"boxes": results_dict['boxes']})
+
 
 
         self.logger.info('finished predicting test set. starting post-processing of predictions.')
-        list_of_results_per_patient = []
+        results_per_patient = []
 
         # loop over patients again to flatten results across epoch predictions.
         # if provided, add ground truth boxes for evaluation.
         for pid, p_dict in dict_of_patient_results.items():
 
             tmp_ens_list = p_dict['results_list']
             results_dict = {}
             # collect all boxes/seg_preds of same batch_instance over temporal instances.
-            results_dict['boxes'] = [[item for d in tmp_ens_list for item in d[batch_instance]]
-                                     for batch_instance in range(len(tmp_ens_list[0]))]
+            b_size = len(tmp_ens_list[0]["boxes"])  # entries are {"boxes": ...} dicts; len() of the dict itself is always 1.
+            results_dict['boxes'] = [[item for rank_dict in tmp_ens_list for item in rank_dict["boxes"][batch_instance]]
+                                     for batch_instance in range(b_size)]
 
             # TODO return for instance segmentation:
             # results_dict['seg_preds'] = np.mean(results_dict['seg_preds'], 1)[:, None]
             # results_dict['seg_preds'] = np.array([[item for d in tmp_ens_list for item in d['seg_preds'][batch_instance]]
             #                                       for batch_instance in range(len(tmp_ens_list[0]['boxes']))])
 
             # add 3D ground truth boxes for evaluation.
             for b in range(p_dict['patient_bb_target'].shape[0]):
                 for t in range(len(p_dict['patient_bb_target'][b])):
                     results_dict['boxes'][b].append({'box_coords': p_dict['patient_bb_target'][b][t],
                                                      'box_label': p_dict['patient_roi_labels'][b][t],
                                                      'box_type': 'gt'})
 
-            list_of_results_per_patient.append([results_dict['boxes'], pid])
+            results_per_patient.append([results_dict, pid])
 
         # save out raw predictions.
         out_string = 'raw_pred_boxes_hold_out_list' if self.cf.hold_out_test_set else 'raw_pred_boxes_list'
         with open(os.path.join(self.cf.fold_dir, '{}.pickle'.format(out_string)), 'wb') as handle:
-            pickle.dump(list_of_results_per_patient, handle)
+            pickle.dump(results_per_patient, handle)
 
         if return_results:
-
+            final_patient_box_results = [(res_dict["boxes"], pid) for res_dict, pid in results_per_patient]
             # consolidate predictions.
             self.logger.info('applying wcs to test set predictions with iou = {} and n_ens = {}.'.format(
                 self.cf.wcs_iou, self.n_ens))
             pool = Pool(processes=6)
-            mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.wcs_iou, self.n_ens] for ii in list_of_results_per_patient]
-            list_of_results_per_patient = pool.map(apply_wbc_to_patient, mp_inputs, chunksize=1)
+            mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.wcs_iou, self.n_ens] for ii in final_patient_box_results]
+            final_patient_box_results = pool.map(apply_wbc_to_patient, mp_inputs, chunksize=1)
             pool.close()
             pool.join()
 
             # merge 2D boxes to 3D cubes. (if model predicts 2D but evaluation is run in 3D)
             if self.cf.merge_2D_to_3D_preds:
                 self.logger.info('applying 2Dto3D merging to test set predictions with iou = {}.'.format(self.cf.merge_3D_iou))
                 pool = Pool(processes=6)
-                mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.merge_3D_iou] for ii in list_of_results_per_patient]
-                list_of_results_per_patient = pool.map(merge_2D_to_3D_preds_per_patient, mp_inputs, chunksize=1)
+                mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.merge_3D_iou] for ii in final_patient_box_results]
+                final_patient_box_results = pool.map(merge_2D_to_3D_preds_per_patient, mp_inputs, chunksize=1)
                 pool.close()
                 pool.join()
 
-            return list_of_results_per_patient
+            # final_patient_box_results holds [consolidated_boxes, pid] per patient: write the boxes back into the results dicts.
+            for ix in range(len(results_per_patient)):
+                assert results_per_patient[ix][1] == final_patient_box_results[ix][1], "should be same pid"
+                results_per_patient[ix][0]["boxes"] = final_patient_box_results[ix][0]
+
+            return results_per_patient
 
 
     def load_saved_predictions(self, apply_wbc=False):
         """
         loads raw predictions saved by self.predict_test_set. consolidates and merges 2D boxes to 3D cubes for evaluation.
         (if model predicts 2D but evaluation is run in 3D)
-        :return: (optionally) list_of_results_per_patient: list over patient results. each entry is a dict with keys:
+        :return: results_list: list over patient results. each entry is [results_dict, pid]; results_dict has keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions
                             (if not merged to 3D), and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': not implemented yet. todo for evaluation of instance/semantic segmentation.
         """
 
         # load predictions for a single test-set fold.
         if not self.cf.hold_out_test_set:
             with open(os.path.join(self.cf.fold_dir, 'raw_pred_boxes_list.pickle'), 'rb') as handle:
-                list_of_results_per_patient = pickle.load(handle)
+                results_list = pickle.load(handle)
+            box_results_list = [(res_dict["boxes"], pid) for res_dict, pid in results_list]
             da_factor = 4 if self.cf.test_aug else 1
             n_ens = self.cf.test_n_epochs * da_factor
             self.logger.info('loaded raw test set predictions with n_patients = {} and n_ens = {}'.format(
-                len(list_of_results_per_patient), n_ens))
+                len(results_list), n_ens))
 
         # if hold out test set was perdicted, aggregate predictions of all trained models
         # corresponding to all CV-folds and flatten them.
         else:
-            boxes_list = []
-            for fold in self.cf.folds:
+            self.logger.info("loading saved predictions of hold-out test set")
+            fold_dirs = sorted([os.path.join(self.cf.exp_dir, f) for f in os.listdir(self.cf.exp_dir) if
+                                os.path.isdir(os.path.join(self.cf.exp_dir, f)) and f.startswith("fold")])
+
+            results_list = []
+            folds_loaded = 0
+            for fold in range(self.cf.n_cv_splits):
                 fold_dir = os.path.join(self.cf.exp_dir, 'fold_{}'.format(fold))
-                with open(os.path.join(fold_dir, 'raw_pred_boxes_hold_out_list.pickle'), 'rb') as handle:
-                    fold_list = pickle.load(handle)
-                    pids = [ii[1] for ii in fold_list]
-                    boxes_list.append([ii[0] for ii in fold_list])
-            list_of_results_per_patient = [[[[box for fold_list in boxes_list for box in fold_list[pix][0]
-                                              if box['box_type'] == 'det']], pid] for pix, pid in enumerate(pids)]
+                if fold_dir in fold_dirs:
+                    with open(os.path.join(fold_dir, 'raw_pred_boxes_hold_out_list.pickle'), 'rb') as handle:
+                        fold_list = pickle.load(handle)
+                        results_list += fold_list
+                        folds_loaded += 1
+                else:
+                    self.logger.info("Skipping fold {} since no saved predictions found.".format(fold))
+            box_results_list = []
+            for res_dict, pid in results_list: #without filtering gt out:
+                box_results_list.append((res_dict['boxes'], pid))
+
             da_factor = 4 if self.cf.test_aug else 1
-            n_ens = self.cf.test_n_epochs * da_factor * len(self.cf.folds)
+            n_ens = self.cf.test_n_epochs * da_factor * folds_loaded
 
         # consolidate predictions.
         if apply_wbc:
             self.logger.info('applying wcs to test set predictions with iou = {} and n_ens = {}.'.format(
                 self.cf.wcs_iou, n_ens))
             pool = Pool(processes=6)
-            mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.wcs_iou, n_ens] for ii in list_of_results_per_patient]
-            list_of_results_per_patient = pool.map(apply_wbc_to_patient, mp_inputs, chunksize=1)
+            mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.wcs_iou, n_ens] for ii in box_results_list]
+            box_results_list = pool.map(apply_wbc_to_patient, mp_inputs, chunksize=1)
             pool.close()
             pool.join()
-        else:
-            list_of_results_per_patient = list_of_results_per_patient
 
         # merge 2D box predictions to 3D cubes (if model predicts 2D but evaluation is run in 3D)
         if self.cf.merge_2D_to_3D_preds:
             self.logger.info(
                 'applying 2Dto3D merging to test set predictions with iou = {}.'.format(self.cf.merge_3D_iou))
             pool = Pool(processes=6)
-            mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.merge_3D_iou] for ii in list_of_results_per_patient]
-            list_of_results_per_patient = pool.map(merge_2D_to_3D_preds_per_patient, mp_inputs, chunksize=1)
+            mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.merge_3D_iou] for ii in box_results_list]
+            box_results_list = pool.map(merge_2D_to_3D_preds_per_patient, mp_inputs, chunksize=1)
             pool.close()
             pool.join()
 
-        return list_of_results_per_patient
+
+        for ix in range(len(results_list)):
+            assert np.all(
+                results_list[ix][1] == box_results_list[ix][1]), "pid mismatch between loaded and aggregated results"
+            results_list[ix][0]["boxes"] = box_results_list[ix][0]
+
+        return results_list  # holds (results_dict, pid)
 
 
     def data_aug_forward(self, batch):
         """
         in val_mode: passes batch through to spatial_tiling method without data_aug.
         in test_mode: if cf.test_aug is set in configs, createst 4 mirrored versions of the input image,
         passes all of them to the next processing step (spatial_tiling method) and re-transforms returned predictions
         to original image version.
         :return. results_dict: stores the results for one patient. dictionary with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions,
                             and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z))
-                 - monitor_values (only in validation mode)
+                 - 'torch_loss', 'class_loss': losses (only in validation mode, and only if the network returned them)
         """
         patch_crops = batch['patch_crop_coords'] if self.patched_patient else None
         results_list = [self.spatial_tiling_forward(batch, patch_crops)]
         org_img_shape = batch['original_img_shape']
 
         if self.mode == 'test' and self.cf.test_aug:
 
             if self.patched_patient:
                 # apply mirror transformations to patch-crop coordinates, for correct tiling in spatial_tiling method.
                 mirrored_patch_crops = get_mirrored_patch_crops(patch_crops, batch['original_img_shape'])
             else:
                 mirrored_patch_crops = [None] * 3
 
             img = np.copy(batch['data'])
 
             # first mirroring: y-axis.
             batch['data'] = np.flip(img, axis=2).copy()
             chunk_dict = self.spatial_tiling_forward(batch, mirrored_patch_crops[0], n_aug='1')
             # re-transform coordinates.
             for ix in range(len(chunk_dict['boxes'])):
                 for boxix in range(len(chunk_dict['boxes'][ix])):
                     coords = chunk_dict['boxes'][ix][boxix]['box_coords'].copy()
                     coords[0] = org_img_shape[2] - chunk_dict['boxes'][ix][boxix]['box_coords'][2]
                     coords[2] = org_img_shape[2] - chunk_dict['boxes'][ix][boxix]['box_coords'][0]
                     assert coords[2] >= coords[0], [coords, chunk_dict['boxes'][ix][boxix]['box_coords'].copy()]
                     assert coords[3] >= coords[1], [coords, chunk_dict['boxes'][ix][boxix]['box_coords'].copy()]
                     chunk_dict['boxes'][ix][boxix]['box_coords'] = coords
             # re-transform segmentation predictions.
             chunk_dict['seg_preds'] = np.flip(chunk_dict['seg_preds'], axis=2)
             results_list.append(chunk_dict)
 
             # second mirroring: x-axis.
             batch['data'] = np.flip(img, axis=3).copy()
             chunk_dict = self.spatial_tiling_forward(batch, mirrored_patch_crops[1], n_aug='2')
             # re-transform coordinates.
             for ix in range(len(chunk_dict['boxes'])):
                 for boxix in range(len(chunk_dict['boxes'][ix])):
                     coords = chunk_dict['boxes'][ix][boxix]['box_coords'].copy()
                     coords[1] = org_img_shape[3] - chunk_dict['boxes'][ix][boxix]['box_coords'][3]
                     coords[3] = org_img_shape[3] - chunk_dict['boxes'][ix][boxix]['box_coords'][1]
                     assert coords[2] >= coords[0], [coords, chunk_dict['boxes'][ix][boxix]['box_coords'].copy()]
                     assert coords[3] >= coords[1], [coords, chunk_dict['boxes'][ix][boxix]['box_coords'].copy()]
                     chunk_dict['boxes'][ix][boxix]['box_coords'] = coords
             # re-transform segmentation predictions.
             chunk_dict['seg_preds'] = np.flip(chunk_dict['seg_preds'], axis=3)
             results_list.append(chunk_dict)
 
             # third mirroring: y-axis and x-axis.
             batch['data'] = np.flip(np.flip(img, axis=2), axis=3).copy()
             chunk_dict = self.spatial_tiling_forward(batch, mirrored_patch_crops[2], n_aug='3')
             # re-transform coordinates.
             for ix in range(len(chunk_dict['boxes'])):
                 for boxix in range(len(chunk_dict['boxes'][ix])):
                     coords = chunk_dict['boxes'][ix][boxix]['box_coords'].copy()
                     coords[0] = org_img_shape[2] - chunk_dict['boxes'][ix][boxix]['box_coords'][2]
                     coords[2] = org_img_shape[2] - chunk_dict['boxes'][ix][boxix]['box_coords'][0]
                     coords[1] = org_img_shape[3] - chunk_dict['boxes'][ix][boxix]['box_coords'][3]
                     coords[3] = org_img_shape[3] - chunk_dict['boxes'][ix][boxix]['box_coords'][1]
                     assert coords[2] >= coords[0], [coords, chunk_dict['boxes'][ix][boxix]['box_coords'].copy()]
                     assert coords[3] >= coords[1], [coords, chunk_dict['boxes'][ix][boxix]['box_coords'].copy()]
                     chunk_dict['boxes'][ix][boxix]['box_coords'] = coords
             # re-transform segmentation predictions.
             chunk_dict['seg_preds'] = np.flip(np.flip(chunk_dict['seg_preds'], axis=2), axis=3).copy()
             results_list.append(chunk_dict)
 
             batch['data'] = img
 
         # aggregate all boxes/seg_preds per batch element from data_aug predictions.
         results_dict = {}
         results_dict['boxes'] = [[item for d in results_list for item in d['boxes'][batch_instance]]
                                  for batch_instance in range(org_img_shape[0])]
         results_dict['seg_preds'] = np.array([[item for d in results_list for item in d['seg_preds'][batch_instance]]
                                               for batch_instance in range(org_img_shape[0])])
         if self.mode == 'val':
-            results_dict['monitor_values'] = results_list[0]['monitor_values']
-
+            try:
+                results_dict['torch_loss'] = results_list[0]['torch_loss']
+                results_dict['class_loss'] = results_list[0]['class_loss']
+            except KeyError:
+                pass
         return results_dict
 
 
     def spatial_tiling_forward(self, batch, patch_crops=None, n_aug='0'):
         """
         forwards batch to batch_tiling_forward method and receives and returns a dictionary with results.
         if patch-based prediction, the results received from batch_tiling_forward will be on a per-patch-basis.
         this method uses the provided patch_crops to re-transform all predictions to whole-image coordinates.
         Patch-origin information of all box-predictions will be needed for consolidation, hence it is stored as
         'patch_id', which is a unique string for each patch (also takes current data aug and temporal epoch instances
         into account). all box predictions get additional information about the amount overlapping patches at the
         respective position (used for consolidation).
         :return. results_dict: stores the results for one patient. dictionary with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions,
                             and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z))
-                 - monitor_values (only in validation mode)
+                 - 'torch_loss', 'class_loss': losses (only in validation mode, and only if the network returned them)
         """
         if patch_crops is not None:
 
             patches_dict = self.batch_tiling_forward(batch)
 
             results_dict = {'boxes': [[] for _ in range(batch['original_img_shape'][0])]}
 
             # instanciate segemntation output array. Will contain averages over patch predictions.
             out_seg_preds = np.zeros(batch['original_img_shape'], dtype=np.float16)[:, 0][:, None]
             # counts patch instances per pixel-position.
             patch_overlap_map = np.zeros_like(out_seg_preds, dtype='uint8')
 
             #unmold segmentation outputs. loop over patches.
             for pix, pc in enumerate(patch_crops):
                 if self.cf.dim == 3:
                     out_seg_preds[:, :, pc[0]:pc[1], pc[2]:pc[3], pc[4]:pc[5]] += patches_dict['seg_preds'][pix][None]
                     patch_overlap_map[:, :, pc[0]:pc[1], pc[2]:pc[3], pc[4]:pc[5]] += 1
                 else:
                     out_seg_preds[pc[4]:pc[5], :, pc[0]:pc[1], pc[2]:pc[3], ] += patches_dict['seg_preds'][pix]
                     patch_overlap_map[pc[4]:pc[5], :, pc[0]:pc[1], pc[2]:pc[3], ] += 1
 
             # take mean in overlapping areas.
             out_seg_preds[patch_overlap_map > 0] /= patch_overlap_map[patch_overlap_map > 0]
             results_dict['seg_preds'] = out_seg_preds
 
             # unmold box outputs. loop over patches.
             for pix, pc in enumerate(patch_crops):
                 patch_boxes = patches_dict['boxes'][pix]
 
                 for box in patch_boxes:
 
                     # add unique patch id for consolidation of predictions.
                     box['patch_id'] = self.rank_ix + '_' + n_aug + '_' + str(pix)
 
                     # boxes from the edges of a patch have a lower prediction quality, than the ones at patch-centers.
                     # hence they will be downweighted for consolidation, using the 'box_patch_center_factor', which is
                     # obtained by a normal distribution over positions in the patch and average over spatial dimensions.
                     # Also the info 'box_n_overlaps' is stored for consolidation, which depicts the amount over
                     # overlapping patches at the box's position.
                     c = box['box_coords']
                     box_centers = [(c[ii] + c[ii + 2]) / 2 for ii in range(2)]
                     if self.cf.dim == 3:
                         box_centers.append((c[4] + c[5]) / 2)
                     box['box_patch_center_factor'] = np.mean(
                         [norm.pdf(bc, loc=pc, scale=pc * 0.8) * np.sqrt(2 * np.pi) * pc * 0.8 for bc, pc in
                          zip(box_centers, np.array(self.cf.patch_size) / 2)])
                     if self.cf.dim == 3:
                         c += np.array([pc[0], pc[2], pc[0], pc[2], pc[4], pc[4]])
                         int_c = [int(np.floor(ii)) if ix%2 == 0 else int(np.ceil(ii)) for ix, ii in enumerate(c)]
                         box['box_n_overlaps'] = np.mean(patch_overlap_map[:, :, int_c[1]:int_c[3], int_c[0]:int_c[2], int_c[4]:int_c[5]])
                         results_dict['boxes'][0].append(box)
                     else:
                         c += np.array([pc[0], pc[2], pc[0], pc[2]])
                         int_c = [int(np.floor(ii)) if ix % 2 == 0 else int(np.ceil(ii)) for ix, ii in enumerate(c)]
                         box['box_n_overlaps'] = np.mean(patch_overlap_map[pc[4], :, int_c[1]:int_c[3], int_c[0]:int_c[2]])
                         results_dict['boxes'][pc[4]].append(box)
 
             if self.mode == 'val':
-                results_dict['monitor_values'] = patches_dict['monitor_values']
-
+                try:
+                    results_dict['torch_loss'] = patches_dict['torch_loss']
+                    results_dict['class_loss'] = patches_dict['class_loss']
+                except KeyError:
+                    pass
         # if predictions are not patch-based:
         # add patch-origin info to boxes (entire image is the same patch with overlap=1) and return results.
         else:
             results_dict = self.batch_tiling_forward(batch)
             for b in results_dict['boxes']:
                 for box in b:
                     box['box_patch_center_factor'] = 1
                     box['box_n_overlaps'] = 1
                     box['patch_id'] = self.rank_ix + '_' + n_aug
 
         return results_dict
 
 
     def batch_tiling_forward(self, batch):
         """
         calls the actual network forward method. in patch-based prediction, the batch dimension might be overladed
         with n_patches >> batch_size, which would exceed gpu memory. In this case, batches are processed in chunks of
         batch_size. validation mode calls the train method to monitor losses (returned ground truth objects are discarded).
         test mode calls the test forward method, no ground truth required / involved.
         :return. results_dict: stores the results for one patient. dictionary with keys:
                  - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                             one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions,
                             and a dummy batch dimension of 1 for 3D predictions.
                  - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z))
-                 - monitor_values (only in validation mode)
+                 - 'torch_loss', 'class_loss': losses (only in validation mode, and only if the network returned them)
         """
         self.logger.info('forwarding (patched) patient with shape: {}'.format(batch['data'].shape))
 
         img = batch['data']
 
         if img.shape[0] <= self.cf.batch_size:
 
             if self.mode == 'val':
                 # call training method to monitor losses
                 results_dict = self.net.train_forward(batch, is_validation=True)
                 # discard returned ground-truth boxes (also training info boxes).
                 results_dict['boxes'] = [[box for box in b if box['box_type'] == 'det'] for b in results_dict['boxes']]
             else:
                 results_dict = self.net.test_forward(batch, return_masks=self.cf.return_masks_in_test)
 
         else:
             split_ixs = np.split(np.arange(img.shape[0]), np.arange(img.shape[0])[::self.cf.batch_size])
             chunk_dicts = []
             for chunk_ixs in split_ixs[1:]:  # first split is elements before 0, so empty
                 b = {k: batch[k][chunk_ixs] for k in batch.keys()
                      if (isinstance(batch[k], np.ndarray) and batch[k].shape[0] == img.shape[0])}
                 if self.mode == 'val':
                     chunk_dicts += [self.net.train_forward(b, is_validation=True)]
                 else:
                     chunk_dicts += [self.net.test_forward(b, return_masks=self.cf.return_masks_in_test)]
 
 
             results_dict = {}
             # flatten out batch elements from chunks ([chunk, chunk] -> [b, b, b, b, ...])
             results_dict['boxes'] = [item for d in chunk_dicts for item in d['boxes']]
             results_dict['seg_preds'] = np.array([item for d in chunk_dicts for item in d['seg_preds']])
 
             if self.mode == 'val':
-                # estimate metrics by mean over batch_chunks. Most similar to training metrics.
-                results_dict['monitor_values'] = \
-                    {k:np.mean([d['monitor_values'][k] for d in chunk_dicts])
-                     for k in chunk_dicts[0]['monitor_values'].keys()}
+                try:
+                    # estimate metrics by mean over batch_chunks. Most similar to training metrics.
+                    results_dict['torch_loss'] = torch.mean(torch.cat([d['torch_loss'] for d in chunk_dicts]))
+                    results_dict['class_loss'] = np.mean([d['class_loss'] for d in chunk_dicts])
+                except KeyError:
+                    # losses are not necessarily monitored
+                    pass
                 # discard returned ground-truth boxes (also training info boxes).
                 results_dict['boxes'] = [[box for box in b if box['box_type'] == 'det'] for b in results_dict['boxes']]
 
         return results_dict
 
 
 
 def apply_wbc_to_patient(inputs):
     """
     wrapper around prediction box consolidation: weighted cluster scoring (wcs). processes a single patient.
     loops over batch elements in patient results (1 in 3D, slices in 2D) and foreground classes,
     aggregates and stores results in new list.
     :return. patient_results_list: list over batch elements. each element is a list over boxes, where each box is
                                  one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D
                                  predictions, and a dummy batch dimension of 1 for 3D predictions.
     :return. pid: string. patient id.
     """
     in_patient_results_list, pid, class_dict, wcs_iou, n_ens = inputs
     out_patient_results_list = [[] for _ in range(len(in_patient_results_list))]
 
     for bix, b in enumerate(in_patient_results_list):
 
         for cl in list(class_dict.keys()):
 
             boxes = [(ix, box) for ix, box in enumerate(b) if (box['box_type'] == 'det' and box['box_pred_class_id'] == cl)]
             box_coords = np.array([b[1]['box_coords'] for b in boxes])
             box_scores = np.array([b[1]['box_score'] for b in boxes])
             box_center_factor = np.array([b[1]['box_patch_center_factor'] for b in boxes])
             box_n_overlaps = np.array([b[1]['box_n_overlaps'] for b in boxes])
             box_patch_id = np.array([b[1]['patch_id'] for b in boxes])
 
             if 0 not in box_scores.shape:
                 keep_scores, keep_coords = weighted_box_clustering(
                     np.concatenate((box_coords, box_scores[:, None], box_center_factor[:, None],
                                     box_n_overlaps[:, None]), axis=1), box_patch_id, wcs_iou, n_ens)
 
                 for boxix in range(len(keep_scores)):
                     out_patient_results_list[bix].append({'box_type': 'det', 'box_coords': keep_coords[boxix],
                                              'box_score': keep_scores[boxix], 'box_pred_class_id': cl})
 
         # add gt boxes back to new output list.
         out_patient_results_list[bix].extend([box for box in b if box['box_type'] == 'gt'])
 
     return [out_patient_results_list, pid]
 
 
 
 def merge_2D_to_3D_preds_per_patient(inputs):
     """
     wrapper around 2Dto3D merging operation. Processes a single patient. Takes 2D patient results (slices in batch dimension)
     and returns 3D patient results (dummy batch dimension of 1). Applies an adaption of Non-Maximum Surpression
     (Detailed methodology is described in nms_2to3D).
     :return. results_dict_boxes: list over batch elements (1 in 3D). each element is a list over boxes, where each box is
                                  one dictionary: [[box_0, ...], [box_n,...]].
     :return. pid: string. patient id.
     """
     in_patient_results_list, pid, class_dict, merge_3D_iou = inputs
     out_patient_results_list = []
 
     for cl in list(class_dict.keys()):
         boxes, slice_ids = [], []
         # collect box predictions over batch dimension (slices) and store slice info as slice_ids.
         for bix, b in enumerate(in_patient_results_list):
             det_boxes = [(ix, box) for ix, box in enumerate(b) if
                      (box['box_type'] == 'det' and box['box_pred_class_id'] == cl)]
             boxes += det_boxes
             slice_ids += [bix] * len(det_boxes)
 
         box_coords = np.array([b[1]['box_coords'] for b in boxes])
         box_scores = np.array([b[1]['box_score'] for b in boxes])
         slice_ids = np.array(slice_ids)
 
         if 0 not in box_scores.shape:
             keep_ix, keep_z = nms_2to3D(
                 np.concatenate((box_coords, box_scores[:, None], slice_ids[:, None]), axis=1), merge_3D_iou)
         else:
             keep_ix, keep_z = [], []
 
         # store kept predictions in new results list and add corresponding z-dimension info to coordinates.
         for kix, kz in zip(keep_ix, keep_z):
             out_patient_results_list.append({'box_type': 'det', 'box_coords': list(box_coords[kix]) + kz,
                                              'box_score': box_scores[kix], 'box_pred_class_id': cl})
 
     out_patient_results_list += [box for b in in_patient_results_list for box in b if box['box_type'] == 'gt']
     out_patient_results_list = [out_patient_results_list] # add dummy batch dimension 1 for 3D.
 
     return [out_patient_results_list, pid]
 
 
 
 def weighted_box_clustering(dets, box_patch_id, thresh, n_ens):
     """
     Consolidates overlapping box predictions into clusters and reduces each cluster to one weighted-average box.
     Boxes are visited in order of decreasing score. The currently highest scoring box collects all remaining
     boxes with iou > thresh (like in NMS) into one cluster, and all members of the cluster are removed afterwards.
     Each member is weighted by (iou with the top box) * (box size) * (patch center factor).
     The cluster score is the weighted sum of member scores divided by the sum of weights, where every missing
     prediction adds one mean weight to the divisor. The number of expected predictions is
     n_ens * mean(box_n_ovs of the cluster), and the number of missing predictions is this expectation minus the
     number of unique patch ids in the cluster (floored at 0).
     Cluster coordinates are the average of member coordinates weighted by the weighted member scores.
     Clusters with a resulting score <= 0.01 are dropped.
     :param dets: (n_dets, (y1, x1, y2, x2, (z1), (z2), scores, box_pc_facts, box_n_ovs)).
                  7 columns are treated as 2D boxes, any other column count as 3D boxes.
     :param box_patch_id: array (n_dets). id of the patch each box was predicted in, indexed like dets.
     :param thresh: threshold for iou_matching.
     :param n_ens: number of models, that are ensembled. (-> number of expected predictions per position)
     :return: keep_scores: list (n_keep) of new scores of boxes to be kept.
     :return: keep_coords: list (n_keep) of lists (y1, x1, y2, x2, (z1), (z2)), new coordinates of boxes to be kept.
     """
     is_3d = dets.shape[1] != 7
     y1, x1, y2, x2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
     scores, box_pc_facts, box_n_ovs = dets[:, -3], dets[:, -2], dets[:, -1]
 
     areas = (y2 - y1 + 1) * (x2 - x1 + 1)
     # coordinate columns in output order, used for the weighted coordinate average below.
     coord_columns = [y1, x1, y2, x2]
     if is_3d:
         z1, z2 = dets[:, 4], dets[:, 5]
         areas *= (z2 - z1 + 1)
         coord_columns += [z1, z2]
 
     # indices of all not yet clustered boxes, best score first.
     order = scores.argsort()[::-1]
 
     keep_scores, keep_coords = [], []
 
     while order.size > 0:
         top = order[0]
 
         # intersection of the top box with every remaining box (including itself).
         inter_w = np.maximum(0.0, np.minimum(x2[top], x2[order]) - np.maximum(x1[top], x1[order]) + 1)
         inter_h = np.maximum(0.0, np.minimum(y2[top], y2[order]) - np.maximum(y1[top], y1[order]) + 1)
         inter = inter_w * inter_h
         if is_3d:
             inter_d = np.maximum(0.0, np.minimum(z2[top], z2[order]) - np.maximum(z1[top], z1[order]) + 1)
             inter *= inter_d
 
         ovr = inter / (areas[top] + areas[order] - inter)
 
         # positions (within order) of all boxes belonging to the cluster, and their indices into dets.
         matches = np.argwhere(ovr > thresh)
         members = order[matches]
 
         # per-member weight: overlap with top box * box size * patch center factor.
         weights = ovr[matches] * areas[members] * box_pc_facts[members]
         weighted_scores = scores[members] * weights
 
         # one prediction per patch is expected: number of ensembled models times the mean number of
         # overlapping patches at the positions of the cluster members.
         n_expected_preds = n_ens * np.mean(box_n_ovs[members])
 
         # patches that did not contribute a box to this cluster count as missing predictions.
         n_contributing = np.unique(box_patch_id[members]).shape[0]
         n_missing_preds = np.max((0, n_expected_preds - n_contributing))
 
         # every missing prediction enters the divisor with the mean weight of the cluster.
         denom = np.sum(weights) + n_missing_preds * np.mean(weights)
         avg_score = np.sum(weighted_scores) / denom
 
         # coordinates are averaged over existing predictions only.
         score_mass = np.sum(weighted_scores)
         avg_coords = [np.sum(column[members] * weighted_scores) / score_mass for column in coord_columns]
 
         # drop clusters whose score collapsed, e.g. due to many missing predictions.
         if avg_score > 0.01:
             keep_scores.append(avg_score)
             keep_coords.append(avg_coords)
 
         # continue with the boxes that were not matched to this cluster.
         order = order[ovr <= thresh]
 
     return keep_scores, keep_coords
 
 
 
 def nms_2to3D(dets, thresh):
     """
     Merges 2D boxes of several slices into 3D cubes with an adaptation of non-maximum suppression.
     All boxes are treated as if they were lying in one slice. The highest scoring remaining box (the 'core')
     collects all remaining boxes with xy-iou > thresh into a cluster. Of this cluster, only boxes whose slice
     is 'connected' to the core slice are suppressed: starting at the core slice, slices are connected up to the
     first slice id in which the cluster has no box (a 'hole'), in both z-directions.
 
     example: the core box lies in slice 50, other cluster members lie in 23, 46, 48, 49, 51, 52, 53, 56, 57.
     Only 48, 49, 51, 52, 53 are merged with 50 into one cube (47 and 54 are holes). Boxes in the remaining
     slices are not suppressed and stay available for building further clusters.
 
     The kept box is the core box. Its z-extent is set to one slice below the lowest and one slice above the
     highest merged slice.
 
     This algorithm works better with a certain min_confidence of predictions, because low confidence
     (e.g. noisy/cluttery) predictions can break the relatively strong assumption of defining cubes'
     z-boundaries at the first 'hole' in the cluster.
 
     :param dets: (n_detections, (y1, x1, y2, x2, scores, slice_id))
     :param thresh: iou matching threshold (like in NMS).
     :return: keep: list (n_keep) of indices into dets of the boxes to be kept.
     :return: keep_z: list (n_keep) of [z1, z2], z-coordinates to be added to the kept boxes in order to form cubes.
     """
     y1, x1, y2, x2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
     scores, slice_id = dets[:, -2], dets[:, -1]
 
     areas = (x2 - x1 + 1) * (y2 - y1 + 1)
     # indices of all not yet suppressed boxes, best score first.
     order = scores.argsort()[::-1]
 
     keep, keep_z = [], []
 
     while order.size > 0:
         core = order[0]
 
         # xy-intersection of the core box with every remaining box (including itself).
         inter_w = np.maximum(0.0, np.minimum(x2[core], x2[order]) - np.maximum(x1[core], x1[order]) + 1)
         inter_h = np.maximum(0.0, np.minimum(y2[core], y2[order]) - np.maximum(y1[core], y1[order]) + 1)
         inter = inter_w * inter_h
         ovr = inter / (areas[core] + areas[order] - inter)
 
         # positions (within order) of all boxes overlapping the core box, and the slices they lie in.
         matches = np.argwhere(ovr > thresh)
         cluster_slices = slice_id[order[matches]]
         core_slice = slice_id[int(core)]
         lowest_slice, highest_slice = np.min(cluster_slices), np.max(cluster_slices)
 
         # slice ids between the core and the cluster's extremes, in which the cluster has no box.
         holes_above = [s for s in np.arange(core_slice, highest_slice) if s not in cluster_slices]
         holes_below = [s for s in np.arange(lowest_slice, core_slice) if s not in cluster_slices]
 
         # the connected range ends at the hole closest to the core, or at the cluster's extreme slice.
         upper_limit = np.min(holes_above) if holes_above else highest_slice
         lower_limit = np.max(holes_below) if holes_below else lowest_slice
         connected = (cluster_slices >= lower_limit) & (cluster_slices <= upper_limit)
         z_matches = matches[connected]
 
         merged_slices = slice_id[order[z_matches]]
         keep.append(core)
         keep_z.append([np.min(merged_slices) - 1, np.max(merged_slices) + 1])
 
         # only the merged boxes are suppressed, all others can seed or join further clusters.
         order = np.delete(order, z_matches, axis=0)
 
     return keep, keep_z
 
 
 
 def get_mirrored_patch_crops(patch_crops, org_img_shape):
     """
     Applies 3 mirror transformations (y-axis, x-axis, y&x-axis) to given patch crop coordinates and returns
     the transformed coordinates. Handles 2D (4 coordinates) and 3D (6 coordinates) crops.
     Mirroring an axis maps the coordinate pair (lo, hi) to (size - hi, size - lo): crop entries 0 and 1 are
     mirrored against org_img_shape[2], crop entries 2 and 3 against org_img_shape[3]. Entries 4 and 5 of
     3D crops are passed through unchanged.
     :param patch_crops: list of crops: each element is a list of 4 or 6 coordinates of one crop.
     :param org_img_shape: shape of patient volume used as world coordinates.
     :return: list of mirrored patch crops: length=3. each element is a list of transformed patch crops.
     """
     def _mirror(crop, flip_y, flip_x):
         if flip_y:
             first_pair = [org_img_shape[2] - crop[1], org_img_shape[2] - crop[0]]
         else:
             first_pair = [crop[0], crop[1]]
         if flip_x:
             second_pair = [org_img_shape[3] - crop[3], org_img_shape[3] - crop[2]]
         else:
             second_pair = [crop[2], crop[3]]
         mirrored = first_pair + second_pair
         if len(crop) != 4:
             # 3D crop: the last two coordinates are not affected by in-plane mirroring.
             mirrored += [crop[4], crop[5]]
         return mirrored
 
     # order of the returned transforms: y-axis, x-axis, y-axis and x-axis.
     flips = ((True, False), (False, True), (True, True))
     return [[_mirror(crop, flip_y, flip_x) for crop in patch_crops] for flip_y, flip_x in flips]
 
 
 
diff --git a/utils/exp_utils.py b/utils/exp_utils.py
index 58c8c3e..61d6544 100644
--- a/utils/exp_utils.py
+++ b/utils/exp_utils.py
@@ -1,418 +1,420 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 import sys
 import subprocess
 import os
 
 import plotting
 import importlib.util
 import pickle
 
 import logging
 from torch.utils.tensorboard import SummaryWriter
 
 from collections import OrderedDict
 import numpy as np
 import torch
 import pandas as pd
 
 
 class CombinedLogger(object):
     """Combine a python logger (log file + console) and a tensorboard SummaryWriter behind one object.
 
     Attributes that are not defined on this class are looked up on the python logger first and on the
     tensorboard writer second, so e.g. combinedlogger.info(...) and combinedlogger.add_scalars(...) both work.
     """
 
     def __init__(self, name, log_dir, server_env=True, fold="all"):
         """
         :param name: name of the python logger (passed to logging.getLogger).
         :param log_dir: directory holding the tensorboard files (log_dir/tboard) and the log files
             (log_dir/fold_<fold>/exec.log).
         :param server_env: if True, console output uses a plain StreamHandler, else a ColorHandler.
         :param fold: fold identifier used in the log file path and as default tensorboard title.
         """
         self.pylogger = logging.getLogger(name)
         self.tboard = SummaryWriter(log_dir=os.path.join(log_dir, "tboard"))
         self.log_dir = log_dir
         self.fold = str(fold)
         self.server_env = server_env
 
         self.pylogger.setLevel(logging.DEBUG)
         self.log_file = os.path.join(log_dir, "fold_"+self.fold, 'exec.log')
         self._attach_handlers()
         self.pylogger.propagate = False
 
     def _attach_handlers(self):
         """add a file handler for self.log_file and one console handler to the python logger."""
         os.makedirs(os.path.dirname(self.log_file), exist_ok=True)
         self.pylogger.addHandler(logging.FileHandler(self.log_file))
         if not self.server_env:
             self.pylogger.addHandler(ColorHandler())
         else:
             self.pylogger.addHandler(logging.StreamHandler())
 
     def __getattr__(self, attr):
         """delegate all undefined attribute requests to objects of
         this class in order pylogger, tboard (first find first serve).
         E.g., combinedlogger.add_scalars(...) should trigger self.tboard.add_scalars(...)
         :raises AttributeError: if neither delegate provides the attribute.
         """
         # __getattr__ only runs after normal lookup failed. The delegates are read from __dict__ directly,
         # because self.pylogger would re-enter __getattr__ (endless recursion) if it has not been set yet.
         for delegate_name in ("pylogger", "tboard"):
             obj = self.__dict__.get(delegate_name)
             if obj is not None and hasattr(obj, attr):
                 return getattr(obj, attr)
         raise AttributeError("{} has no attribute '{}' (also not found on pylogger or tboard)".format(
             type(self).__name__, attr))
 
     def set_logfile(self, fold=None, log_file=None):
         """
         Redirect logging to another file. All current handlers are closed and replaced.
         :param fold: if given, becomes the new fold identifier.
         :param log_file: explicit log file path. If None, log_dir/fold_<fold>/exec.log is used.
         """
         if fold is not None:
             self.fold = str(fold)
         if log_file is None:
             self.log_file = os.path.join(self.log_dir, "fold_"+self.fold, 'exec.log')
         else:
             self.log_file = log_file
         for hdlr in self.pylogger.handlers:
             hdlr.close()
         self.pylogger.handlers = []
         self._attach_handlers()
 
     def metrics2tboard(self, metrics, global_step=None, suptitle=None):
         """
         Write the latest entry of every train/val metric and the learning rate to tensorboard.
         Metrics with 'loss' in their name (case-insensitive) go to the 'Losses' group, all others to
         'Monitor_Metrics'. NaN entries are skipped.
         :param metrics: {'train': {name: list of values}, 'val': {name: list of values}, 'lr': dict of scalars},
             as produced in evaluator.py.evaluate_predictions
         :param global_step: tensorboard step. If None, the index of the last entry of the first train metric.
         :param suptitle: title prefix of the tensorboard tags. If None, 'Fold_<fold>'.
         """
         if global_step is None:
             global_step = len(metrics['train'][list(metrics['train'].keys())[0]]) - 1
         if suptitle is not None:
             suptitle = str(suptitle)
         else:
             suptitle = "Fold_" + str(self.fold)
 
         for key in ['train', 'val']:
             loss_series = {}
             mon_met_series = {}
             for tag, val in metrics[key].items():
                 val = val[-1]  # only the most recent entry is plotted.
                 if np.isnan(val):
                     continue
                 if 'loss' in tag.lower():
                     loss_series["{}".format(tag)] = val
                 else:
                     mon_met_series["{}".format(tag)] = val
 
             self.tboard.add_scalars(suptitle + "/Losses/{}".format(key), loss_series, global_step)
             self.tboard.add_scalars(suptitle + "/Monitor_Metrics/{}".format(key), mon_met_series, global_step)
         self.tboard.add_scalars(suptitle + "/Learning_Rate", metrics["lr"], global_step)
         return
 
     def __del__(self):  # otherwise might produce multiple prints e.g. in ipython console
         # the delegates may be missing if __init__ failed half-way, so they are read from __dict__.
         pylogger = self.__dict__.get("pylogger")
         if pylogger is not None:
             for hdlr in pylogger.handlers:
                 hdlr.close()
             pylogger.handlers = []
             del self.pylogger
         tboard = self.__dict__.get("tboard")
         if tboard is not None:
             tboard.flush()
             # close somehow prevents main script from exiting
             # maybe revise this issue in a later pytorch version
             #self.tboard.close()
 
 
 def get_logger(exp_dir, server_env=False):
     """
     Builds the CombinedLogger of an experiment and prints the path of its log file.
     Log files and tensorboard files are placed below exp_dir/logs.
     :param exp_dir: experiment directory.
     :param server_env: True if operating in server environment (e.g., gpu cluster). Passed on to CombinedLogger.
     :return: custom CombinedLogger instance.
     """
     combined = CombinedLogger(
         'medicaldetectiontoolkit',
         os.path.join(exp_dir, "logs"),
         server_env=server_env,
     )
     print("Logging to {}".format(combined.log_file))
     return combined
 
 
 def _copy_file(src, dst):
     """copy file src to dst (overwriting dst). does nothing if both paths point to the same file."""
     import shutil
     try:
         shutil.copy(src, dst)
     except shutil.SameFileError:
         pass
 
 
 def prep_exp(dataset_path, exp_path, server_env, use_stored_settings=True, is_training=True):
     """
     I/O handling, creating of experiment folder structure. Also creates a snapshot of configs/model scripts and copies them to the exp_dir.
     This way the exp_dir contains all info needed to conduct an experiment, independent to changes in actual source code. Thus, training/inference of this experiment can be started at anytime.
     Files are copied with python file operations (no shell), so paths may contain spaces and a failing copy raises instead of being ignored.
     :param dataset_path: path to source code for specific data set. (e.g. medicaldetectiontoolkit/lidc_exp)
     :param exp_path: path to experiment directory.
     :param server_env: boolean flag. pass to configs script for cloud deployment.
     :param use_stored_settings: boolean flag. When starting training: If True, starts training from snapshot in existing experiment directory, else creates experiment directory on the fly using configs/model scripts from source code.
     :param is_training: boolean flag. distinguishes train vs. inference mode.
     :return: cf: configs object with model_path, backbone_path (stored-settings and inference mode only), exp_dir, test_dir, plot_dir, experiment_name, server_env and created_fold_id_pickle set.
     :raises Exception: if use_stored_settings is selected for training but model.py or backbone.py is missing in exp_path.
     """
     stored_model = os.path.join(exp_path, 'model.py')
     stored_backbone = os.path.join(exp_path, 'backbone.py')
 
     if is_training:
         if use_stored_settings:
             cf_file = import_module('cf_file', os.path.join(exp_path, 'configs.py'))
             cf = cf_file.configs(server_env)
             # in this mode, previously saved model and backbone need to be found in exp dir.
             if not os.path.isfile(stored_model) or not os.path.isfile(stored_backbone):
                 raise Exception(
                     "Selected use_stored_settings option but no model and/or backbone source files exist in exp dir.")
             cf.model_path = stored_model
             cf.backbone_path = stored_backbone
         else:
             # this case overwrites settings files in exp dir, i.e., default_configs, configs, backbone, model
             os.makedirs(exp_path, exist_ok=True)
             # run training with source code info and copy snapshot of model to exp_dir for later testing (overwrite scripts if exp_dir already exists.)
             # 'default_configs.py' is resolved relative to the current working directory.
             _copy_file('default_configs.py', os.path.join(exp_path, 'default_configs.py'))
             _copy_file(os.path.join(dataset_path, 'configs.py'), os.path.join(exp_path, 'configs.py'))
             cf_file = import_module('cf_file', os.path.join(dataset_path, 'configs.py'))
             cf = cf_file.configs(server_env)
             _copy_file(cf.model_path, stored_model)
             _copy_file(cf.backbone_path, stored_backbone)
             # a fold split stored by a previous run in this exp dir is discarded.
             fold_ids_file = os.path.join(exp_path, "fold_ids.pickle")
             if os.path.isfile(fold_ids_file):
                 os.remove(fold_ids_file)
 
     else:
         # testing, use model and backbone stored in exp dir.
         cf_file = import_module('cf_file', os.path.join(exp_path, 'configs.py'))
         cf = cf_file.configs(server_env)
         cf.model_path = stored_model
         cf.backbone_path = stored_backbone
 
 
     cf.exp_dir = exp_path
     cf.test_dir = os.path.join(cf.exp_dir, 'test')
     cf.plot_dir = os.path.join(cf.exp_dir, 'plots')
     os.makedirs(cf.test_dir, exist_ok=True)
     os.makedirs(cf.plot_dir, exist_ok=True)
     # NOTE: an exp_path with trailing "/" results in an empty experiment name.
     cf.experiment_name = exp_path.split("/")[-1]
     cf.server_env = server_env
     cf.created_fold_id_pickle = False
 
     return cf
 
 
 
 def import_module(name, path):
     """
     Loads a python source file as a module (dynamic import in python 3).
     The file is executed once; the resulting module is returned and not registered in sys.modules.
     :param name: name given to module instance.
     :param path: path to module.
     :return: module: returned module instance.
     """
     module_spec = importlib.util.spec_from_file_location(name, path)
     loaded_module = importlib.util.module_from_spec(module_spec)
     # run the file's code inside the namespace of the new module.
     module_spec.loader.exec_module(loaded_module)
     return loaded_module
 
 
 
 class ModelSelector:
     '''
     saves a checkpoint after each epoch as 'last_checkpoint' (can be loaded to continue interrupted training).
     saves the top-k (k=cf.save_n_models) ranked epochs as '<epoch>_best_checkpoint' in cf.fold_dir.
     In inference, predictions of multiple epochs can be ensembled to improve performance.
     '''
 
     def __init__(self, cf, logger):
         """
         :param cf: configs object. Read attributes: save_n_models, model_selection_criteria, min_save_thresh, fold_dir.
         :param logger: logger providing an info() method.
         """
         self.cf = cf
         self.saved_epochs = [-1] * cf.save_n_models
         self.logger = logger
 
     def run_model_selection(self, net, optimizer, monitor_metrics, epoch):
         """
         Ranks all epochs so far by the mean over the selection criteria, stores the current epoch as best
         checkpoint if it is among the top-k, removes best checkpoints that dropped out of the top-k and
         always stores the current epoch as last checkpoint.
         :param net: model providing state_dict().
         :param optimizer: optimizer providing state_dict().
         :param monitor_metrics: dict with monitor_metrics['val'][criterion] being a list with one entry per
             epoch (entry 0 is a placeholder, epochs start at 1). None/NaN entries count as 0.
         :param epoch: current epoch (starting at 1).
         """
         # local import: shutil is not among the imports at the top of this file.
         import shutil
 
         top_k = self.cf.save_n_models
 
         # take the mean over all selection criteria in each epoch
         non_nan_scores = np.mean(np.array([[0 if (ii is None or np.isnan(ii)) else ii for ii in monitor_metrics['val'][sc]] for sc in self.cf.model_selection_criteria]), 0)
         epochs_scores = [ii for ii in non_nan_scores[1:]]
         # ranking of epochs according to model_selection_criterion
         epoch_ranking = np.argsort(epochs_scores, kind="stable")[::-1] + 1 #epochs start at 1
         # if set in configs, epochs < min_save_thresh are discarded from saving process.
         epoch_ranking = epoch_ranking[epoch_ranking >= self.cf.min_save_thresh]
 
         # check if current epoch is among the top-k epochs.
         if epoch in epoch_ranking[:top_k]:
 
             save_dir = os.path.join(self.cf.fold_dir, '{}_best_checkpoint'.format(epoch))
             os.makedirs(save_dir, exist_ok=True)
 
             torch.save(net.state_dict(), os.path.join(save_dir, 'params.pth'))
             with open(os.path.join(save_dir, 'monitor_metrics.pickle'), 'wb') as handle:
                 pickle.dump(monitor_metrics, handle)
             # save epoch_ranking to keep info for inference.
             np.save(os.path.join(self.cf.fold_dir, 'epoch_ranking'), epoch_ranking[:top_k])
             np.save(os.path.join(save_dir, 'epoch_ranking'), epoch_ranking[:top_k])
 
             self.logger.info(
                 "saving current epoch {} at rank {}".format(epoch, np.argwhere(epoch_ranking == epoch)))
             # delete params of the epoch that just fell out of the top-k epochs.
             for se in [int(ii.split('_')[0]) for ii in os.listdir(self.cf.fold_dir) if 'best_checkpoint' in ii]:
                 if se in epoch_ranking[top_k:]:
                     # python-side removal instead of a shell 'rm -rf', which fails on paths containing spaces.
                     shutil.rmtree(os.path.join(self.cf.fold_dir, '{}_best_checkpoint'.format(se)),
                                   ignore_errors=True)
                     self.logger.info('deleting epoch {} at rank {}'.format(se, np.argwhere(epoch_ranking == se)))
 
         state = {
             'epoch': epoch,
             'state_dict': net.state_dict(),
             'optimizer': optimizer.state_dict(),
         }
 
         # save checkpoint of current epoch.
         save_dir = os.path.join(self.cf.fold_dir, 'last_checkpoint')
         os.makedirs(save_dir, exist_ok=True)
         torch.save(state, os.path.join(save_dir, 'params.pth'))
         np.save(os.path.join(save_dir, 'epoch_ranking'), epoch_ranking[:top_k])
         with open(os.path.join(save_dir, 'monitor_metrics.pickle'), 'wb') as handle:
             pickle.dump(monitor_metrics, handle)
 
 
 
 def load_checkpoint(checkpoint_path, net, optimizer):
     """
     Restores model and optimizer state from a checkpoint directory.
     Reads 'params.pth' (dict with keys 'state_dict', 'optimizer' and 'epoch') and 'monitor_metrics.pickle'
     from checkpoint_path.
     :param checkpoint_path: directory of the checkpoint.
     :param net: model, updated in place via load_state_dict.
     :param optimizer: optimizer, updated in place via load_state_dict.
     :return: starting_epoch: epoch stored in the checkpoint + 1.
     :return: monitor_metrics: unpickled content of 'monitor_metrics.pickle'.
     """
     params_file = os.path.join(checkpoint_path, 'params.pth')
     metrics_file = os.path.join(checkpoint_path, 'monitor_metrics.pickle')
 
     stored = torch.load(params_file)
     net.load_state_dict(stored['state_dict'])
     optimizer.load_state_dict(stored['optimizer'])
 
     with open(metrics_file, 'rb') as handle:
         monitor_metrics = pickle.load(handle)
 
     # training continues with the epoch after the stored one.
     return stored['epoch'] + 1, monitor_metrics
 
 
 
 def prepare_monitoring(cf):
     """
     Creates the dictionaries, where train/val metrics are stored.
     For every monitored class an '<class>_ap' entry is created, for the class 'patient' additionally a
     'patient_auc' entry. Monitored classes are the values of cf.class_dict (if 'rois' is in
     cf.report_score_level) and 'patient' (if 'patient' is in cf.report_score_level).
     Every entry starts as the list [nan]: this first element accounts for epochs starting at 1.
     :param cf: configs object. Read attributes: report_score_level, class_dict.
     :return: metrics: {'train': OrderedDict, 'val': OrderedDict}
     """
     metrics = {'train': OrderedDict(), 'val': OrderedDict()}
 
     monitored = []
     if 'rois' in cf.report_score_level:
         monitored += list(cf.class_dict.values())
     if 'patient' in cf.report_score_level:
         monitored.append('patient')
 
     for cl in monitored:
         suffixes = ('_ap', '_auc') if cl == 'patient' else ('_ap',)
         for suffix in suffixes:
             for split in ('train', 'val'):
                 # every entry gets its own list, they are appended to independently.
                 metrics[split][cl + suffix] = [np.nan]
 
     return metrics
 
 
 
 def create_csv_output(results_list, cf, logger):
     """
     Write out test set predictions to .csv file. output format is one line per prediction:
     PatientID | PredictionID | [y1 x1 y2 x2 (z1) (z2)] | score | pred_classID
     Only detections with score >= cf.min_det_thresh are written, ground truth boxes are skipped.
     PredictionID is the index of the box within the patient's box list (ground truth boxes included).
     The file is written to cf.exp_dir/results_<fold>.csv, where <fold> is cf.fold, or 'hold_out' if cf has
     no fold attribute.
     Note, that prediction coordinates correspond to images as loaded for training/testing and need to be adapted when
     plotted over raw data (before preprocessing/resampling).
     :param results_list: [[patient_results, patient_id], [patient_results, patient_id], ...]
         only the first batch element of patient_results (patient_results[0]) is read.
     :param cf: configs object. Read attributes: exp_dir, min_det_thresh, fold (optional).
     :param logger: logger providing an info() method.
     """
     fold = getattr(cf, 'fold', 'hold_out')
     out_file = os.path.join(cf.exp_dir, 'results_{}.csv'.format(fold))
     logger.info('creating csv output file at {}'.format(out_file))
 
     # rows are collected first and converted once: growing a dataframe row by row is quadratic.
     rows = []
     for r in results_list:
 
         pid = r[1]
 
         # resampling info from preprocessing (cf.exp_dir/test_resampling_info/<pid>) could be loaded here
         # to rescale coordinates to raw data. currently, coordinates are written out unchanged.
 
         for bix, box in enumerate(r[0][0]):
             if box["box_type"] == "gt":
                 continue
             assert box['box_type'] == 'det', box['box_type']
             score = box['box_score']
             if score < cf.min_det_thresh:
                 continue
             coords = box['box_coords']
             out_coords = [coords[0], coords[1], coords[2], coords[3]]
             if len(coords) > 4:
                 # 3D box: append z-coordinates.
                 out_coords += [coords[4], coords[5]]
             rows.append([pid, bix, out_coords, score, box['box_pred_class_id']])
 
     predictions_df = pd.DataFrame(rows, columns=['patientID', 'predictionID', 'coords', 'score', 'pred_classID'])
     predictions_df.to_csv(out_file, index=False)
 
 
 
 class _AnsiColorizer(object):
     """
     A colorizer is an object that loosely wraps around a stream, allowing
     callers to write text to the stream in a particular color.
 
     Colorizer classes must implement C{supported()} and C{write(text, color)}.
     """
     # color label -> ANSI SGR foreground color code.
     _colors = dict(black=30, red=31, green=32, yellow=33,
                    blue=34, magenta=35, cyan=36, white=37, default=39)
 
     def __init__(self, stream):
         """
         @param stream: object providing write(text), e.g. sys.stdout.
         """
         self.stream = stream
 
     @classmethod
     def supported(cls, stream=sys.stdout):
         """
         A class method that returns True if the current platform supports
         coloring terminal output using this method. Returns False otherwise,
         including the cases where curses is not available or fails to query the terminal.
         """
         if not stream.isatty():
             return False  # auto color only on TTYs
         try:
             import curses
         except ImportError:
             return False
         try:
             try:
                 return curses.tigetnum("colors") > 2
             except curses.error:
                 # terminal info was not initialized yet: set it up and ask again.
                 curses.setupterm()
                 return curses.tigetnum("colors") > 2
         except Exception:
             # guess false in case of error
             return False
 
     def write(self, text, color):
         """
         Write the given text to the stream in the given color.
 
         @param text: Text to be written to the stream.
 
         @param color: A string label for a color. e.g. 'red', 'white'.
             Unknown labels raise KeyError.
         """
         color = self._colors[color]
         self.stream.write('\x1b[%sm%s\x1b[0m' % (color, text))
 
 
 
 class ColorHandler(logging.StreamHandler):
     """
     Stream handler writing log records in a color chosen by their level:
     DEBUG green, INFO default, WARNING and ERROR red, every other level blue.
     """
 
     def __init__(self, stream=sys.stdout):
         """
         :param stream: output stream, wrapped into an _AnsiColorizer.
         """
         super(ColorHandler, self).__init__(_AnsiColorizer(stream))
 
     def emit(self, record):
         """
         Write the formatted record followed by a newline to the colorizer.
         :param record: logging.LogRecord to be written.
         """
         msg_colors = {
             logging.DEBUG: "green",
             logging.INFO: "default",
             logging.WARNING: "red",
             logging.ERROR: "red"
         }
         color = msg_colors.get(record.levelno, "blue")
         # self.format merges %-style args into the message and also handles non-string messages,
         # whereas record.msg is the raw, unformatted object passed to the logging call.
         self.stream.write(self.format(record) + "\n", color)