diff --git a/evaluator.py b/evaluator.py index 682d686..89ca084 100644 --- a/evaluator.py +++ b/evaluator.py @@ -1,971 +1,967 @@ #!/usr/bin/env python # Copyright 2019 Division of Medical Image Computing, German Cancer Research Center (DKFZ). # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== import os from multiprocessing import Pool import pickle import time import numpy as np import pandas as pd from sklearn.metrics import roc_auc_score, average_precision_score from sklearn.metrics import roc_curve, precision_recall_curve from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score import torch import utils.model_utils as mutils import plotting as plg import warnings def get_roi_ap_from_df(inputs): ''' :param df: data frame. :param det_thresh: min_threshold for filtering out low confidence predictions. :param per_patient_ap: boolean flag. evaluate average precision per patient id and average over per-pid results, instead of computing one ap over whole data set. 
:return: average_precision (float) ''' df, det_thresh, per_patient_ap = inputs if per_patient_ap: pids_list = df.pid.unique() aps = [] for match_iou in df.match_iou.unique(): iou_df = df[df.match_iou == match_iou] for pid in pids_list: pid_df = iou_df[iou_df.pid == pid] all_p = len(pid_df[pid_df.class_label == 1]) pid_df = pid_df[(pid_df.det_type == 'det_fp') | (pid_df.det_type == 'det_tp')].sort_values('pred_score', ascending=False) pid_df = pid_df[pid_df.pred_score > det_thresh] if (len(pid_df) ==0 and all_p == 0): pass elif (len(pid_df) > 0 and all_p == 0): aps.append(0) else: aps.append(compute_roi_ap(pid_df, all_p)) return np.mean(aps) else: aps = [] for match_iou in df.match_iou.unique(): iou_df = df[df.match_iou == match_iou] # it's important to not apply the threshold before counting all_p in order to not lose the fn! all_p = len(iou_df[(iou_df.det_type == 'det_tp') | (iou_df.det_type == 'det_fn')]) # sorting out all entries that are not fp or tp or have confidence(=pred_score) <= detection_threshold iou_df = iou_df[(iou_df.det_type == 'det_fp') | (iou_df.det_type == 'det_tp')].sort_values('pred_score', ascending=False) iou_df = iou_df[iou_df.pred_score > det_thresh] if all_p>0: aps.append(compute_roi_ap(iou_df, all_p)) return np.mean(aps) def compute_roi_ap(df, all_p): """ adapted from: https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py :param df: dataframe containing class labels of predictions sorted in descending manner by their prediction score. :param all_p: number of all ground truth objects. (for denominator of recall.) :return: """ tp = df.class_label.values fp = (tp == 0) * 1 #recall thresholds, where precision will be measured R = np.linspace(0., 1., np.round((1. - 0.) / .01).astype(int) + 1, endpoint=True) tp_sum = np.cumsum(tp) fp_sum = np.cumsum(fp) n_dets = len(tp) rc = tp_sum / all_p pr = tp_sum / (fp_sum + tp_sum) # initialize precision array over recall steps (q=queries). q = [0. 
for _ in range(len(R))] # numpy is slow without cython optimization for accessing elements # python array gets significant speed improvement pr = pr.tolist() for i in range(n_dets - 1, 0, -1): if pr[i] > pr[i - 1]: pr[i - 1] = pr[i] #--> pr[i]<=pr[i-1] for all i since we want to consider the maximum #precision value for a queried interval # discretize empiric recall steps with given bins. assert np.all(rc[:-1]<=rc[1:]), "recall not sorted ascendingly" inds = np.searchsorted(rc, R, side='left') try: for rc_ix, pr_ix in enumerate(inds): q[rc_ix] = pr[pr_ix] except IndexError: #now q is filled with pr values up to first non-available index pass return np.mean(q) def roi_avp(inputs): ''' :param df: data frame. :param det_thresh: min_threshold for filtering out low confidence predictions. :param per_patient_ap: boolean flag. evaluate average precision per patient id and average over per-pid results, instead of computing one ap over whole data set. :return: average_precision (float) ''' df, det_thresh, per_patient_ap = inputs if per_patient_ap: pids_list = df.pid.unique() aps = [] for match_iou in df.match_iou.unique(): iou_df = df[df.match_iou == match_iou] for pid in pids_list: pid_df = iou_df[iou_df.pid == pid] all_p = len(pid_df[pid_df.class_label == 1]) mask = ((pid_df.rg_bins == pid_df.rg_bin_target) & (pid_df.det_type == 'det_tp')) | (pid_df.det_type == 'det_fp') pid_df = pid_df[mask].sort_values('pred_score', ascending=False) pid_df = pid_df[pid_df.pred_score > det_thresh] if (len(pid_df) ==0 and all_p == 0): pass elif (len(pid_df) > 0 and all_p == 0): aps.append(0) else: aps.append(compute_roi_ap(pid_df, all_p)) return np.mean(aps) else: aps = [] for match_iou in df.match_iou.unique(): iou_df = df[df.match_iou == match_iou] #it's important to not apply the threshold before counting all_positives! 
all_p = len(iou_df[(iou_df.det_type == 'det_tp') | (iou_df.det_type == 'det_fn')]) # filtering out tps which don't match rg_bin target at this point is same as reclassifying them as fn. # also sorting out all entries that are not fp or have confidence(=pred_score) <= detection_threshold mask = ((iou_df.rg_bins == iou_df.rg_bin_target) & (iou_df.det_type == 'det_tp')) | (iou_df.det_type == 'det_fp') iou_df = iou_df[mask].sort_values('pred_score', ascending=False) iou_df = iou_df[iou_df.pred_score > det_thresh] if all_p>0: aps.append(compute_roi_ap(iou_df, all_p)) return np.mean(aps) def compute_prc(df): """compute precision-recall curve with maximum precision per recall interval. :param df: :param all_p: # of all positive samples in data. :return: array: [precisions, recall query values] """ assert (df.class_label==1).any(), "cannot compute prc when no positives in data." all_p = len(df[(df.det_type == 'det_tp') | (df.det_type == 'det_fn')]) df = df[(df.det_type=="det_tp") | (df.det_type=="det_fp")] df = df.sort_values("pred_score", ascending=False) # recall thresholds, where precision will be measured scores = df.pred_score.values labels = df.class_label.values n_dets = len(scores) pr = np.zeros((n_dets,)) rc = pr.copy() for rank in range(n_dets): tp = np.count_nonzero(labels[:rank+1]==1) fp = np.count_nonzero(labels[:rank+1]==0) pr[rank] = tp/(tp+fp) rc[rank] = tp/all_p #after obj detection convention/ coco-dataset template: take maximum pr within intervals: # --> pr[i]<=pr[i-1] for all i since we want to consider the maximum # precision value for a queried interval for i in range(n_dets - 1, 0, -1): if pr[i] > pr[i - 1]: pr[i - 1] = pr[i] R = np.linspace(0., 1., np.round((1. - 0.) 
/ .01).astype(int) + 1, endpoint=True)#precision queried at R points inds = np.searchsorted(rc, R, side='left') queries = np.zeros((len(R),)) try: for q_ix, rank in enumerate(inds): queries[q_ix] = pr[rank] except IndexError: pass return np.array((queries, R)) def RMSE(y_true, y_pred, weights=None): if len(y_true)>0: return np.sqrt(mean_squared_error(y_true, y_pred, sample_weight=weights)) else: return np.nan def MAE_w_std(y_true, y_pred, weights=None): if len(y_true)>0: y_true, y_pred = np.array(y_true), np.array(y_pred) deltas = np.abs(y_true-y_pred) mae = np.average(deltas, weights=weights, axis=0).item() skmae = mean_absolute_error(y_true, y_pred, sample_weight=weights) assert np.allclose(mae, skmae, atol=1e-6), "mae {}, sklearn mae {}".format(mae, skmae) std = np.std(weights*deltas) return mae, std else: return np.nan, np.nan def MAE(y_true, y_pred, weights=None): if len(y_true)>0: return mean_absolute_error(y_true, y_pred, sample_weight=weights) else: return np.nan def accuracy(y_true, y_pred, weights=None): if len(y_true)>0: return accuracy_score(y_true, y_pred, sample_weight=weights) else: return np.nan # noinspection PyCallingNonCallable class Evaluator(): """ Evaluates given results dicts. Can return results as updated monitor_metrics. Can save test data frames to file. """ def __init__(self, cf, logger, mode='test'): """ :param mode: either 'train', 'val_sampling', 'val_patient' or 'test'. handles prediction lists of different forms. 
""" self.cf = cf self.logger = logger self.mode = mode self.regress_flag = any(['regression' in task for task in self.cf.prediction_tasks]) self.plot_dir = self.cf.plot_dir if not self.mode == "test" else self.cf.test_dir if self.cf.plot_prediction_histograms: self.hist_dir = os.path.join(self.plot_dir, 'histograms') os.makedirs(self.hist_dir, exist_ok=True) if self.cf.plot_stat_curves: self.curves_dir = os.path.join(self.plot_dir, 'stat_curves') os.makedirs(self.curves_dir, exist_ok=True) def eval_losses(self, batch_res_dicts): if hasattr(self.cf, "losses_to_monitor"): loss_names = self.cf.losses_to_monitor else: loss_names = {name for b_res_dict in batch_res_dicts for name in b_res_dict if 'loss' in name} self.epoch_losses = {l_name: torch.tensor([b_res_dict[l_name] for b_res_dict in batch_res_dicts if l_name in b_res_dict.keys()]).mean().item() for l_name in loss_names} def eval_segmentations(self, batch_res_dicts, pid_list): batch_dices = [b_res_dict['batch_dices'] for b_res_dict in batch_res_dicts if 'batch_dices' in b_res_dict.keys()] # shape (n_batches, n_seg_classes) if len(batch_dices) > 0: batch_dices = np.array(batch_dices) # dims n_batches x 1 in sampling / n_test_epochs x n_classes assert batch_dices.shape[1] == self.cf.num_seg_classes, "bdices shp {}, n seg cl {}, pid lst len {}".format( batch_dices.shape, self.cf.num_seg_classes, len(pid_list)) self.seg_df = pd.DataFrame() for seg_id in range(batch_dices.shape[1]): self.seg_df[self.cf.seg_id2label[seg_id].name + "_dice"] = batch_dices[:, seg_id] # one row== one batch, one column== one class # self.seg_df[self.cf.seg_id2label[seg_id].name+"_dice"] = np.concatenate(batch_dices[:,:,seg_id]) self.seg_df['fold'] = self.cf.fold if self.mode == "val_patient" or self.mode == "test": # need to make it more conform between sampling and patient-mode self.seg_df["pid"] = [pid for pix, pid in enumerate(pid_list)] # for b_inst in batch_inst_boxes[pix]] else: self.seg_df["pid"] = np.nan def eval_boxes(self, 
batch_res_dicts, pid_list, obj_cl_dict, obj_cl_identifiers={"gt":'class_targets', "pred":'box_pred_class_id'}): """ :param batch_res_dicts: :param pid_list: [pid_0, pid_1, ...] :return: """ if self.mode == 'train' or self.mode == 'val_sampling': # one pid per batch element # batch_size > 1, with varying patients across batch: # [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...] # -> [results_0, results_1, ..] batch_inst_boxes = [b_res_dict['boxes'] for b_res_dict in batch_res_dicts] # len: nr of batches in epoch batch_inst_boxes = [[b_inst_boxes] for whole_batch_boxes in batch_inst_boxes for b_inst_boxes in whole_batch_boxes] # len: batch instances of whole epoch assert np.all(len(b_boxes_list) == self.cf.batch_size for b_boxes_list in batch_inst_boxes) elif self.mode == "val_patient" or self.mode == "test": # patient processing, one element per batch = one patient. # [[results_0, pid_0], [results_1, pid_1], ...] -> [results_0, results_1, ..] # in patientbatchiterator there is only one pid per batch batch_inst_boxes = [b_res_dict['boxes'] for b_res_dict in batch_res_dicts] # in patient mode not actually per batch instance, but per whole batch! if hasattr(self.cf, "eval_test_separately") and self.cf.eval_test_separately: """ you could write your own routines to add GTs to raw predictions for evaluation. implemented standard is: cf.eval_test_separately = False or not set --> GTs are saved at same time and in same file as raw prediction results. 
""" raise NotImplementedError assert len(batch_inst_boxes) == len(pid_list) df_list_preds = [] df_list_labels = [] df_list_class_preds = [] df_list_pids = [] df_list_type = [] df_list_match_iou = [] df_list_n_missing = [] df_list_regressions = [] df_list_rg_targets = [] df_list_rg_bins = [] df_list_rg_bin_targets = [] df_list_rg_uncs = [] for match_iou in self.cf.ap_match_ious: self.logger.info('evaluating with ap_match_iou: {}'.format(match_iou)) for cl in list(obj_cl_dict.keys()): for pix, pid in enumerate(pid_list): len_df_list_before_patient = len(df_list_pids) # input of each batch element is a list of boxes, where each box is a dictionary. for b_inst_ix, b_boxes_list in enumerate(batch_inst_boxes[pix]): b_tar_boxes = [] b_cand_boxes, b_cand_scores, b_cand_n_missing = [], [], [] if self.regress_flag: b_tar_regs, b_tar_rg_bins = [], [] b_cand_regs, b_cand_rg_bins, b_cand_rg_uncs = [], [], [] for box in b_boxes_list: # each box is either gt or detection or proposal/anchor # we need all gts in the same order & all dets in same order if box['box_type'] == 'gt' and box[obj_cl_identifiers["gt"]] == cl: b_tar_boxes.append(box["box_coords"]) if self.regress_flag: b_tar_regs.append(np.array(box['regression_targets'], dtype='float32')) b_tar_rg_bins.append(box['rg_bin_targets']) if box['box_type'] == 'det' and box[obj_cl_identifiers["pred"]] == cl: b_cand_boxes.append(box["box_coords"]) b_cand_scores.append(box["box_score"]) b_cand_n_missing.append(box["cluster_n_missing"] if 'cluster_n_missing' in box.keys() else np.nan) if self.regress_flag: b_cand_regs.append(box["regression"]) b_cand_rg_bins.append(box["rg_bin"]) b_cand_rg_uncs.append(box["rg_uncertainty"] if 'rg_uncertainty' in box.keys() else np.nan) b_tar_boxes = np.array(b_tar_boxes) b_cand_boxes, b_cand_scores, b_cand_n_missing = np.array(b_cand_boxes), np.array(b_cand_scores), np.array(b_cand_n_missing) if self.regress_flag: b_tar_regs, b_tar_rg_bins = np.array(b_tar_regs), np.array(b_tar_rg_bins) b_cand_regs, 
b_cand_rg_bins, b_cand_rg_uncs = np.array(b_cand_regs), np.array(b_cand_rg_bins), np.array(b_cand_rg_uncs) # check if predictions and ground truth boxes exist and match them according to match_iou. if not 0 in b_cand_boxes.shape and not 0 in b_tar_boxes.shape: assert np.all(np.round(b_cand_scores,6) <= 1.), "there is a box score>1: {}".format(b_cand_scores[~(b_cand_scores<=1.)]) #coords_check = np.array([len(coords)==self.cf.dim*2 for coords in b_cand_boxes]) #assert np.all(coords_check), "cand box with wrong bcoords dim: {}, mode: {}".format(b_cand_boxes[~coords_check], self.mode) expected_dim = len(b_cand_boxes[0]) assert np.all([len(coords) == expected_dim for coords in b_tar_boxes]), \ "gt/cand box coords mismatch, expected dim: {}.".format(expected_dim) # overlaps: shape len(cand_boxes) x len(tar_boxes) overlaps = mutils.compute_overlaps(b_cand_boxes, b_tar_boxes) # match_cand_ixs: shape (nr_of_matches,) # theses indices are the indices of b_cand_boxes match_cand_ixs = np.argwhere(np.max(overlaps, axis=1) > match_iou)[:, 0] non_match_cand_ixs = np.argwhere(np.max(overlaps, 1) <= match_iou)[:, 0] # the corresponding gt assigned to the pred boxes by highest iou overlap, # i.e., match_gt_ixs holds index into b_tar_boxes for each entry in match_cand_ixs, # i.e., gt_ixs and cand_ixs are paired via their position in their list # (cand_ixs[j] corresponds to gt_ixs[j]) match_gt_ixs = np.argmax(overlaps[match_cand_ixs, :], axis=1) if \ not 0 in match_cand_ixs.shape else np.array([]) assert len(match_gt_ixs)==len(match_cand_ixs) #match_gt_ixs: shape (nr_of_matches,) or 0 non_match_gt_ixs = np.array( [ii for ii in np.arange(b_tar_boxes.shape[0]) if ii not in match_gt_ixs]) unique, counts = np.unique(match_gt_ixs, return_counts=True) # check for double assignments, i.e. two predictions having been assigned to the same gt. # according to the COCO-metrics, only one prediction counts as true positive, the rest counts as # false positive. 
This case is supposed to be avoided by the model itself by, # e.g. using a low enough NMS threshold. if np.any(counts > 1): double_match_gt_ixs = unique[np.argwhere(counts > 1)[:, 0]] keep_max = [] double_match_list = [] for dg in double_match_gt_ixs: double_match_cand_ixs = match_cand_ixs[np.argwhere(match_gt_ixs == dg)] keep_max.append(double_match_cand_ixs[np.argmax(b_cand_scores[double_match_cand_ixs])]) double_match_list += [ii for ii in double_match_cand_ixs] fp_ixs = np.array([ii for ii in match_cand_ixs if (ii in double_match_list and ii not in keep_max)]) # count as fp: boxes that match gt above match_iou threshold but have not highest class confidence score match_gt_ixs = np.array([gt_ix for ii, gt_ix in enumerate(match_gt_ixs) if match_cand_ixs[ii] not in fp_ixs]) match_cand_ixs = np.array([cand_ix for cand_ix in match_cand_ixs if cand_ix not in fp_ixs]) assert len(match_gt_ixs) == len(match_cand_ixs) df_list_preds += [ii for ii in b_cand_scores[fp_ixs]] df_list_labels += [0] * fp_ixs.shape[0] # means label==gt==0==bg for all these fp_ixs df_list_class_preds += [cl] * fp_ixs.shape[0] df_list_n_missing += [n for n in b_cand_n_missing[fp_ixs]] if self.regress_flag: df_list_regressions += [r for r in b_cand_regs[fp_ixs]] df_list_rg_bins += [r for r in b_cand_rg_bins[fp_ixs]] df_list_rg_uncs += [r for r in b_cand_rg_uncs[fp_ixs]] df_list_rg_targets += [[0.]*self.cf.regression_n_features] * fp_ixs.shape[0] df_list_rg_bin_targets += [0.] 
* fp_ixs.shape[0] df_list_pids += [pid] * fp_ixs.shape[0] df_list_type += ['det_fp'] * fp_ixs.shape[0] # matched/tp: if not 0 in match_cand_ixs.shape: df_list_preds += list(b_cand_scores[match_cand_ixs]) df_list_labels += [1] * match_cand_ixs.shape[0] df_list_class_preds += [cl] * match_cand_ixs.shape[0] df_list_n_missing += list(b_cand_n_missing[match_cand_ixs]) if self.regress_flag: df_list_regressions += list(b_cand_regs[match_cand_ixs]) df_list_rg_bins += list(b_cand_rg_bins[match_cand_ixs]) df_list_rg_uncs += list(b_cand_rg_uncs[match_cand_ixs]) assert len(match_cand_ixs)==len(match_gt_ixs) df_list_rg_targets += list(b_tar_regs[match_gt_ixs]) df_list_rg_bin_targets += list(b_tar_rg_bins[match_gt_ixs]) df_list_pids += [pid] * match_cand_ixs.shape[0] df_list_type += ['det_tp'] * match_cand_ixs.shape[0] # rest fp: if not 0 in non_match_cand_ixs.shape: df_list_preds += list(b_cand_scores[non_match_cand_ixs]) df_list_labels += [0] * non_match_cand_ixs.shape[0] df_list_class_preds += [cl] * non_match_cand_ixs.shape[0] df_list_n_missing += list(b_cand_n_missing[non_match_cand_ixs]) if self.regress_flag: df_list_regressions += list(b_cand_regs[non_match_cand_ixs]) df_list_rg_bins += list(b_cand_rg_bins[non_match_cand_ixs]) df_list_rg_uncs += list(b_cand_rg_uncs[non_match_cand_ixs]) df_list_rg_targets += [[0.]*self.cf.regression_n_features] * non_match_cand_ixs.shape[0] df_list_rg_bin_targets += [0.] * non_match_cand_ixs.shape[0] df_list_pids += [pid] * non_match_cand_ixs.shape[0] df_list_type += ['det_fp'] * non_match_cand_ixs.shape[0] # fn: if not 0 in non_match_gt_ixs.shape: df_list_preds += [0] * non_match_gt_ixs.shape[0] df_list_labels += [1] * non_match_gt_ixs.shape[0] df_list_class_preds += [cl] * non_match_gt_ixs.shape[0] df_list_n_missing += [np.nan] * non_match_gt_ixs.shape[0] if self.regress_flag: df_list_regressions += [[0.]*self.cf.regression_n_features] * non_match_gt_ixs.shape[0] df_list_rg_bins += [0.] 
* non_match_gt_ixs.shape[0] df_list_rg_uncs += [np.nan] * non_match_gt_ixs.shape[0] df_list_rg_targets += list(b_tar_regs[non_match_gt_ixs]) df_list_rg_bin_targets += list(b_tar_rg_bins[non_match_gt_ixs]) df_list_pids += [pid] * non_match_gt_ixs.shape[0] df_list_type += ['det_fn'] * non_match_gt_ixs.shape[0] # only fp: if not 0 in b_cand_boxes.shape and 0 in b_tar_boxes.shape: # means there is no gt in all samples! any preds have to be fp. df_list_preds += list(b_cand_scores) df_list_labels += [0] * b_cand_boxes.shape[0] df_list_class_preds += [cl] * b_cand_boxes.shape[0] df_list_n_missing += list(b_cand_n_missing) if self.regress_flag: df_list_regressions += list(b_cand_regs) df_list_rg_bins += list(b_cand_rg_bins) df_list_rg_uncs += list(b_cand_rg_uncs) df_list_rg_targets += [[0.]*self.cf.regression_n_features] * b_cand_boxes.shape[0] df_list_rg_bin_targets += [0.] * b_cand_boxes.shape[0] df_list_pids += [pid] * b_cand_boxes.shape[0] df_list_type += ['det_fp'] * b_cand_boxes.shape[0] # only fn: if 0 in b_cand_boxes.shape and not 0 in b_tar_boxes.shape: df_list_preds += [0] * b_tar_boxes.shape[0] df_list_labels += [1] * b_tar_boxes.shape[0] df_list_class_preds += [cl] * b_tar_boxes.shape[0] df_list_n_missing += [np.nan] * b_tar_boxes.shape[0] if self.regress_flag: df_list_regressions += [[0.]*self.cf.regression_n_features] * b_tar_boxes.shape[0] df_list_rg_bins += [0.] * b_tar_boxes.shape[0] df_list_rg_uncs += [np.nan] * b_tar_boxes.shape[0] df_list_rg_targets += list(b_tar_regs) df_list_rg_bin_targets += list(b_tar_rg_bins) df_list_pids += [pid] * b_tar_boxes.shape[0] df_list_type += ['det_fn'] * b_tar_boxes.shape[0] # empty patient with 0 detections needs empty patient score, in order to not disappear from stats. # filtered out for roi-level evaluation later. During training (and val_sampling), # tn are assigned per sample independently of associated patients. 
# i.e., patient_tn is also meant as sample_tn if a list of samples is evaluated instead of whole patient if len(df_list_pids) == len_df_list_before_patient: df_list_preds += [0] df_list_labels += [0] df_list_class_preds += [cl] df_list_n_missing += [np.nan] if self.regress_flag: df_list_regressions += [[0.]*self.cf.regression_n_features] df_list_rg_bins += [0.] df_list_rg_uncs += [np.nan] df_list_rg_targets += [[0.]*self.cf.regression_n_features] df_list_rg_bin_targets += [0.] df_list_pids += [pid] df_list_type += ['patient_tn'] # true negative: no ground truth boxes, no detections. df_list_match_iou += [match_iou] * (len(df_list_preds) - len(df_list_match_iou)) self.test_df = pd.DataFrame() self.test_df['pred_score'] = df_list_preds self.test_df['class_label'] = df_list_labels # class labels are gt, 0,1, only indicate neg/pos (or bg/fg) remapped from all classes self.test_df['pred_class'] = df_list_class_preds # can be diff than 0,1 self.test_df['pid'] = df_list_pids self.test_df['det_type'] = df_list_type self.test_df['fold'] = self.cf.fold self.test_df['match_iou'] = df_list_match_iou self.test_df['cluster_n_missing'] = df_list_n_missing if self.regress_flag: self.test_df['regressions'] = df_list_regressions self.test_df['rg_targets'] = df_list_rg_targets self.test_df['rg_uncertainties'] = df_list_rg_uncs self.test_df['rg_bins'] = df_list_rg_bins - # super weird error: pandas does not properly add an attribute if column is named "rg_bin_targets" ... ?!? self.test_df['rg_bin_target'] = df_list_rg_bin_targets assert hasattr(self.test_df, "rg_bin_target") - #fn_df = self.test_df[self.test_df["det_type"] == "det_fn"] - - pass def evaluate_predictions(self, results_list, monitor_metrics=None): """ Performs the matching of predicted boxes and ground truth boxes. Loops over list of matching IoUs and foreground classes. 
Resulting info of each prediction is stored as one line in an internal dataframe, with the keys: det_type: 'tp' (true positive), 'fp' (false positive), 'fn' (false negative), 'tn' (true negative) pred_class: foreground class which the object predicts. pid: corresponding patient-id. pred_score: confidence score [0, 1] fold: corresponding fold of CV. match_iou: utilized IoU for matching. :param results_list: list of model predictions. Either from train/val_sampling (patch processing) for monitoring with form: [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...] Or from val_patient/testing (patient processing), with form: [[results_0, pid_0], [results_1, pid_1], ...]) :param monitor_metrics (optional): dict of dicts with all metrics of previous epochs. :return monitor_metrics: if provided (during training), return monitor_metrics now including results of current epoch. """ # gets results_list = [[batch_instances_box_lists], [batch_instances_pids]]*n_batches # we want to evaluate one batch_instance (= 2D or 3D image) at a time. self.logger.info('evaluating in mode {}'.format(self.mode)) batch_res_dicts = [batch[0] for batch in results_list] # len: nr of batches in epoch if self.mode == 'train' or self.mode=='val_sampling': # one pid per batch element # [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...] # -> [pid_0, pid_1, ...] # additional list wrapping to make conform with below per-patient batches, where one pid is linked to more than one batch instance pid_list = [batch_instance_pid for batch in results_list for batch_instance_pid in batch[1]] elif self.mode == "val_patient" or self.mode=="test": # [[results_0, pid_0], [results_1, pid_1], ...] -> [pid_0, pid_1, ...] 
# in patientbatchiterator there is only one pid per batch pid_list = [np.unique(batch[1]) for batch in results_list] assert np.all([len(pid)==1 for pid in pid_list]), "pid list in patient-eval mode, should only contain a single scalar per patient: {}".format(pid_list) pid_list = [pid[0] for pid in pid_list] else: raise Exception("undefined run mode encountered") self.eval_losses(batch_res_dicts) self.eval_segmentations(batch_res_dicts, pid_list) self.eval_boxes(batch_res_dicts, pid_list, self.cf.class_dict) if monitor_metrics is not None: # return all_stats, updated monitor_metrics return self.return_metrics(self.test_df, self.cf.class_dict, monitor_metrics) def return_metrics(self, df, obj_cl_dict, monitor_metrics=None, boxes_only=False): """ Calculates metric scores for internal data frame. Called directly from evaluate_predictions during training for monitoring, or from score_test_df during inference (for single folds or aggregated test set). Loops over foreground classes and score_levels ('roi' and/or 'patient'), gets scores and stores them. Optionally creates plots of prediction histograms and ROC/PR curves. :param df: Data frame that holds evaluated predictions. :param obj_cl_dict: Dict linking object-class ids to object-class names. E.g., {1: "bikes", 2 : "cars"}. Set in configs as cf.class_dict. :param monitor_metrics: dict of dicts with all metrics of previous epochs. This function adds metrics for current epoch and returns the same object. :param boxes_only: whether to produce metrics only for the boxes, not the segmentations. :return: all_stats: list. Contains dicts with resulting scores for each combination of foreground class and score_level. 
:return: monitor_metrics """ # -------------- monitoring independent of class, score level ------------ if monitor_metrics is not None: for l_name in self.epoch_losses: monitor_metrics[l_name] = [self.epoch_losses[l_name]] # -------------- metrics calc dependent on class, score level ------------ all_stats = [] # all_stats: one entry per score_level per class for cl in list(obj_cl_dict.keys()): # bg eval is neglected cl_name = obj_cl_dict[cl] cl_df = df[df.pred_class == cl] if hasattr(self, "seg_df") and not boxes_only: dice_col = self.cf.seg_id2label[cl].name+"_dice" seg_cl_df = self.seg_df.loc[:,['pid', dice_col, 'fold']] for score_level in self.cf.report_score_level: stats_dict = {} stats_dict['name'] = 'fold_{} {} {}'.format(self.cf.fold, score_level, cl_name) # -------------- RoI-based ----------------- if score_level == 'rois': stats_dict['auc'] = np.nan stats_dict['roc'] = np.nan if monitor_metrics is not None: tn = len(cl_df[cl_df.det_type == "patient_tn"]) tp = len(cl_df[(cl_df.det_type == "det_tp")&(cl_df.pred_score>self.cf.min_det_thresh)]) fp = len(cl_df[(cl_df.det_type == "det_fp")&(cl_df.pred_score>self.cf.min_det_thresh)]) fn = len(cl_df[cl_df.det_type == "det_fn"]) sens = np.divide(tp, (fn + tp)) monitor_metrics.update({"Bin_Stats/" + cl_name + "_fp": [fp], "Bin_Stats/" + cl_name + "_tp": [tp], "Bin_Stats/" + cl_name + "_fn": [fn], "Bin_Stats/" + cl_name + "_tn": [tn], "Bin_Stats/" + cl_name + "_sensitivity": [sens]}) # list wrapping only needed bc other metrics are recorded over all epochs; spec_df = cl_df[cl_df.det_type != 'patient_tn'] if self.regress_flag: # filter false negatives out for regression-only eval since regressor didn't predict truncd_df = spec_df[(((spec_df.det_type == "det_fp") | ( spec_df.det_type == "det_tp")) & spec_df.pred_score > self.cf.min_det_thresh)] truncd_df_tp = truncd_df[truncd_df.det_type == "det_tp"] weights, weights_tp = truncd_df.pred_score.tolist(), truncd_df_tp.pred_score.tolist() y_true, y_pred = 
truncd_df.rg_targets.tolist(), truncd_df.regressions.tolist() stats_dict["rg_RMSE"] = RMSE(y_true, y_pred) stats_dict["rg_MAE"] = MAE(y_true, y_pred) stats_dict["rg_RMSE_weighted"] = RMSE(y_true, y_pred, weights) stats_dict["rg_MAE_weighted"] = MAE(y_true, y_pred, weights) y_true, y_pred = truncd_df_tp.rg_targets.tolist(), truncd_df_tp.regressions.tolist() stats_dict["rg_MAE_weighted_tp"] = MAE(y_true, y_pred, weights_tp) stats_dict["rg_MAE_w_std_weighted_tp"] = MAE_w_std(y_true, y_pred, weights_tp) y_true, y_pred = truncd_df.rg_bin_target.tolist(), truncd_df.rg_bins.tolist() stats_dict["rg_bin_accuracy"] = accuracy(y_true, y_pred) stats_dict["rg_bin_accuracy_weighted"] = accuracy(y_true, y_pred, weights) y_true, y_pred = truncd_df_tp.rg_bin_target.tolist(), truncd_df_tp.rg_bins.tolist() stats_dict["rg_bin_accuracy_weighted_tp"] = accuracy(y_true, y_pred, weights_tp) if np.any(~truncd_df.rg_uncertainties.isna()): # det_fn are expected to be NaN so they drop out in means stats_dict.update({"rg_uncertainty": truncd_df.rg_uncertainties.mean(), "rg_uncertainty_tp": truncd_df_tp.rg_uncertainties.mean(), "rg_uncertainty_tp_weighted": (truncd_df_tp.rg_uncertainties * truncd_df_tp.pred_score).sum() / truncd_df_tp.pred_score.sum() }) if (spec_df.class_label==1).any(): stats_dict['ap'] = get_roi_ap_from_df((spec_df, self.cf.min_det_thresh, self.cf.per_patient_ap)) stats_dict['prc'] = precision_recall_curve(spec_df.class_label.tolist(), spec_df.pred_score.tolist()) if self.regress_flag: stats_dict['avp'] = roi_avp((spec_df, self.cf.min_det_thresh, self.cf.per_patient_ap)) else: stats_dict['ap'] = np.nan stats_dict['prc'] = np.nan stats_dict['avp'] = np.nan # np.nan is formattable by __format__ as a float, None-type is not if hasattr(self, "seg_df") and not boxes_only: stats_dict["dice"] = seg_cl_df.loc[:,dice_col].mean() # mean per all rois in this epoch stats_dict["dice_std"] = seg_cl_df.loc[:,dice_col].std() # for the aggregated test set case, additionally get the scores of 
averaging over fold results. if self.cf.evaluate_fold_means and len(df.fold.unique()) > 1: aps = [] for fold in df.fold.unique(): fold_df = spec_df[spec_df.fold == fold] if (fold_df.class_label==1).any(): aps.append(get_roi_ap_from_df((fold_df, self.cf.min_det_thresh, self.cf.per_patient_ap))) stats_dict['ap_folds_mean'] = np.mean(aps) if len(aps)>0 else np.nan stats_dict['ap_folds_std'] = np.std(aps) if len(aps)>0 else np.nan stats_dict['auc_folds_mean'] = np.nan stats_dict['auc_folds_std'] = np.nan if self.regress_flag: avps, accuracies, MAEs = [], [], [] for fold in df.fold.unique(): fold_df = spec_df[spec_df.fold == fold] if (fold_df.class_label == 1).any(): avps.append(roi_avp((fold_df, self.cf.min_det_thresh, self.cf.per_patient_ap))) truncd_df_tp = fold_df[((fold_df.det_type == "det_tp") & fold_df.pred_score > self.cf.min_det_thresh)] weights_tp = truncd_df_tp.pred_score.tolist() y_true, y_pred = truncd_df_tp.rg_bin_target.tolist(), truncd_df_tp.rg_bins.tolist() accuracies.append(accuracy(y_true, y_pred, weights_tp)) y_true, y_pred = truncd_df_tp.rg_targets.tolist(), truncd_df_tp.regressions.tolist() MAEs.append(MAE_w_std(y_true, y_pred, weights_tp)) stats_dict['avp_folds_mean'] = np.mean(avps) if len(avps) > 0 else np.nan stats_dict['avp_folds_std'] = np.std(avps) if len(avps) > 0 else np.nan stats_dict['rg_bin_accuracy_weighted_tp_folds_mean'] = np.mean(accuracies) if len(accuracies) > 0 else np.nan stats_dict['rg_bin_accuracy_weighted_tp_folds_std'] = np.std(accuracies) if len(accuracies) > 0 else np.nan stats_dict['rg_MAE_w_std_weighted_tp_folds_mean'] = np.mean(MAEs, axis=0) if len(MAEs) > 0 else np.nan stats_dict['rg_MAE_w_std_weighted_tp_folds_std'] = np.std(MAEs, axis=0) if len(MAEs) > 0 else np.nan if hasattr(self, "seg_df") and not boxes_only and self.cf.evaluate_fold_means and len(seg_cl_df.fold.unique()) > 1: fold_means = seg_cl_df.groupby(['fold'], as_index=True).agg({dice_col:"mean"}) stats_dict["dice_folds_mean"] = float(fold_means.mean()) 
stats_dict["dice_folds_std"] = float(fold_means.std()) # -------------- patient-based ----------------- # on patient level, aggregate predictions per patient (pid): The patient predicted score is the highest # confidence prediction for this class. The patient class label is 1 if roi of this class exists in patient, else 0. if score_level == 'patient': #this is the critical part in patient scoring: only the max gt and max pred score are taken per patient! #--> does mix up values from separate detections spec_df = cl_df.groupby(['pid'], as_index=False) agg_args = {'class_label': 'max', 'pred_score': 'max', 'fold': 'first'} if self.regress_flag: # pandas throws error if aggregated value is np.array, not if is list. agg_args.update({'regressions': lambda series: list(series.iloc[np.argmax(series.apply(np.linalg.norm).values)]), 'rg_targets': lambda series: list(series.iloc[np.argmax(series.apply(np.linalg.norm).values)]), 'rg_bins': 'max', 'rg_bin_target': 'max', 'rg_uncertainties': 'max' }) if hasattr(cl_df, "cluster_n_missing"): agg_args.update({'cluster_n_missing': 'mean'}) spec_df = spec_df.agg(agg_args) if len(spec_df.class_label.unique()) > 1: stats_dict['auc'] = roc_auc_score(spec_df.class_label.tolist(), spec_df.pred_score.tolist()) stats_dict['roc'] = roc_curve(spec_df.class_label.tolist(), spec_df.pred_score.tolist()) else: stats_dict['auc'] = np.nan stats_dict['roc'] = np.nan if (spec_df.class_label == 1).any(): patient_cl_labels = spec_df.class_label.tolist() stats_dict['ap'] = average_precision_score(patient_cl_labels, spec_df.pred_score.tolist()) stats_dict['prc'] = precision_recall_curve(patient_cl_labels, spec_df.pred_score.tolist()) if self.regress_flag: avp_scores = spec_df[spec_df.rg_bins == spec_df.rg_bin_target].pred_score.tolist() avp_scores += [0.] 
* (len(patient_cl_labels) - len(avp_scores)) stats_dict['avp'] = average_precision_score(patient_cl_labels, avp_scores) else: stats_dict['ap'] = np.nan stats_dict['prc'] = np.nan stats_dict['avp'] = np.nan if self.regress_flag: y_true, y_pred = spec_df.rg_targets.tolist(), spec_df.regressions.tolist() stats_dict["rg_RMSE"] = RMSE(y_true, y_pred) stats_dict["rg_MAE"] = MAE(y_true, y_pred) stats_dict["rg_bin_accuracy"] = accuracy(spec_df.rg_bin_target.tolist(), spec_df.rg_bins.tolist()) stats_dict["rg_uncertainty"] = spec_df.rg_uncertainties.mean() if hasattr(self, "seg_df") and not boxes_only: seg_cl_df = seg_cl_df.groupby(['pid'], as_index=False).agg( {dice_col: "mean", "fold": "first"}) # mean of all rois per patient in this epoch stats_dict["dice"] = seg_cl_df.loc[:,dice_col].mean() #mean of all patients stats_dict["dice_std"] = seg_cl_df.loc[:, dice_col].std() # for the aggregated test set case, additionally get the scores for averaging over fold results. if self.cf.evaluate_fold_means and len(df.fold.unique()) > 1 and self.mode in ["test", "analysis"]: aucs = [] aps = [] for fold in df.fold.unique(): fold_df = spec_df[spec_df.fold == fold] if (fold_df.class_label==1).any(): aps.append( average_precision_score(fold_df.class_label.tolist(), fold_df.pred_score.tolist())) if len(fold_df.class_label.unique())>1: aucs.append(roc_auc_score(fold_df.class_label.tolist(), fold_df.pred_score.tolist())) stats_dict['auc_folds_mean'] = np.mean(aucs) stats_dict['auc_folds_std'] = np.std(aucs) stats_dict['ap_folds_mean'] = np.mean(aps) stats_dict['ap_folds_std'] = np.std(aps) if hasattr(self, "seg_df") and not boxes_only and self.cf.evaluate_fold_means and len(seg_cl_df.fold.unique()) > 1: fold_means = seg_cl_df.groupby(['fold'], as_index=True).agg({dice_col:"mean"}) stats_dict["dice_folds_mean"] = float(fold_means.mean()) stats_dict["dice_folds_std"] = float(fold_means.std()) all_stats.append(stats_dict) # -------------- monitoring, visualisation ----------------- # fill new 
results into monitor_metrics dict. for simplicity, only one class (of interest) is monitored on patient level. patient_interests = [self.cf.class_dict[self.cf.patient_class_of_interest],] if hasattr(self.cf, "bin_dict"): patient_interests += [self.cf.bin_dict[self.cf.patient_bin_of_interest]] if monitor_metrics is not None and (score_level != 'patient' or cl_name in patient_interests): name = 'patient_'+cl_name if score_level == 'patient' else cl_name for metric in self.cf.metrics: if metric in stats_dict.keys(): monitor_metrics[name + '_'+metric].append(stats_dict[metric]) else: print("WARNING: skipped monitor metric {}_{} since not avail".format(name, metric)) # histograms if self.cf.plot_prediction_histograms: out_filename = os.path.join(self.hist_dir, 'pred_hist_{}_{}_{}_{}'.format( self.cf.fold, self.mode, score_level, cl_name)) plg.plot_prediction_hist(self.cf, spec_df, out_filename) # analysis of the hyper-parameter cf.min_det_thresh, for optimization on validation set. if self.cf.scan_det_thresh and "val" in self.mode: conf_threshs = list(np.arange(0.8, 1, 0.02)) pool = Pool(processes=self.cf.n_workers) mp_inputs = [[spec_df, ii, self.cf.per_patient_ap] for ii in conf_threshs] aps = pool.map(get_roi_ap_from_df, mp_inputs, chunksize=1) pool.close() pool.join() self.logger.info('results from scanning over det_threshs: {}'.format([[i, j] for i, j in zip(conf_threshs, aps)])) if self.cf.plot_stat_curves: out_filename = os.path.join(self.curves_dir, '{}_{}_stat_curves'.format(self.cf.fold, self.mode)) plg.plot_stat_curves(self.cf, all_stats, out_filename) if self.cf.plot_prediction_histograms and hasattr(df, "cluster_n_missing") and df.cluster_n_missing.notna().any(): out_filename = os.path.join(self.hist_dir, 'n_missing_hist_{}_{}.png'.format(self.cf.fold, self.mode)) plg.plot_wbc_n_missing(self.cf, df, outfile=out_filename) return all_stats, monitor_metrics def score_test_df(self, max_fold=None, internal_df=True): """ Writes out resulting scores to text files: 
First checks for class-internal-df (typically current) fold, gets resulting scores, writes them to a text file and pickles data frame. Also checks if data-frame pickles of all folds of cross-validation exist in exp_dir. If true, loads all dataframes, aggregates test sets over folds, and calculates and writes out overall metrics. """ # this should maybe be extended to auc, ap stds. metrics_to_score = self.cf.metrics # + [ m+ext for m in self.cf.metrics if "dice" in m for ext in ["_std"]] if internal_df: self.test_df.to_pickle(os.path.join(self.cf.test_dir, '{}_test_df.pkl'.format(self.cf.fold))) if hasattr(self, "seg_df"): self.seg_df.to_pickle(os.path.join(self.cf.test_dir, '{}_test_seg_df.pkl'.format(self.cf.fold))) stats, _ = self.return_metrics(self.test_df, self.cf.class_dict) with open(os.path.join(self.cf.test_dir, 'results.txt'), 'a') as handle: handle.write('\n****************************\n') handle.write('\nresults for fold {}, {} \n'.format(self.cf.fold, time.strftime("%d/%m/%y %H:%M:%S"))) handle.write('\n****************************\n') handle.write('\nfold df shape {}\n \n'.format(self.test_df.shape)) for s in stats: for metric in metrics_to_score: if metric in s.keys(): #needed as long as no dice on patient level poss if "accuracy" in metric: handle.write('{} {:0.4f} '.format(metric, s[metric])) else: handle.write('{} {:0.3f} '.format(metric, s[metric])) else: print("WARNING: skipped metric {} since not avail".format(metric)) handle.write('{} \n'.format(s['name'])) fold_df_paths = sorted([ii for ii in os.listdir(self.cf.test_dir) if 'test_df.pkl' in ii]) fold_seg_df_paths = sorted([ii for ii in os.listdir(self.cf.test_dir) if 'test_seg_df.pkl' in ii]) for paths in [fold_df_paths, fold_seg_df_paths]: assert len(paths)<= self.cf.n_cv_splits, "found {} > nr of cv splits results dfs in {}".format(len(paths), self.cf.test_dir) if max_fold is None: max_fold = self.cf.n_cv_splits-1 if self.cf.fold == max_fold: print("max fold/overall stats triggered") if 
self.cf.evaluate_fold_means: metrics_to_score += [m + ext for m in self.cf.metrics for ext in ("_folds_mean", "_folds_std")] with open(os.path.join(self.cf.test_dir, 'results.txt'), 'a') as handle: self.cf.fold = 'overall' dfs_list = [pd.read_pickle(os.path.join(self.cf.test_dir, ii)) for ii in fold_df_paths] seg_dfs_list = [pd.read_pickle(os.path.join(self.cf.test_dir, ii)) for ii in fold_seg_df_paths] self.test_df = pd.concat(dfs_list, sort=True) if len(seg_dfs_list)>0: self.seg_df = pd.concat(seg_dfs_list, sort=True) stats, _ = self.return_metrics(self.test_df, self.cf.class_dict) handle.write('\n****************************\n') handle.write('\nOVERALL RESULTS \n') handle.write('\n****************************\n') handle.write('\ndf shape \n \n'.format(self.test_df.shape)) for s in stats: for metric in metrics_to_score: if metric in s.keys(): handle.write('{} {:0.3f} '.format(metric, s[metric])) handle.write('{} \n'.format(s['name'])) results_table_path = os.path.join(self.cf.test_dir,"../../", 'results_table.csv') with open(results_table_path, 'a') as handle: #---column headers--- handle.write('\n{},'.format("Experiment Name")) handle.write('{},'.format("Time Stamp")) handle.write('{},'.format("Samples Seen")) handle.write('{},'.format("Spatial Dim")) handle.write('{},'.format("Patch Size")) handle.write('{},'.format("CV Folds")) handle.write('{},'.format("{}-clustering IoU".format(self.cf.clustering))) handle.write('{},'.format("Merge-2D-to-3D IoU")) if hasattr(self.cf, "test_against_exact_gt"): handle.write('{},'.format('Exact GT')) for s in stats: assert "overall" in s['name'].split(" ")[0] if self.cf.class_dict[self.cf.patient_class_of_interest] in s['name']: for metric in metrics_to_score: if metric in s.keys() and not np.isnan(s[metric]): if metric=='ap': handle.write('{}_{} : {}_{},'.format(*s['name'].split(" ")[1:], metric, int(np.mean(self.cf.ap_match_ious)*100))) elif not "folds_std" in metric: handle.write('{}_{} : {},'.format(*s['name'].split(" 
")[1:], metric)) else: print("WARNING: skipped metric {} since not avail".format(metric)) handle.write('\n') #--- columns content--- handle.write('{},'.format(self.cf.exp_dir.split(os.sep)[-1])) handle.write('{},'.format(time.strftime("%d%b%y %H:%M:%S"))) handle.write('{},'.format(self.cf.num_epochs*self.cf.num_train_batches*self.cf.batch_size)) handle.write('{}D,'.format(self.cf.dim)) handle.write('{},'.format("x".join([str(self.cf.patch_size[i]) for i in range(self.cf.dim)]))) handle.write('{},'.format(str(self.test_df.fold.unique().tolist()).replace(",", ""))) handle.write('{},'.format(self.cf.clustering_iou if self.cf.clustering else str("N/A"))) handle.write('{},'.format(self.cf.merge_3D_iou if self.cf.merge_2D_to_3D_preds else str("N/A"))) if hasattr(self.cf, "test_against_exact_gt"): handle.write('{},'.format(self.cf.test_against_exact_gt)) for s in stats: if self.cf.class_dict[self.cf.patient_class_of_interest] in s['name']: for metric in metrics_to_score: if metric in s.keys() and not np.isnan(s[metric]): # needed as long as no dice on patient level possible if "folds_mean" in metric: handle.write('{:0.3f}\u00B1{:0.3f}, '.format(s[metric], s["_".join((*metric.split("_")[:-1], "std"))])) elif not "folds_std" in metric: handle.write('{:0.3f}, '.format(s[metric])) handle.write('\n') with open(os.path.join(self.cf.test_dir, 'results_extr_scores.txt'), 'w') as handle: handle.write('\n****************************\n') handle.write('\nextremal scores for fold {} \n'.format(self.cf.fold)) handle.write('\n****************************\n') # want: pid & fold (&other) of highest scoring tp & fp in test_df for cl in self.cf.class_dict.keys(): print("\nClass {}".format(self.cf.class_dict[cl]), file=handle) cl_df = self.test_df[self.test_df.pred_class == cl] #.dropna(axis=1) for det_type in ['det_tp', 'det_fp']: filtered_df = cl_df[cl_df.det_type==det_type] print("\nHighest scoring {} of class {}".format(det_type, self.cf.class_dict[cl]), file=handle) if 
len(filtered_df)>0: print(filtered_df.loc[filtered_df.pred_score.idxmax()], file=handle) else: print("No detections of type {} for class {} in this df".format(det_type, self.cf.class_dict[cl]), file=handle) handle.write('\n****************************\n') diff --git a/unittests.py b/unittests.py index 3b6c7a1..87c5f2c 100644 --- a/unittests.py +++ b/unittests.py @@ -1,625 +1,625 @@ #!/usr/bin/env python # Copyright 2019 Division of Medical Image Computing, German Cancer Research Center (DKFZ). # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== import unittest import os import pickle import time from multiprocessing import Pool import subprocess from pathlib import Path import numpy as np import pandas as pd import torch import torchvision as tv import tqdm import plotting as plg import utils.exp_utils as utils import utils.model_utils as mutils """ Note on unittests: run this file either in the way intended for unittests by starting the script with python -m unittest unittests.py or start it as a normal python file as python unittests.py. You can selective run single tests by calling python -m unittest unittests.TestClassOfYourChoice, where TestClassOfYourChoice is the name of the test defined below, e.g., CompareFoldSplits. """ def inspect_info_df(pp_dir): """ use your debugger to look into the info df of a pp dir. 
    :param pp_dir: preprocessed-data directory
    """
    info_df = pd.read_pickle(os.path.join(pp_dir, "info_df.pickle"))
    return

def generate_boxes(count, dim=2, h=100, w=100, d=20, normalize=False, on_grid=False, seed=0):
    """ generate boxes of format [y1, x1, y2, x2, (z1, z2)].
    :param count: nr of boxes
    :param dim: dimension of boxes (2 or 3)
    :param h, w, d: upper bounds of the coordinate space (height, width, depth).
    :param normalize: if True, divide coordinates by (h, w(, d)) so boxes lie in [0, 1].
    :param on_grid: if True, draw integer coordinates; else continuous uniform coordinates.
    :param seed: seed passed to np.random (NOTE: reseeds numpy's *global* RNG).
    :return: boxes in format (n_boxes, 4 or 6), scores
    """
    np.random.seed(seed)
    if on_grid:
        # integer coords: lower corner drawn from the lower half of each axis, upper corner from the upper half,
        # which guarantees non-degenerate boxes (y1 < y2, x1 < x2).
        lower_y = np.random.randint(0, h // 2, (count,))
        lower_x = np.random.randint(0, w // 2, (count,))
        upper_y = np.random.randint(h // 2, h, (count,))
        upper_x = np.random.randint(w // 2, w, (count,))
        if dim == 3:
            lower_z = np.random.randint(0, d // 2, (count,))
            upper_z = np.random.randint(d // 2, d, (count,))
    else:
        # continuous coords with the same half-space split
        lower_y = np.random.rand(count) * h / 2.
        lower_x = np.random.rand(count) * w / 2.
        upper_y = (np.random.rand(count) + 1.) * h / 2.
        upper_x = (np.random.rand(count) + 1.) * w / 2.
        if dim == 3:
            lower_z = np.random.rand(count) * d / 2.
            upper_z = (np.random.rand(count) + 1.) * d / 2.

    if dim == 3:
        boxes = np.array(list(zip(lower_y, lower_x, upper_y, upper_x, lower_z, upper_z)))
        # add an extreme box that tests the boundaries
        boxes = np.concatenate((boxes, np.array([[0., 0., h, w, 0, d]])))
    else:
        boxes = np.array(list(zip(lower_y, lower_x, upper_y, upper_x)))
        boxes = np.concatenate((boxes, np.array([[0., 0., h, w]])))

    # one score per box, incl. the appended extreme box
    scores = np.random.rand(count + 1)
    if normalize:
        divisor = np.array([h, w, h, w, d, d]) if dim == 3 else np.array([h, w, h, w])
        boxes = boxes / divisor

    return boxes, scores

#------- perform integrity checks on data set(s) -----------

class VerifyLIDCSAIntegrity(unittest.TestCase):
    """ Perform integrity checks on preprocessed single-annotator GTs of LIDC data set.
""" @staticmethod def check_patient_sa_gt(pid, pp_dir, check_meta_files, check_info_df): faulty_cases = pd.DataFrame(columns=['pid', 'rater', 'cl_targets', 'roi_ids']) all_segs = np.load(os.path.join(pp_dir, pid + "_rois.npz"), mmap_mode='r') all_segs = all_segs[list(all_segs.keys())[0]] all_roi_ids = np.unique(all_segs[all_segs > 0]) assert len(all_roi_ids) == np.max(all_segs), "roi ids not consecutive" if check_meta_files: meta_file = os.path.join(pp_dir, pid + "_meta_info.pickle") with open(meta_file, "rb") as handle: info = pickle.load(handle) assert info["pid"] == pid, "wrong pid in meta_file" all_cl_targets = info["class_target"] if check_info_df: info_df = pd.read_pickle(os.path.join(pp_dir, "info_df.pickle")) pid_info = info_df[info_df.pid == pid] assert len(pid_info) == 1, "found {} entries for pid {} in info df, expected exactly 1".format(len(pid_info), pid) if check_meta_files: assert pid_info[ "class_target"] == all_cl_targets, "meta_info and info_df class targets mismatch:\n{}\n{}".format( pid_info["class_target"], all_cl_targets) all_cl_targets = pid_info["class_target"].iloc[0] assert len(all_roi_ids) == len(all_cl_targets) for rater in range(4): seg = all_segs[rater] roi_ids = np.unique(seg[seg > 0]) cl_targs = np.array([roi[rater] for roi in all_cl_targets]) assert np.count_nonzero(cl_targs) == len(roi_ids), "rater {} has targs {} but roi ids {}".format(rater, cl_targs, roi_ids) assert len(cl_targs) >= len(roi_ids), "not all marked rois have a label" for zeroix_roi_id, rating in enumerate(cl_targs): if not ((rating > 0) == (np.any(seg == zeroix_roi_id + 1))): print("\n\nFAULTY CASE:", end=" ", ) print("pid {}, rater {}, cl_targs {}, ids {}\n".format(pid, rater, cl_targs, roi_ids)) faulty_cases = faulty_cases.append( {'pid': pid, 'rater': rater, 'cl_targets': cl_targs, 'roi_ids': roi_ids}, ignore_index=True) print("finished checking pid {}, {} faulty cases".format(pid, len(faulty_cases))) return faulty_cases def check_sa_gts(cf, pp_dir, 
pid_subset=None, check_meta_files=False, check_info_df=True, processes=os.cpu_count()): report_name = "verify_seg_label_pairings.csv" pids = {file_name.split("_")[0] for file_name in os.listdir(pp_dir) if file_name not in [report_name, "info_df.pickle"]} if pid_subset is not None: pids = [pid for pid in pids if pid in pid_subset] faulty_cases = pd.DataFrame(columns=['pid', 'rater', 'cl_targets', 'roi_ids']) p = Pool(processes=processes) mp_args = zip(pids, [pp_dir]*len(pids), [check_meta_files]*len(pids), [check_info_df]*len(pids)) patient_cases = p.starmap(self.check_patient_sa_gt, mp_args) p.close(); p.join() faulty_cases = faulty_cases.append(patient_cases, sort=False) print("\n\nfaulty case count {}".format(len(faulty_cases))) print(faulty_cases) findings_file = os.path.join(pp_dir, "verify_seg_label_pairings.csv") faulty_cases.to_csv(findings_file) assert len(faulty_cases)==0, "there was a faulty case in data set {}.\ncheck {}".format(pp_dir, findings_file) def test(self): pp_root = "/media/gregor/HDD2TB/Documents/data/" pp_dir = "lidc/pp_20190805" gt_dir = os.path.join(pp_root, pp_dir, "patient_gts_sa") self.check_sa_gts(gt_dir, check_meta_files=True, check_info_df=False, pid_subset=None) # ["0811a", "0812a"]) #------ compare segmentation gts of preprocessed data sets ------ class CompareSegGTs(unittest.TestCase): """ load and compare pre-processed gts by dice scores of segmentations. 
""" @staticmethod def group_seg_paths(ref_path, comp_paths): # not working recursively ref_files = [fn for fn in os.listdir(ref_path) if os.path.isfile(os.path.join(ref_path, fn)) and 'seg' in fn and fn.endswith('.npy')] comp_files = [[os.path.join(c_path, fn) for c_path in comp_paths] for fn in ref_files] ref_files = [os.path.join(ref_path, fn) for fn in ref_files] return zip(ref_files, comp_files) @staticmethod def load_calc_dice(paths): dices = [] ref_seg = np.load(paths[0])[np.newaxis, np.newaxis] n_classes = len(np.unique(ref_seg)) ref_seg = mutils.get_one_hot_encoding(ref_seg, n_classes) for c_file in paths[1]: c_seg = np.load(c_file)[np.newaxis, np.newaxis] assert n_classes == len(np.unique(c_seg)), "unequal nr of objects/classes betw segs {} {}".format(paths[0], c_file) c_seg = mutils.get_one_hot_encoding(c_seg, n_classes) dice = mutils.dice_per_batch_inst_and_class(c_seg, ref_seg, n_classes, convert_to_ohe=False) dices.append(dice) print("processed ref_path {}".format(paths[0])) return np.mean(dices), np.std(dices) def iterate_files(self, grouped_paths, processes=os.cpu_count()): p = Pool(processes) means_stds = np.array(p.map(self.load_calc_dice, grouped_paths)) p.close(); p.join() min_dice = np.min(means_stds[:, 0]) print("min mean dice {:.2f}, max std {:.4f}".format(min_dice, np.max(means_stds[:, 1]))) assert min_dice > 1-1e5, "compared seg gts have insufficient minimum mean dice overlap of {}".format(min_dice) def test(self): ref_path = '/media/gregor/HDD2TB/Documents/data/prostate/data_t2_250519_ps384_gs6071' comp_paths = ['/media/gregor/HDD2TB/Documents/data/prostate/data_t2_190419_ps384_gs6071', ] paths = self.group_seg_paths(ref_path, comp_paths) self.iterate_files(paths) #------- check if cross-validation fold splits of different experiments are identical ---------- class CompareFoldSplits(unittest.TestCase): """ Find evtl. differences in cross-val file splits across different experiments. 
""" @staticmethod def group_id_paths(ref_exp_dir, comp_exp_dirs): f_name = 'fold_ids.pickle' ref_paths = os.path.join(ref_exp_dir, f_name) assert os.path.isfile(ref_paths), "ref file {} does not exist.".format(ref_paths) ref_paths = [ref_paths for comp_ed in comp_exp_dirs] comp_paths = [os.path.join(comp_ed, f_name) for comp_ed in comp_exp_dirs] return zip(ref_paths, comp_paths) @staticmethod def comp_fold_ids(mp_input): fold_ids1, fold_ids2 = mp_input with open(fold_ids1, 'rb') as f: fold_ids1 = pickle.load(f) try: with open(fold_ids2, 'rb') as f: fold_ids2 = pickle.load(f) except FileNotFoundError: print("comp file {} does not exist.".format(fold_ids2)) return n_splits = len(fold_ids1) assert n_splits == len(fold_ids2), "mismatch n splits: ref has {}, comp {}".format(n_splits, len(fold_ids2)) split_diffs = [np.setdiff1d(fold_ids1[s], fold_ids2[s]) for s in range(n_splits)] all_equal = np.any(split_diffs) return (split_diffs, all_equal) def iterate_exp_dirs(self, ref_exp, comp_exps, processes=os.cpu_count()): grouped_paths = list(self.group_id_paths(ref_exp, comp_exps)) print("performing {} comparisons of cross-val file splits".format(len(grouped_paths))) p = Pool(processes) split_diffs = p.map(self.comp_fold_ids, grouped_paths) p.close(); p.join() df = pd.DataFrame(index=range(0,len(grouped_paths)), columns=["ref", "comp", "all_equal"])#, "diffs"]) for ix, (ref, comp) in enumerate(grouped_paths): df.iloc[ix] = [ref, comp, split_diffs[ix][1]]#, split_diffs[ix][0]] print("Any splits not equal?", df.all_equal.any()) assert not df.all_equal.any(), "a split set is different from reference split set, {}".format(df[~df.all_equal]) def test(self): exp_parent_dir = '/home/gregor/networkdrives/E132-Cluster-Projects/prostate/experiments/' ref_exp = '/home/gregor/networkdrives/E132-Cluster-Projects/prostate/experiments/gs6071_detfpn2d_cl_bs10' comp_exps = [os.path.join(exp_parent_dir, p) for p in os.listdir(exp_parent_dir)] comp_exps = [p for p in comp_exps if 
                     os.path.isdir(p) and p != ref_exp]
        self.iterate_exp_dirs(ref_exp, comp_exps)

#------- check if cross-validation fold splits of a single experiment are actually incongruent (as required) ----------

class VerifyFoldSplits(unittest.TestCase):
    """ Check, for a single fold_ids file, i.e., for a single experiment, if the assigned folds (assignment of data
        identifiers) is actually incongruent. No overlaps between folds are allowed for a correct cross validation.
    """
    @staticmethod
    def verify_fold_ids(splits):
        # pairwise check (j > i avoids self- and double-comparisons); any pid shared between folds is an error
        for i, split1 in enumerate(splits):
            for j, split2 in enumerate(splits):
                if j > i:
                    inter = np.intersect1d(split1, split2)
                    if len(inter) > 0:
                        raise Exception("Split {} and {} intersect by pids {}".format(i, j, inter))

    def test(self):
        exp_dir = "/home/gregor/Documents/medicaldetectiontoolkit/datasets/lidc/experiments/dev"
        check_file = os.path.join(exp_dir, 'fold_ids.pickle')
        with open(check_file, 'rb') as handle:
            splits = pickle.load(handle)
        self.verify_fold_ids(splits)

# -------- check own nms CUDA implement against own numpy implement ------

class CheckNMSImplementation(unittest.TestCase):

    @staticmethod
    def assert_res_equality(keep_ics1, keep_ics2, boxes, scores, tolerance=0, names=("res1", "res2")):
        """ Assert that two NMS results keep the same indices (up to 'tolerance' many differences).
        :param keep_ics1: keep indices (results), torch.Tensor of shape (n_ics,)
        :param keep_ics2: keep indices of the second implementation, same format.
        :return: None; raises AssertionError on mismatch.
        """
        keep_ics1, keep_ics2 = keep_ics1.cpu().numpy(), keep_ics2.cpu().numpy()
        discrepancies = np.setdiff1d(keep_ics1, keep_ics2)
        try:
            checks = np.array([
                len(discrepancies) <= tolerance
            ])
        except:
            # NOTE(review): bare except treats any failure as "check failed" — kept as-is (best-effort guard)
            checks = np.zeros((1,)).astype("bool")
        msgs = np.array([
            """{}: {} \n{}: {} \nboxes: {}\n {}\n""".format(names[0], keep_ics1, names[1], keep_ics2, boxes, scores)
        ])

        assert np.all(checks), "NMS: results mismatch: " + "\n".join(msgs[~checks])

    def single_case(self, count=20, dim=3, threshold=0.2, seed=0):
        boxes, scores = generate_boxes(count, dim, seed=seed, h=320, w=280,
                                       d=30)
        # numpy reference result
        keep_numpy = torch.tensor(mutils.nms_numpy(boxes, scores, threshold))

        # for some reason torchvision nms requires box coords as floats.
        boxes = torch.from_numpy(boxes).type(torch.float32)
        scores = torch.from_numpy(scores).type(torch.float32)
        if dim == 2:
            """need to wait until next pytorch release where they fixed nms on cpu (currently they have >= where it needs to be >. """
            keep_ops = tv.ops.nms(boxes, scores, threshold)
            # self.assert_res_equality(keep_numpy, keep_ops, boxes, scores, tolerance=0, names=["np", "ops"])
            pass

        # compare the custom CUDA extension against the numpy reference
        boxes = boxes.cuda()
        scores = scores.cuda()
        keep = self.nms_ext.nms(boxes, scores, threshold)
        self.assert_res_equality(keep_numpy, keep, boxes, scores, tolerance=0, names=["np", "cuda"])

    def test(self, n_cases=200, box_count=30, threshold=0.5):
        # dynamically import module so that it doesn't affect other tests if import fails
        self.nms_ext = utils.import_module("nms_ext", 'custom_extensions/nms/nms.py')
        # change seed to something fix if you want exactly reproducible test
        seed0 = np.random.randint(50)
        print("NMS test progress (done/total box configurations) 2D:", end="\n")
        for i in tqdm.tqdm(range(n_cases)):
            self.single_case(count=box_count, dim=2, threshold=threshold, seed=seed0+i)
        print("NMS test progress (done/total box configurations) 3D:", end="\n")
        for i in tqdm.tqdm(range(n_cases)):
            self.single_case(count=box_count, dim=3, threshold=threshold, seed=seed0+i)
        return

class CheckRoIAlignImplementation(unittest.TestCase):

    def prepare(self, dim=2):
        # build a random feature map (grad-enabled) and grid-aligned rois on the GPU;
        # returns (fmap, rois, pool_size) for the align checks below
        b, c, h, w = 1, 3, 50, 50
        # feature map, (b, c, h, w(, z))
        if dim == 2:
            fmap = torch.rand(b, c, h, w).cuda()
            # rois = torch.tensor([[
            #     [0.1, 0.1, 0.3, 0.3],
            #     [0.2, 0.2, 0.4, 0.7],
            #     [0.5, 0.7, 0.7, 0.9],
            # ]]).cuda()
            pool_size = (7, 7)
            rois = generate_boxes(5, dim=dim, h=h, w=w, on_grid=True, seed=np.random.randint(50))[0]
        elif dim == 3:
            d = 20
            fmap = torch.rand(b, c, h, w, d).cuda()
            # rois = torch.tensor([[
            #     [0.1, 0.1, 0.3, 0.3, 0.1, 0.1],
            #     [0.2, 0.2, 0.4, 0.7, 0.2, 0.4],
            #     [0.5, 0.0, 0.7, 1.0,
            #      0.4, 0.5],
            #     [0.0, 0.0, 0.9, 1.0, 0.0, 1.0],
            # ]]).cuda()
            pool_size = (7, 7, 3)
            rois = generate_boxes(5, dim=dim, h=h, w=w, d=d, on_grid=True, seed=np.random.randint(50), normalize=False)[0]
        else:
            raise ValueError("dim needs to be 2 or 3")
        rois = [torch.from_numpy(rois).type(dtype=torch.float32).cuda(), ]
        fmap.requires_grad_(True)
        return fmap, rois, pool_size

    def check_2d(self):
        """ check vs torchvision ops not possible as on purpose different approach. :return: """
        raise NotImplementedError
        # fmap, rois, pool_size = self.prepare(dim=2)
        # ra_object = self.ra_ext.RoIAlign(output_size=pool_size, spatial_scale=1., sampling_ratio=-1)
        # align_ext = ra_object(fmap, rois)
        # loss_ext = align_ext.sum()
        # loss_ext.backward()
        #
        # rois_swapped = [rois[0][:, [1,3,0,2]]]
        # align_ops = tv.ops.roi_align(fmap, rois_swapped, pool_size)
        # loss_ops = align_ops.sum()
        # loss_ops.backward()
        #
        # assert (loss_ops == loss_ext), "sum of roialign ops and extension 2D diverges"
        # assert (align_ops == align_ext).all(), "ROIAlign failed 2D test"

    def check_3d(self):
        # forward + backward through the CUDA extension, then compare the forward result vs the numpy reference
        fmap, rois, pool_size = self.prepare(dim=3)
        ra_object = self.ra_ext.RoIAlign(output_size=pool_size, spatial_scale=1., sampling_ratio=-1)
        align_ext = ra_object(fmap, rois)
        loss_ext = align_ext.sum()
        loss_ext.backward()

        align_np = mutils.roi_align_3d_numpy(fmap.cpu().detach().numpy(), [roi.cpu().numpy() for roi in rois], pool_size)
        align_np = np.squeeze(align_np)  # remove singleton batch dim

        align_ext = align_ext.cpu().detach().numpy()
        assert np.allclose(align_np, align_ext, rtol=1e-5, atol=1e-8), "RoIAlign differences in numpy and CUDA implement"

    def specific_example_check(self):
        # dummy input
        self.ra_ext = utils.import_module("ra_ext", 'custom_extensions/roi_align/roi_align.py')
        exp = 6
        pool_size = (2,2)
        fmap = torch.arange(exp**2).view(exp,exp).unsqueeze(0).unsqueeze(0).cuda().type(dtype=torch.float32)

        boxes = torch.tensor([[1., 1., 5., 5.]]).cuda()/exp
        ind = torch.tensor([0.]*len(boxes)).cuda().type(torch.float32)
        y_exp, x_exp = \
fmap.shape[2:] # exp = expansion boxes.mul_(torch.tensor([y_exp, x_exp, y_exp, x_exp], dtype=torch.float32).cuda()) boxes = torch.cat((ind.unsqueeze(1), boxes), dim=1) aligned_tv = tv.ops.roi_align(fmap, boxes, output_size=pool_size, sampling_ratio=-1) aligned = self.ra_ext.roi_align_2d(fmap, boxes, output_size=pool_size, sampling_ratio=-1) boxes_3d = torch.cat((boxes, torch.tensor([[-1.,1.]]*len(boxes)).cuda()), dim=1) fmap_3d = fmap.unsqueeze(dim=-1) pool_size = (*pool_size,1) ra_object = self.ra_ext.RoIAlign(output_size=pool_size, spatial_scale=1.,) aligned_3d = ra_object(fmap_3d, boxes_3d) # expected_res = torch.tensor([[[[10.5000, 12.5000], # this would be with an alternative grid-point setting # [22.5000, 24.5000]]]]).cuda() expected_res = torch.tensor([[[[14., 16.], [26., 28.]]]]).cuda() expected_res_3d = torch.tensor([[[[[14.],[16.]], [[26.],[28.]]]]]).cuda() assert torch.all(aligned==expected_res), "2D RoIAlign check vs. specific example failed. res: {}\n expected: {}\n".format(aligned, expected_res) assert torch.all(aligned_3d==expected_res_3d), "3D RoIAlign check vs. specific example failed. res: {}\n expected: {}\n".format(aligned_3d, expected_res_3d) def manual_check(self): """ print examples from a toy batch to file. :return: """ self.ra_ext = utils.import_module("ra_ext", 'custom_extensions/roi_align/roi_align.py') # actual mrcnn mask input from datasets.toy import configs cf = configs.Configs() cf.exp_dir = "datasets/toy/experiments/dev/" cf.plot_dir = cf.exp_dir + "plots" os.makedirs(cf.exp_dir, exist_ok=True) cf.fold = 0 cf.n_workers = 1 logger = utils.get_logger(cf.exp_dir) data_loader = utils.import_module('data_loader', os.path.join("datasets", "toy", 'data_loader.py')) batch_gen = data_loader.get_train_generators(cf, logger=logger) batch = next(batch_gen['train']) roi_mask = np.zeros((1, 320, 200)) bb_target = (np.array([50, 40, 90, 120])).astype("int") roi_mask[:, bb_target[0]+1:bb_target[2]+1, bb_target[1]+1:bb_target[3]+1] = 1. 
#batch = {"roi_masks": np.array([np.array([roi_mask, roi_mask]), np.array([roi_mask])]), "bb_target": [[bb_target, bb_target + 25], [bb_target-20]]} #batch_boxes_cor = [torch.tensor(batch_el_boxes).cuda().float() for batch_el_boxes in batch_cor["bb_target"]] batch_boxes = [torch.tensor(batch_el_boxes).cuda().float() for batch_el_boxes in batch["bb_target"]] #import IPython; IPython.embed() for b in range(len(batch_boxes)): roi_masks = batch["roi_masks"][b] #roi_masks_cor = batch_cor["roi_masks"][b] if roi_masks.sum()>0: boxes = batch_boxes[b] roi_masks = torch.tensor(roi_masks).cuda().type(dtype=torch.float32) box_ids = torch.arange(roi_masks.shape[0]).cuda().unsqueeze(1).type(dtype=torch.float32) masks = tv.ops.roi_align(roi_masks, [boxes], cf.mask_shape) masks = masks.squeeze(1) masks = torch.round(masks) masks_own = self.ra_ext.roi_align_2d(roi_masks, torch.cat((box_ids, boxes), dim=1), cf.mask_shape) boxes = boxes.type(torch.int) #print("check roi mask", roi_masks[0, 0, boxes[0][0]:boxes[0][2], boxes[0][1]:boxes[0][3]].sum(), (boxes[0][2]-boxes[0][0]) * (boxes[0][3]-boxes[0][1])) #print("batch masks", batch["roi_masks"]) masks_own = masks_own.squeeze(1) masks_own = torch.round(masks_own) #import IPython; IPython.embed() for mix, mask in enumerate(masks): fig = plg.plt.figure() ax = fig.add_subplot() ax.imshow(roi_masks[mix][0].cpu().numpy(), cmap="gray", vmin=0.) 
ax.axis("off") y1, x1, y2, x2 = boxes[mix] bbox = plg.mpatches.Rectangle((x1, y1), x2-x1, y2-y1, linewidth=0.9, edgecolor="c", facecolor='none') ax.add_patch(bbox) x1, y1, x2, y2 = boxes[mix] bbox = plg.mpatches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=0.9, edgecolor="r", facecolor='none') ax.add_patch(bbox) debug_dir = Path("/home/gregor/Documents/regrcnn/datasets/toy/experiments/debugroial") os.makedirs(debug_dir, exist_ok=True) plg.plt.savefig(debug_dir/"mask_b{}_{}.png".format(b, mix)) plg.plt.imsave(debug_dir/"mask_b{}_{}_pooled_tv.png".format(b, mix), mask.cpu().numpy(), cmap="gray", vmin=0.) plg.plt.imsave(debug_dir/"mask_b{}_{}_pooled_own.png".format(b, mix), masks_own[mix].cpu().numpy(), cmap="gray", vmin=0.) return def test(self): # dynamically import module so that it doesn't affect other tests if import fails self.ra_ext = utils.import_module("ra_ext", 'custom_extensions/roi_align/roi_align.py') self.specific_example_check() # 2d test #self.check_2d() # 3d test self.check_3d() return class CheckRuntimeErrors(unittest.TestCase): """ Check if minimal examples of the exec.py module finish without runtime errors. This check requires a working path to data in the toy-dataset configs. 
""" def test(self): cf = utils.import_module("toy_cf", 'datasets/toy/configs.py').Configs() exp_dir = "./unittesting/" #checks = {"retina_net": False, "mrcnn": False} #print("Testing for runtime errors with models {}".format(list(checks.keys()))) #for model in tqdm.tqdm(list(checks.keys())): # cf.model = model # cf.model_path = 'models/{}.py'.format(cf.model if not 'retina' in cf.model else 'retina_net') # cf.model_path = os.path.join(cf.source_dir, cf.model_path) # {'mrcnn': cf.add_mrcnn_configs, # 'retina_net': cf.add_mrcnn_configs, 'retina_unet': cf.add_mrcnn_configs, # 'detection_unet': cf.add_det_unet_configs, 'detection_fpn': cf.add_det_fpn_configs # }[model]() # todo change structure of configs-handling with exec.py so that its dynamically parseable instead of needing to # todo be changed in the file all the time. checks = {cf.model:False} completed_process = subprocess.run("python exec.py --dev --dataset_name toy -m train_test --exp_dir {}".format(exp_dir), shell=True, capture_output=True, text=True) if completed_process.returncode!=0: print("Runtime test of model {} failed due to\n{}".format(cf.model, completed_process.stderr)) else: checks[cf.model] = True subprocess.call("rm -rf {}".format(exp_dir), shell=True) assert all(checks.values()), "A runtime test crashed." 
class MulithreadedDataiterator(unittest.TestCase):
    # NOTE(review): class name has a typo (missing 't' in "Multithreaded"); kept
    # as-is since renaming would change how the test is addressed/discovered.

    def test(self):
        """Check that the multithreaded validation data pipeline yields each pid the
        expected number of times, and — with raise_stop_iteration=True — exactly once.

        Phase 1: val_sampling loader, batch_size 3, 8 workers; every pid must occur
        fewer than 3 times per epoch and no unknown pids may appear.
        Phase 2: a fresh non-sampling pipeline sized so one pass covers the dataset;
        every pid must occur exactly once and the full pid set must be covered.
        """
        print("Testing multithreaded iterator.")

        dataset = "toy"
        exp_dir = Path("datasets/{}/experiments/dev".format(dataset))
        cf_file = utils.import_module("cf_file", exp_dir/"configs.py")
        cf = cf_file.Configs()
        dloader = utils.import_module('data_loader', 'datasets/{}/data_loader.py'.format(dataset))
        cf.exp_dir = Path(exp_dir)
        cf.n_workers = 5
        cf.batch_size = 3
        cf.fold = 0
        cf.plot_dir = cf.exp_dir / "plots"
        logger = utils.get_logger(cf.exp_dir, cf.server_env, cf.sysmetrics_interval)
        cf.num_val_batches = "all"
        cf.val_mode = "val_sampling"
        cf.n_workers = 8  # overrides the 5 set above; effective worker count for phase 1
        batch_gens = dloader.get_train_generators(cf, logger, data_statistics=False)
        val_loader = batch_gens["val_sampling"]

        for epoch in range(4):
            produced_ids = []
            for i in range(batch_gens['n_val']):
                batch = next(val_loader)
                produced_ids.append(batch["pid"])
            uni, cts = np.unique(np.concatenate(produced_ids), return_counts=True)
            # NOTE(review): message says "batch size one ... exactly once" but
            # cf.batch_size is 3 and the check is cts < 3 — message is misleading.
            assert np.all(cts < 3), "with batch size one: every item should occur exactly once.\n uni {}, cts {}".format(
                uni[cts>2], cts[cts>2])
            #assert len(np.setdiff1d(val_loader.generator.dataset_pids, uni))==0, "not all val pids were shown."
            assert len(np.setdiff1d(uni, val_loader.generator.dataset_pids))==0, "pids shown that are not val set. impossible?"

        # phase 2: deterministic, non-replacing pipeline sized to cover the dataset in one pass
        cf.n_workers = os.cpu_count()
        cf.batch_size = int(val_loader.generator.dataset_length / cf.n_workers) + 2
        val_loader = dloader.create_data_gen_pipeline(cf, val_loader.generator._data, do_aug=False,
                                                      sample_pids_w_replace=False, max_batches=None,
                                                      raise_stop_iteration=True)
        for epoch in range(2):
            produced_ids = []
            for b, batch in enumerate(val_loader):
                produced_ids.append(batch["pid"])
            uni, cts = np.unique(np.concatenate(produced_ids), return_counts=True)
            assert np.all(cts == 1), "with batch size one: every item should occur exactly once.\n uni {}, cts {}".format(
                uni[cts>1], cts[cts>1])
            assert len(np.setdiff1d(val_loader.generator.dataset_pids, uni))==0, "not all val pids were shown."
            assert len(np.setdiff1d(uni, val_loader.generator.dataset_pids))==0, "pids shown that are not val set. impossible?"

        pass


if __name__=="__main__":
    stime = time.time()

    # NOTE(review): left-over debug entry point — runs only the manual RoIAlign
    # check instead of the full suite (unittest.main() is commented out below).
    t = CheckRoIAlignImplementation()
    t.manual_check()
    #unittest.main()

    # report total wall-clock runtime as h:mm:ss
    mins, secs = divmod((time.time() - stime), 60)
    h, mins = divmod(mins, 60)
    t = "{:d}h:{:02d}m:{:02d}s".format(int(h), int(mins), int(secs))
    print("{} total runtime: {}".format(os.path.split(__file__)[1], t))