diff --git a/evaluator.py b/evaluator.py index 682d686..89ca084 100644 --- a/evaluator.py +++ b/evaluator.py @@ -1,971 +1,967 @@ #!/usr/bin/env python # Copyright 2019 Division of Medical Image Computing, German Cancer Research Center (DKFZ). # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== import os from multiprocessing import Pool import pickle import time import numpy as np import pandas as pd from sklearn.metrics import roc_auc_score, average_precision_score from sklearn.metrics import roc_curve, precision_recall_curve from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score import torch import utils.model_utils as mutils import plotting as plg import warnings def get_roi_ap_from_df(inputs): ''' :param df: data frame. :param det_thresh: min_threshold for filtering out low confidence predictions. :param per_patient_ap: boolean flag. evaluate average precision per patient id and average over per-pid results, instead of computing one ap over whole data set. 
:return: average_precision (float) ''' df, det_thresh, per_patient_ap = inputs if per_patient_ap: pids_list = df.pid.unique() aps = [] for match_iou in df.match_iou.unique(): iou_df = df[df.match_iou == match_iou] for pid in pids_list: pid_df = iou_df[iou_df.pid == pid] all_p = len(pid_df[pid_df.class_label == 1]) pid_df = pid_df[(pid_df.det_type == 'det_fp') | (pid_df.det_type == 'det_tp')].sort_values('pred_score', ascending=False) pid_df = pid_df[pid_df.pred_score > det_thresh] if (len(pid_df) ==0 and all_p == 0): pass elif (len(pid_df) > 0 and all_p == 0): aps.append(0) else: aps.append(compute_roi_ap(pid_df, all_p)) return np.mean(aps) else: aps = [] for match_iou in df.match_iou.unique(): iou_df = df[df.match_iou == match_iou] # it's important to not apply the threshold before counting all_p in order to not lose the fn! all_p = len(iou_df[(iou_df.det_type == 'det_tp') | (iou_df.det_type == 'det_fn')]) # sorting out all entries that are not fp or tp or have confidence(=pred_score) <= detection_threshold iou_df = iou_df[(iou_df.det_type == 'det_fp') | (iou_df.det_type == 'det_tp')].sort_values('pred_score', ascending=False) iou_df = iou_df[iou_df.pred_score > det_thresh] if all_p>0: aps.append(compute_roi_ap(iou_df, all_p)) return np.mean(aps) def compute_roi_ap(df, all_p): """ adapted from: https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py :param df: dataframe containing class labels of predictions sorted in descending manner by their prediction score. :param all_p: number of all ground truth objects. (for denominator of recall.) :return: """ tp = df.class_label.values fp = (tp == 0) * 1 #recall thresholds, where precision will be measured R = np.linspace(0., 1., np.round((1. - 0.) / .01).astype(int) + 1, endpoint=True) tp_sum = np.cumsum(tp) fp_sum = np.cumsum(fp) n_dets = len(tp) rc = tp_sum / all_p pr = tp_sum / (fp_sum + tp_sum) # initialize precision array over recall steps (q=queries). q = [0. 
for _ in range(len(R))] # numpy is slow without cython optimization for accessing elements # python array gets significant speed improvement pr = pr.tolist() for i in range(n_dets - 1, 0, -1): if pr[i] > pr[i - 1]: pr[i - 1] = pr[i] #--> pr[i]<=pr[i-1] for all i since we want to consider the maximum #precision value for a queried interval # discretize empiric recall steps with given bins. assert np.all(rc[:-1]<=rc[1:]), "recall not sorted ascendingly" inds = np.searchsorted(rc, R, side='left') try: for rc_ix, pr_ix in enumerate(inds): q[rc_ix] = pr[pr_ix] except IndexError: #now q is filled with pr values up to first non-available index pass return np.mean(q) def roi_avp(inputs): ''' :param df: data frame. :param det_thresh: min_threshold for filtering out low confidence predictions. :param per_patient_ap: boolean flag. evaluate average precision per patient id and average over per-pid results, instead of computing one ap over whole data set. :return: average_precision (float) ''' df, det_thresh, per_patient_ap = inputs if per_patient_ap: pids_list = df.pid.unique() aps = [] for match_iou in df.match_iou.unique(): iou_df = df[df.match_iou == match_iou] for pid in pids_list: pid_df = iou_df[iou_df.pid == pid] all_p = len(pid_df[pid_df.class_label == 1]) mask = ((pid_df.rg_bins == pid_df.rg_bin_target) & (pid_df.det_type == 'det_tp')) | (pid_df.det_type == 'det_fp') pid_df = pid_df[mask].sort_values('pred_score', ascending=False) pid_df = pid_df[pid_df.pred_score > det_thresh] if (len(pid_df) ==0 and all_p == 0): pass elif (len(pid_df) > 0 and all_p == 0): aps.append(0) else: aps.append(compute_roi_ap(pid_df, all_p)) return np.mean(aps) else: aps = [] for match_iou in df.match_iou.unique(): iou_df = df[df.match_iou == match_iou] #it's important to not apply the threshold before counting all_positives! 
all_p = len(iou_df[(iou_df.det_type == 'det_tp') | (iou_df.det_type == 'det_fn')]) # filtering out tps which don't match rg_bin target at this point is same as reclassifying them as fn. # also sorting out all entries that are not fp or have confidence(=pred_score) <= detection_threshold mask = ((iou_df.rg_bins == iou_df.rg_bin_target) & (iou_df.det_type == 'det_tp')) | (iou_df.det_type == 'det_fp') iou_df = iou_df[mask].sort_values('pred_score', ascending=False) iou_df = iou_df[iou_df.pred_score > det_thresh] if all_p>0: aps.append(compute_roi_ap(iou_df, all_p)) return np.mean(aps) def compute_prc(df): """compute precision-recall curve with maximum precision per recall interval. :param df: :param all_p: # of all positive samples in data. :return: array: [precisions, recall query values] """ assert (df.class_label==1).any(), "cannot compute prc when no positives in data." all_p = len(df[(df.det_type == 'det_tp') | (df.det_type == 'det_fn')]) df = df[(df.det_type=="det_tp") | (df.det_type=="det_fp")] df = df.sort_values("pred_score", ascending=False) # recall thresholds, where precision will be measured scores = df.pred_score.values labels = df.class_label.values n_dets = len(scores) pr = np.zeros((n_dets,)) rc = pr.copy() for rank in range(n_dets): tp = np.count_nonzero(labels[:rank+1]==1) fp = np.count_nonzero(labels[:rank+1]==0) pr[rank] = tp/(tp+fp) rc[rank] = tp/all_p #after obj detection convention/ coco-dataset template: take maximum pr within intervals: # --> pr[i]<=pr[i-1] for all i since we want to consider the maximum # precision value for a queried interval for i in range(n_dets - 1, 0, -1): if pr[i] > pr[i - 1]: pr[i - 1] = pr[i] R = np.linspace(0., 1., np.round((1. - 0.) 
/ .01).astype(int) + 1, endpoint=True)#precision queried at R points inds = np.searchsorted(rc, R, side='left') queries = np.zeros((len(R),)) try: for q_ix, rank in enumerate(inds): queries[q_ix] = pr[rank] except IndexError: pass return np.array((queries, R)) def RMSE(y_true, y_pred, weights=None): if len(y_true)>0: return np.sqrt(mean_squared_error(y_true, y_pred, sample_weight=weights)) else: return np.nan def MAE_w_std(y_true, y_pred, weights=None): if len(y_true)>0: y_true, y_pred = np.array(y_true), np.array(y_pred) deltas = np.abs(y_true-y_pred) mae = np.average(deltas, weights=weights, axis=0).item() skmae = mean_absolute_error(y_true, y_pred, sample_weight=weights) assert np.allclose(mae, skmae, atol=1e-6), "mae {}, sklearn mae {}".format(mae, skmae) std = np.std(weights*deltas) return mae, std else: return np.nan, np.nan def MAE(y_true, y_pred, weights=None): if len(y_true)>0: return mean_absolute_error(y_true, y_pred, sample_weight=weights) else: return np.nan def accuracy(y_true, y_pred, weights=None): if len(y_true)>0: return accuracy_score(y_true, y_pred, sample_weight=weights) else: return np.nan # noinspection PyCallingNonCallable class Evaluator(): """ Evaluates given results dicts. Can return results as updated monitor_metrics. Can save test data frames to file. """ def __init__(self, cf, logger, mode='test'): """ :param mode: either 'train', 'val_sampling', 'val_patient' or 'test'. handles prediction lists of different forms. 
""" self.cf = cf self.logger = logger self.mode = mode self.regress_flag = any(['regression' in task for task in self.cf.prediction_tasks]) self.plot_dir = self.cf.plot_dir if not self.mode == "test" else self.cf.test_dir if self.cf.plot_prediction_histograms: self.hist_dir = os.path.join(self.plot_dir, 'histograms') os.makedirs(self.hist_dir, exist_ok=True) if self.cf.plot_stat_curves: self.curves_dir = os.path.join(self.plot_dir, 'stat_curves') os.makedirs(self.curves_dir, exist_ok=True) def eval_losses(self, batch_res_dicts): if hasattr(self.cf, "losses_to_monitor"): loss_names = self.cf.losses_to_monitor else: loss_names = {name for b_res_dict in batch_res_dicts for name in b_res_dict if 'loss' in name} self.epoch_losses = {l_name: torch.tensor([b_res_dict[l_name] for b_res_dict in batch_res_dicts if l_name in b_res_dict.keys()]).mean().item() for l_name in loss_names} def eval_segmentations(self, batch_res_dicts, pid_list): batch_dices = [b_res_dict['batch_dices'] for b_res_dict in batch_res_dicts if 'batch_dices' in b_res_dict.keys()] # shape (n_batches, n_seg_classes) if len(batch_dices) > 0: batch_dices = np.array(batch_dices) # dims n_batches x 1 in sampling / n_test_epochs x n_classes assert batch_dices.shape[1] == self.cf.num_seg_classes, "bdices shp {}, n seg cl {}, pid lst len {}".format( batch_dices.shape, self.cf.num_seg_classes, len(pid_list)) self.seg_df = pd.DataFrame() for seg_id in range(batch_dices.shape[1]): self.seg_df[self.cf.seg_id2label[seg_id].name + "_dice"] = batch_dices[:, seg_id] # one row== one batch, one column== one class # self.seg_df[self.cf.seg_id2label[seg_id].name+"_dice"] = np.concatenate(batch_dices[:,:,seg_id]) self.seg_df['fold'] = self.cf.fold if self.mode == "val_patient" or self.mode == "test": # need to make it more conform between sampling and patient-mode self.seg_df["pid"] = [pid for pix, pid in enumerate(pid_list)] # for b_inst in batch_inst_boxes[pix]] else: self.seg_df["pid"] = np.nan def eval_boxes(self, 
batch_res_dicts, pid_list, obj_cl_dict, obj_cl_identifiers={"gt":'class_targets', "pred":'box_pred_class_id'}): """ :param batch_res_dicts: :param pid_list: [pid_0, pid_1, ...] :return: """ if self.mode == 'train' or self.mode == 'val_sampling': # one pid per batch element # batch_size > 1, with varying patients across batch: # [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...] # -> [results_0, results_1, ..] batch_inst_boxes = [b_res_dict['boxes'] for b_res_dict in batch_res_dicts] # len: nr of batches in epoch batch_inst_boxes = [[b_inst_boxes] for whole_batch_boxes in batch_inst_boxes for b_inst_boxes in whole_batch_boxes] # len: batch instances of whole epoch assert np.all(len(b_boxes_list) == self.cf.batch_size for b_boxes_list in batch_inst_boxes) elif self.mode == "val_patient" or self.mode == "test": # patient processing, one element per batch = one patient. # [[results_0, pid_0], [results_1, pid_1], ...] -> [results_0, results_1, ..] # in patientbatchiterator there is only one pid per batch batch_inst_boxes = [b_res_dict['boxes'] for b_res_dict in batch_res_dicts] # in patient mode not actually per batch instance, but per whole batch! if hasattr(self.cf, "eval_test_separately") and self.cf.eval_test_separately: """ you could write your own routines to add GTs to raw predictions for evaluation. implemented standard is: cf.eval_test_separately = False or not set --> GTs are saved at same time and in same file as raw prediction results. 
""" raise NotImplementedError assert len(batch_inst_boxes) == len(pid_list) df_list_preds = [] df_list_labels = [] df_list_class_preds = [] df_list_pids = [] df_list_type = [] df_list_match_iou = [] df_list_n_missing = [] df_list_regressions = [] df_list_rg_targets = [] df_list_rg_bins = [] df_list_rg_bin_targets = [] df_list_rg_uncs = [] for match_iou in self.cf.ap_match_ious: self.logger.info('evaluating with ap_match_iou: {}'.format(match_iou)) for cl in list(obj_cl_dict.keys()): for pix, pid in enumerate(pid_list): len_df_list_before_patient = len(df_list_pids) # input of each batch element is a list of boxes, where each box is a dictionary. for b_inst_ix, b_boxes_list in enumerate(batch_inst_boxes[pix]): b_tar_boxes = [] b_cand_boxes, b_cand_scores, b_cand_n_missing = [], [], [] if self.regress_flag: b_tar_regs, b_tar_rg_bins = [], [] b_cand_regs, b_cand_rg_bins, b_cand_rg_uncs = [], [], [] for box in b_boxes_list: # each box is either gt or detection or proposal/anchor # we need all gts in the same order & all dets in same order if box['box_type'] == 'gt' and box[obj_cl_identifiers["gt"]] == cl: b_tar_boxes.append(box["box_coords"]) if self.regress_flag: b_tar_regs.append(np.array(box['regression_targets'], dtype='float32')) b_tar_rg_bins.append(box['rg_bin_targets']) if box['box_type'] == 'det' and box[obj_cl_identifiers["pred"]] == cl: b_cand_boxes.append(box["box_coords"]) b_cand_scores.append(box["box_score"]) b_cand_n_missing.append(box["cluster_n_missing"] if 'cluster_n_missing' in box.keys() else np.nan) if self.regress_flag: b_cand_regs.append(box["regression"]) b_cand_rg_bins.append(box["rg_bin"]) b_cand_rg_uncs.append(box["rg_uncertainty"] if 'rg_uncertainty' in box.keys() else np.nan) b_tar_boxes = np.array(b_tar_boxes) b_cand_boxes, b_cand_scores, b_cand_n_missing = np.array(b_cand_boxes), np.array(b_cand_scores), np.array(b_cand_n_missing) if self.regress_flag: b_tar_regs, b_tar_rg_bins = np.array(b_tar_regs), np.array(b_tar_rg_bins) b_cand_regs, 
b_cand_rg_bins, b_cand_rg_uncs = np.array(b_cand_regs), np.array(b_cand_rg_bins), np.array(b_cand_rg_uncs) # check if predictions and ground truth boxes exist and match them according to match_iou. if not 0 in b_cand_boxes.shape and not 0 in b_tar_boxes.shape: assert np.all(np.round(b_cand_scores,6) <= 1.), "there is a box score>1: {}".format(b_cand_scores[~(b_cand_scores<=1.)]) #coords_check = np.array([len(coords)==self.cf.dim*2 for coords in b_cand_boxes]) #assert np.all(coords_check), "cand box with wrong bcoords dim: {}, mode: {}".format(b_cand_boxes[~coords_check], self.mode) expected_dim = len(b_cand_boxes[0]) assert np.all([len(coords) == expected_dim for coords in b_tar_boxes]), \ "gt/cand box coords mismatch, expected dim: {}.".format(expected_dim) # overlaps: shape len(cand_boxes) x len(tar_boxes) overlaps = mutils.compute_overlaps(b_cand_boxes, b_tar_boxes) # match_cand_ixs: shape (nr_of_matches,) # theses indices are the indices of b_cand_boxes match_cand_ixs = np.argwhere(np.max(overlaps, axis=1) > match_iou)[:, 0] non_match_cand_ixs = np.argwhere(np.max(overlaps, 1) <= match_iou)[:, 0] # the corresponding gt assigned to the pred boxes by highest iou overlap, # i.e., match_gt_ixs holds index into b_tar_boxes for each entry in match_cand_ixs, # i.e., gt_ixs and cand_ixs are paired via their position in their list # (cand_ixs[j] corresponds to gt_ixs[j]) match_gt_ixs = np.argmax(overlaps[match_cand_ixs, :], axis=1) if \ not 0 in match_cand_ixs.shape else np.array([]) assert len(match_gt_ixs)==len(match_cand_ixs) #match_gt_ixs: shape (nr_of_matches,) or 0 non_match_gt_ixs = np.array( [ii for ii in np.arange(b_tar_boxes.shape[0]) if ii not in match_gt_ixs]) unique, counts = np.unique(match_gt_ixs, return_counts=True) # check for double assignments, i.e. two predictions having been assigned to the same gt. # according to the COCO-metrics, only one prediction counts as true positive, the rest counts as # false positive. 
This case is supposed to be avoided by the model itself by, # e.g. using a low enough NMS threshold. if np.any(counts > 1): double_match_gt_ixs = unique[np.argwhere(counts > 1)[:, 0]] keep_max = [] double_match_list = [] for dg in double_match_gt_ixs: double_match_cand_ixs = match_cand_ixs[np.argwhere(match_gt_ixs == dg)] keep_max.append(double_match_cand_ixs[np.argmax(b_cand_scores[double_match_cand_ixs])]) double_match_list += [ii for ii in double_match_cand_ixs] fp_ixs = np.array([ii for ii in match_cand_ixs if (ii in double_match_list and ii not in keep_max)]) # count as fp: boxes that match gt above match_iou threshold but have not highest class confidence score match_gt_ixs = np.array([gt_ix for ii, gt_ix in enumerate(match_gt_ixs) if match_cand_ixs[ii] not in fp_ixs]) match_cand_ixs = np.array([cand_ix for cand_ix in match_cand_ixs if cand_ix not in fp_ixs]) assert len(match_gt_ixs) == len(match_cand_ixs) df_list_preds += [ii for ii in b_cand_scores[fp_ixs]] df_list_labels += [0] * fp_ixs.shape[0] # means label==gt==0==bg for all these fp_ixs df_list_class_preds += [cl] * fp_ixs.shape[0] df_list_n_missing += [n for n in b_cand_n_missing[fp_ixs]] if self.regress_flag: df_list_regressions += [r for r in b_cand_regs[fp_ixs]] df_list_rg_bins += [r for r in b_cand_rg_bins[fp_ixs]] df_list_rg_uncs += [r for r in b_cand_rg_uncs[fp_ixs]] df_list_rg_targets += [[0.]*self.cf.regression_n_features] * fp_ixs.shape[0] df_list_rg_bin_targets += [0.] 
* fp_ixs.shape[0] df_list_pids += [pid] * fp_ixs.shape[0] df_list_type += ['det_fp'] * fp_ixs.shape[0] # matched/tp: if not 0 in match_cand_ixs.shape: df_list_preds += list(b_cand_scores[match_cand_ixs]) df_list_labels += [1] * match_cand_ixs.shape[0] df_list_class_preds += [cl] * match_cand_ixs.shape[0] df_list_n_missing += list(b_cand_n_missing[match_cand_ixs]) if self.regress_flag: df_list_regressions += list(b_cand_regs[match_cand_ixs]) df_list_rg_bins += list(b_cand_rg_bins[match_cand_ixs]) df_list_rg_uncs += list(b_cand_rg_uncs[match_cand_ixs]) assert len(match_cand_ixs)==len(match_gt_ixs) df_list_rg_targets += list(b_tar_regs[match_gt_ixs]) df_list_rg_bin_targets += list(b_tar_rg_bins[match_gt_ixs]) df_list_pids += [pid] * match_cand_ixs.shape[0] df_list_type += ['det_tp'] * match_cand_ixs.shape[0] # rest fp: if not 0 in non_match_cand_ixs.shape: df_list_preds += list(b_cand_scores[non_match_cand_ixs]) df_list_labels += [0] * non_match_cand_ixs.shape[0] df_list_class_preds += [cl] * non_match_cand_ixs.shape[0] df_list_n_missing += list(b_cand_n_missing[non_match_cand_ixs]) if self.regress_flag: df_list_regressions += list(b_cand_regs[non_match_cand_ixs]) df_list_rg_bins += list(b_cand_rg_bins[non_match_cand_ixs]) df_list_rg_uncs += list(b_cand_rg_uncs[non_match_cand_ixs]) df_list_rg_targets += [[0.]*self.cf.regression_n_features] * non_match_cand_ixs.shape[0] df_list_rg_bin_targets += [0.] * non_match_cand_ixs.shape[0] df_list_pids += [pid] * non_match_cand_ixs.shape[0] df_list_type += ['det_fp'] * non_match_cand_ixs.shape[0] # fn: if not 0 in non_match_gt_ixs.shape: df_list_preds += [0] * non_match_gt_ixs.shape[0] df_list_labels += [1] * non_match_gt_ixs.shape[0] df_list_class_preds += [cl] * non_match_gt_ixs.shape[0] df_list_n_missing += [np.nan] * non_match_gt_ixs.shape[0] if self.regress_flag: df_list_regressions += [[0.]*self.cf.regression_n_features] * non_match_gt_ixs.shape[0] df_list_rg_bins += [0.] 
* non_match_gt_ixs.shape[0] df_list_rg_uncs += [np.nan] * non_match_gt_ixs.shape[0] df_list_rg_targets += list(b_tar_regs[non_match_gt_ixs]) df_list_rg_bin_targets += list(b_tar_rg_bins[non_match_gt_ixs]) df_list_pids += [pid] * non_match_gt_ixs.shape[0] df_list_type += ['det_fn'] * non_match_gt_ixs.shape[0] # only fp: if not 0 in b_cand_boxes.shape and 0 in b_tar_boxes.shape: # means there is no gt in all samples! any preds have to be fp. df_list_preds += list(b_cand_scores) df_list_labels += [0] * b_cand_boxes.shape[0] df_list_class_preds += [cl] * b_cand_boxes.shape[0] df_list_n_missing += list(b_cand_n_missing) if self.regress_flag: df_list_regressions += list(b_cand_regs) df_list_rg_bins += list(b_cand_rg_bins) df_list_rg_uncs += list(b_cand_rg_uncs) df_list_rg_targets += [[0.]*self.cf.regression_n_features] * b_cand_boxes.shape[0] df_list_rg_bin_targets += [0.] * b_cand_boxes.shape[0] df_list_pids += [pid] * b_cand_boxes.shape[0] df_list_type += ['det_fp'] * b_cand_boxes.shape[0] # only fn: if 0 in b_cand_boxes.shape and not 0 in b_tar_boxes.shape: df_list_preds += [0] * b_tar_boxes.shape[0] df_list_labels += [1] * b_tar_boxes.shape[0] df_list_class_preds += [cl] * b_tar_boxes.shape[0] df_list_n_missing += [np.nan] * b_tar_boxes.shape[0] if self.regress_flag: df_list_regressions += [[0.]*self.cf.regression_n_features] * b_tar_boxes.shape[0] df_list_rg_bins += [0.] * b_tar_boxes.shape[0] df_list_rg_uncs += [np.nan] * b_tar_boxes.shape[0] df_list_rg_targets += list(b_tar_regs) df_list_rg_bin_targets += list(b_tar_rg_bins) df_list_pids += [pid] * b_tar_boxes.shape[0] df_list_type += ['det_fn'] * b_tar_boxes.shape[0] # empty patient with 0 detections needs empty patient score, in order to not disappear from stats. # filtered out for roi-level evaluation later. During training (and val_sampling), # tn are assigned per sample independently of associated patients. 
# i.e., patient_tn is also meant as sample_tn if a list of samples is evaluated instead of whole patient if len(df_list_pids) == len_df_list_before_patient: df_list_preds += [0] df_list_labels += [0] df_list_class_preds += [cl] df_list_n_missing += [np.nan] if self.regress_flag: df_list_regressions += [[0.]*self.cf.regression_n_features] df_list_rg_bins += [0.] df_list_rg_uncs += [np.nan] df_list_rg_targets += [[0.]*self.cf.regression_n_features] df_list_rg_bin_targets += [0.] df_list_pids += [pid] df_list_type += ['patient_tn'] # true negative: no ground truth boxes, no detections. df_list_match_iou += [match_iou] * (len(df_list_preds) - len(df_list_match_iou)) self.test_df = pd.DataFrame() self.test_df['pred_score'] = df_list_preds self.test_df['class_label'] = df_list_labels # class labels are gt, 0,1, only indicate neg/pos (or bg/fg) remapped from all classes self.test_df['pred_class'] = df_list_class_preds # can be diff than 0,1 self.test_df['pid'] = df_list_pids self.test_df['det_type'] = df_list_type self.test_df['fold'] = self.cf.fold self.test_df['match_iou'] = df_list_match_iou self.test_df['cluster_n_missing'] = df_list_n_missing if self.regress_flag: self.test_df['regressions'] = df_list_regressions self.test_df['rg_targets'] = df_list_rg_targets self.test_df['rg_uncertainties'] = df_list_rg_uncs self.test_df['rg_bins'] = df_list_rg_bins - # super weird error: pandas does not properly add an attribute if column is named "rg_bin_targets" ... ?!? self.test_df['rg_bin_target'] = df_list_rg_bin_targets assert hasattr(self.test_df, "rg_bin_target") - #fn_df = self.test_df[self.test_df["det_type"] == "det_fn"] - - pass def evaluate_predictions(self, results_list, monitor_metrics=None): """ Performs the matching of predicted boxes and ground truth boxes. Loops over list of matching IoUs and foreground classes. 
Resulting info of each prediction is stored as one line in an internal dataframe, with the keys: det_type: 'tp' (true positive), 'fp' (false positive), 'fn' (false negative), 'tn' (true negative) pred_class: foreground class which the object predicts. pid: corresponding patient-id. pred_score: confidence score [0, 1] fold: corresponding fold of CV. match_iou: utilized IoU for matching. :param results_list: list of model predictions. Either from train/val_sampling (patch processing) for monitoring with form: [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...] Or from val_patient/testing (patient processing), with form: [[results_0, pid_0], [results_1, pid_1], ...]) :param monitor_metrics (optional): dict of dicts with all metrics of previous epochs. :return monitor_metrics: if provided (during training), return monitor_metrics now including results of current epoch. """ # gets results_list = [[batch_instances_box_lists], [batch_instances_pids]]*n_batches # we want to evaluate one batch_instance (= 2D or 3D image) at a time. self.logger.info('evaluating in mode {}'.format(self.mode)) batch_res_dicts = [batch[0] for batch in results_list] # len: nr of batches in epoch if self.mode == 'train' or self.mode=='val_sampling': # one pid per batch element # [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...] # -> [pid_0, pid_1, ...] # additional list wrapping to make conform with below per-patient batches, where one pid is linked to more than one batch instance pid_list = [batch_instance_pid for batch in results_list for batch_instance_pid in batch[1]] elif self.mode == "val_patient" or self.mode=="test": # [[results_0, pid_0], [results_1, pid_1], ...] -> [pid_0, pid_1, ...] 
# in patientbatchiterator there is only one pid per batch pid_list = [np.unique(batch[1]) for batch in results_list] assert np.all([len(pid)==1 for pid in pid_list]), "pid list in patient-eval mode, should only contain a single scalar per patient: {}".format(pid_list) pid_list = [pid[0] for pid in pid_list] else: raise Exception("undefined run mode encountered") self.eval_losses(batch_res_dicts) self.eval_segmentations(batch_res_dicts, pid_list) self.eval_boxes(batch_res_dicts, pid_list, self.cf.class_dict) if monitor_metrics is not None: # return all_stats, updated monitor_metrics return self.return_metrics(self.test_df, self.cf.class_dict, monitor_metrics) def return_metrics(self, df, obj_cl_dict, monitor_metrics=None, boxes_only=False): """ Calculates metric scores for internal data frame. Called directly from evaluate_predictions during training for monitoring, or from score_test_df during inference (for single folds or aggregated test set). Loops over foreground classes and score_levels ('roi' and/or 'patient'), gets scores and stores them. Optionally creates plots of prediction histograms and ROC/PR curves. :param df: Data frame that holds evaluated predictions. :param obj_cl_dict: Dict linking object-class ids to object-class names. E.g., {1: "bikes", 2 : "cars"}. Set in configs as cf.class_dict. :param monitor_metrics: dict of dicts with all metrics of previous epochs. This function adds metrics for current epoch and returns the same object. :param boxes_only: whether to produce metrics only for the boxes, not the segmentations. :return: all_stats: list. Contains dicts with resulting scores for each combination of foreground class and score_level. 
:return: monitor_metrics """ # -------------- monitoring independent of class, score level ------------ if monitor_metrics is not None: for l_name in self.epoch_losses: monitor_metrics[l_name] = [self.epoch_losses[l_name]] # -------------- metrics calc dependent on class, score level ------------ all_stats = [] # all_stats: one entry per score_level per class for cl in list(obj_cl_dict.keys()): # bg eval is neglected cl_name = obj_cl_dict[cl] cl_df = df[df.pred_class == cl] if hasattr(self, "seg_df") and not boxes_only: dice_col = self.cf.seg_id2label[cl].name+"_dice" seg_cl_df = self.seg_df.loc[:,['pid', dice_col, 'fold']] for score_level in self.cf.report_score_level: stats_dict = {} stats_dict['name'] = 'fold_{} {} {}'.format(self.cf.fold, score_level, cl_name) # -------------- RoI-based ----------------- if score_level == 'rois': stats_dict['auc'] = np.nan stats_dict['roc'] = np.nan if monitor_metrics is not None: tn = len(cl_df[cl_df.det_type == "patient_tn"]) tp = len(cl_df[(cl_df.det_type == "det_tp")&(cl_df.pred_score>self.cf.min_det_thresh)]) fp = len(cl_df[(cl_df.det_type == "det_fp")&(cl_df.pred_score>self.cf.min_det_thresh)]) fn = len(cl_df[cl_df.det_type == "det_fn"]) sens = np.divide(tp, (fn + tp)) monitor_metrics.update({"Bin_Stats/" + cl_name + "_fp": [fp], "Bin_Stats/" + cl_name + "_tp": [tp], "Bin_Stats/" + cl_name + "_fn": [fn], "Bin_Stats/" + cl_name + "_tn": [tn], "Bin_Stats/" + cl_name + "_sensitivity": [sens]}) # list wrapping only needed bc other metrics are recorded over all epochs; spec_df = cl_df[cl_df.det_type != 'patient_tn'] if self.regress_flag: # filter false negatives out for regression-only eval since regressor didn't predict truncd_df = spec_df[(((spec_df.det_type == "det_fp") | ( spec_df.det_type == "det_tp")) & spec_df.pred_score > self.cf.min_det_thresh)] truncd_df_tp = truncd_df[truncd_df.det_type == "det_tp"] weights, weights_tp = truncd_df.pred_score.tolist(), truncd_df_tp.pred_score.tolist() y_true, y_pred = 
truncd_df.rg_targets.tolist(), truncd_df.regressions.tolist() stats_dict["rg_RMSE"] = RMSE(y_true, y_pred) stats_dict["rg_MAE"] = MAE(y_true, y_pred) stats_dict["rg_RMSE_weighted"] = RMSE(y_true, y_pred, weights) stats_dict["rg_MAE_weighted"] = MAE(y_true, y_pred, weights) y_true, y_pred = truncd_df_tp.rg_targets.tolist(), truncd_df_tp.regressions.tolist() stats_dict["rg_MAE_weighted_tp"] = MAE(y_true, y_pred, weights_tp) stats_dict["rg_MAE_w_std_weighted_tp"] = MAE_w_std(y_true, y_pred, weights_tp) y_true, y_pred = truncd_df.rg_bin_target.tolist(), truncd_df.rg_bins.tolist() stats_dict["rg_bin_accuracy"] = accuracy(y_true, y_pred) stats_dict["rg_bin_accuracy_weighted"] = accuracy(y_true, y_pred, weights) y_true, y_pred = truncd_df_tp.rg_bin_target.tolist(), truncd_df_tp.rg_bins.tolist() stats_dict["rg_bin_accuracy_weighted_tp"] = accuracy(y_true, y_pred, weights_tp) if np.any(~truncd_df.rg_uncertainties.isna()): # det_fn are expected to be NaN so they drop out in means stats_dict.update({"rg_uncertainty": truncd_df.rg_uncertainties.mean(), "rg_uncertainty_tp": truncd_df_tp.rg_uncertainties.mean(), "rg_uncertainty_tp_weighted": (truncd_df_tp.rg_uncertainties * truncd_df_tp.pred_score).sum() / truncd_df_tp.pred_score.sum() }) if (spec_df.class_label==1).any(): stats_dict['ap'] = get_roi_ap_from_df((spec_df, self.cf.min_det_thresh, self.cf.per_patient_ap)) stats_dict['prc'] = precision_recall_curve(spec_df.class_label.tolist(), spec_df.pred_score.tolist()) if self.regress_flag: stats_dict['avp'] = roi_avp((spec_df, self.cf.min_det_thresh, self.cf.per_patient_ap)) else: stats_dict['ap'] = np.nan stats_dict['prc'] = np.nan stats_dict['avp'] = np.nan # np.nan is formattable by __format__ as a float, None-type is not if hasattr(self, "seg_df") and not boxes_only: stats_dict["dice"] = seg_cl_df.loc[:,dice_col].mean() # mean per all rois in this epoch stats_dict["dice_std"] = seg_cl_df.loc[:,dice_col].std() # for the aggregated test set case, additionally get the scores of 
averaging over fold results. if self.cf.evaluate_fold_means and len(df.fold.unique()) > 1: aps = [] for fold in df.fold.unique(): fold_df = spec_df[spec_df.fold == fold] if (fold_df.class_label==1).any(): aps.append(get_roi_ap_from_df((fold_df, self.cf.min_det_thresh, self.cf.per_patient_ap))) stats_dict['ap_folds_mean'] = np.mean(aps) if len(aps)>0 else np.nan stats_dict['ap_folds_std'] = np.std(aps) if len(aps)>0 else np.nan stats_dict['auc_folds_mean'] = np.nan stats_dict['auc_folds_std'] = np.nan if self.regress_flag: avps, accuracies, MAEs = [], [], [] for fold in df.fold.unique(): fold_df = spec_df[spec_df.fold == fold] if (fold_df.class_label == 1).any(): avps.append(roi_avp((fold_df, self.cf.min_det_thresh, self.cf.per_patient_ap))) truncd_df_tp = fold_df[((fold_df.det_type == "det_tp") & fold_df.pred_score > self.cf.min_det_thresh)] weights_tp = truncd_df_tp.pred_score.tolist() y_true, y_pred = truncd_df_tp.rg_bin_target.tolist(), truncd_df_tp.rg_bins.tolist() accuracies.append(accuracy(y_true, y_pred, weights_tp)) y_true, y_pred = truncd_df_tp.rg_targets.tolist(), truncd_df_tp.regressions.tolist() MAEs.append(MAE_w_std(y_true, y_pred, weights_tp)) stats_dict['avp_folds_mean'] = np.mean(avps) if len(avps) > 0 else np.nan stats_dict['avp_folds_std'] = np.std(avps) if len(avps) > 0 else np.nan stats_dict['rg_bin_accuracy_weighted_tp_folds_mean'] = np.mean(accuracies) if len(accuracies) > 0 else np.nan stats_dict['rg_bin_accuracy_weighted_tp_folds_std'] = np.std(accuracies) if len(accuracies) > 0 else np.nan stats_dict['rg_MAE_w_std_weighted_tp_folds_mean'] = np.mean(MAEs, axis=0) if len(MAEs) > 0 else np.nan stats_dict['rg_MAE_w_std_weighted_tp_folds_std'] = np.std(MAEs, axis=0) if len(MAEs) > 0 else np.nan if hasattr(self, "seg_df") and not boxes_only and self.cf.evaluate_fold_means and len(seg_cl_df.fold.unique()) > 1: fold_means = seg_cl_df.groupby(['fold'], as_index=True).agg({dice_col:"mean"}) stats_dict["dice_folds_mean"] = float(fold_means.mean()) 
stats_dict["dice_folds_std"] = float(fold_means.std()) # -------------- patient-based ----------------- # on patient level, aggregate predictions per patient (pid): The patient predicted score is the highest # confidence prediction for this class. The patient class label is 1 if roi of this class exists in patient, else 0. if score_level == 'patient': #this is the critical part in patient scoring: only the max gt and max pred score are taken per patient! #--> does mix up values from separate detections spec_df = cl_df.groupby(['pid'], as_index=False) agg_args = {'class_label': 'max', 'pred_score': 'max', 'fold': 'first'} if self.regress_flag: # pandas throws error if aggregated value is np.array, not if is list. agg_args.update({'regressions': lambda series: list(series.iloc[np.argmax(series.apply(np.linalg.norm).values)]), 'rg_targets': lambda series: list(series.iloc[np.argmax(series.apply(np.linalg.norm).values)]), 'rg_bins': 'max', 'rg_bin_target': 'max', 'rg_uncertainties': 'max' }) if hasattr(cl_df, "cluster_n_missing"): agg_args.update({'cluster_n_missing': 'mean'}) spec_df = spec_df.agg(agg_args) if len(spec_df.class_label.unique()) > 1: stats_dict['auc'] = roc_auc_score(spec_df.class_label.tolist(), spec_df.pred_score.tolist()) stats_dict['roc'] = roc_curve(spec_df.class_label.tolist(), spec_df.pred_score.tolist()) else: stats_dict['auc'] = np.nan stats_dict['roc'] = np.nan if (spec_df.class_label == 1).any(): patient_cl_labels = spec_df.class_label.tolist() stats_dict['ap'] = average_precision_score(patient_cl_labels, spec_df.pred_score.tolist()) stats_dict['prc'] = precision_recall_curve(patient_cl_labels, spec_df.pred_score.tolist()) if self.regress_flag: avp_scores = spec_df[spec_df.rg_bins == spec_df.rg_bin_target].pred_score.tolist() avp_scores += [0.] 
* (len(patient_cl_labels) - len(avp_scores)) stats_dict['avp'] = average_precision_score(patient_cl_labels, avp_scores) else: stats_dict['ap'] = np.nan stats_dict['prc'] = np.nan stats_dict['avp'] = np.nan if self.regress_flag: y_true, y_pred = spec_df.rg_targets.tolist(), spec_df.regressions.tolist() stats_dict["rg_RMSE"] = RMSE(y_true, y_pred) stats_dict["rg_MAE"] = MAE(y_true, y_pred) stats_dict["rg_bin_accuracy"] = accuracy(spec_df.rg_bin_target.tolist(), spec_df.rg_bins.tolist()) stats_dict["rg_uncertainty"] = spec_df.rg_uncertainties.mean() if hasattr(self, "seg_df") and not boxes_only: seg_cl_df = seg_cl_df.groupby(['pid'], as_index=False).agg( {dice_col: "mean", "fold": "first"}) # mean of all rois per patient in this epoch stats_dict["dice"] = seg_cl_df.loc[:,dice_col].mean() #mean of all patients stats_dict["dice_std"] = seg_cl_df.loc[:, dice_col].std() # for the aggregated test set case, additionally get the scores for averaging over fold results. if self.cf.evaluate_fold_means and len(df.fold.unique()) > 1 and self.mode in ["test", "analysis"]: aucs = [] aps = [] for fold in df.fold.unique(): fold_df = spec_df[spec_df.fold == fold] if (fold_df.class_label==1).any(): aps.append( average_precision_score(fold_df.class_label.tolist(), fold_df.pred_score.tolist())) if len(fold_df.class_label.unique())>1: aucs.append(roc_auc_score(fold_df.class_label.tolist(), fold_df.pred_score.tolist())) stats_dict['auc_folds_mean'] = np.mean(aucs) stats_dict['auc_folds_std'] = np.std(aucs) stats_dict['ap_folds_mean'] = np.mean(aps) stats_dict['ap_folds_std'] = np.std(aps) if hasattr(self, "seg_df") and not boxes_only and self.cf.evaluate_fold_means and len(seg_cl_df.fold.unique()) > 1: fold_means = seg_cl_df.groupby(['fold'], as_index=True).agg({dice_col:"mean"}) stats_dict["dice_folds_mean"] = float(fold_means.mean()) stats_dict["dice_folds_std"] = float(fold_means.std()) all_stats.append(stats_dict) # -------------- monitoring, visualisation ----------------- # fill new 
results into monitor_metrics dict. for simplicity, only one class (of interest) is monitored on patient level. patient_interests = [self.cf.class_dict[self.cf.patient_class_of_interest],] if hasattr(self.cf, "bin_dict"): patient_interests += [self.cf.bin_dict[self.cf.patient_bin_of_interest]] if monitor_metrics is not None and (score_level != 'patient' or cl_name in patient_interests): name = 'patient_'+cl_name if score_level == 'patient' else cl_name for metric in self.cf.metrics: if metric in stats_dict.keys(): monitor_metrics[name + '_'+metric].append(stats_dict[metric]) else: print("WARNING: skipped monitor metric {}_{} since not avail".format(name, metric)) # histograms if self.cf.plot_prediction_histograms: out_filename = os.path.join(self.hist_dir, 'pred_hist_{}_{}_{}_{}'.format( self.cf.fold, self.mode, score_level, cl_name)) plg.plot_prediction_hist(self.cf, spec_df, out_filename) # analysis of the hyper-parameter cf.min_det_thresh, for optimization on validation set. if self.cf.scan_det_thresh and "val" in self.mode: conf_threshs = list(np.arange(0.8, 1, 0.02)) pool = Pool(processes=self.cf.n_workers) mp_inputs = [[spec_df, ii, self.cf.per_patient_ap] for ii in conf_threshs] aps = pool.map(get_roi_ap_from_df, mp_inputs, chunksize=1) pool.close() pool.join() self.logger.info('results from scanning over det_threshs: {}'.format([[i, j] for i, j in zip(conf_threshs, aps)])) if self.cf.plot_stat_curves: out_filename = os.path.join(self.curves_dir, '{}_{}_stat_curves'.format(self.cf.fold, self.mode)) plg.plot_stat_curves(self.cf, all_stats, out_filename) if self.cf.plot_prediction_histograms and hasattr(df, "cluster_n_missing") and df.cluster_n_missing.notna().any(): out_filename = os.path.join(self.hist_dir, 'n_missing_hist_{}_{}.png'.format(self.cf.fold, self.mode)) plg.plot_wbc_n_missing(self.cf, df, outfile=out_filename) return all_stats, monitor_metrics def score_test_df(self, max_fold=None, internal_df=True): """ Writes out resulting scores to text files: 
First checks for class-internal-df (typically current) fold, gets resulting scores, writes them to a text file and pickles data frame. Also checks if data-frame pickles of all folds of cross-validation exist in exp_dir. If true, loads all dataframes, aggregates test sets over folds, and calculates and writes out overall metrics. """ # this should maybe be extended to auc, ap stds. metrics_to_score = self.cf.metrics # + [ m+ext for m in self.cf.metrics if "dice" in m for ext in ["_std"]] if internal_df: self.test_df.to_pickle(os.path.join(self.cf.test_dir, '{}_test_df.pkl'.format(self.cf.fold))) if hasattr(self, "seg_df"): self.seg_df.to_pickle(os.path.join(self.cf.test_dir, '{}_test_seg_df.pkl'.format(self.cf.fold))) stats, _ = self.return_metrics(self.test_df, self.cf.class_dict) with open(os.path.join(self.cf.test_dir, 'results.txt'), 'a') as handle: handle.write('\n****************************\n') handle.write('\nresults for fold {}, {} \n'.format(self.cf.fold, time.strftime("%d/%m/%y %H:%M:%S"))) handle.write('\n****************************\n') handle.write('\nfold df shape {}\n \n'.format(self.test_df.shape)) for s in stats: for metric in metrics_to_score: if metric in s.keys(): #needed as long as no dice on patient level poss if "accuracy" in metric: handle.write('{} {:0.4f} '.format(metric, s[metric])) else: handle.write('{} {:0.3f} '.format(metric, s[metric])) else: print("WARNING: skipped metric {} since not avail".format(metric)) handle.write('{} \n'.format(s['name'])) fold_df_paths = sorted([ii for ii in os.listdir(self.cf.test_dir) if 'test_df.pkl' in ii]) fold_seg_df_paths = sorted([ii for ii in os.listdir(self.cf.test_dir) if 'test_seg_df.pkl' in ii]) for paths in [fold_df_paths, fold_seg_df_paths]: assert len(paths)<= self.cf.n_cv_splits, "found {} > nr of cv splits results dfs in {}".format(len(paths), self.cf.test_dir) if max_fold is None: max_fold = self.cf.n_cv_splits-1 if self.cf.fold == max_fold: print("max fold/overall stats triggered") if 
self.cf.evaluate_fold_means: metrics_to_score += [m + ext for m in self.cf.metrics for ext in ("_folds_mean", "_folds_std")] with open(os.path.join(self.cf.test_dir, 'results.txt'), 'a') as handle: self.cf.fold = 'overall' dfs_list = [pd.read_pickle(os.path.join(self.cf.test_dir, ii)) for ii in fold_df_paths] seg_dfs_list = [pd.read_pickle(os.path.join(self.cf.test_dir, ii)) for ii in fold_seg_df_paths] self.test_df = pd.concat(dfs_list, sort=True) if len(seg_dfs_list)>0: self.seg_df = pd.concat(seg_dfs_list, sort=True) stats, _ = self.return_metrics(self.test_df, self.cf.class_dict) handle.write('\n****************************\n') handle.write('\nOVERALL RESULTS \n') handle.write('\n****************************\n') handle.write('\ndf shape \n \n'.format(self.test_df.shape)) for s in stats: for metric in metrics_to_score: if metric in s.keys(): handle.write('{} {:0.3f} '.format(metric, s[metric])) handle.write('{} \n'.format(s['name'])) results_table_path = os.path.join(self.cf.test_dir,"../../", 'results_table.csv') with open(results_table_path, 'a') as handle: #---column headers--- handle.write('\n{},'.format("Experiment Name")) handle.write('{},'.format("Time Stamp")) handle.write('{},'.format("Samples Seen")) handle.write('{},'.format("Spatial Dim")) handle.write('{},'.format("Patch Size")) handle.write('{},'.format("CV Folds")) handle.write('{},'.format("{}-clustering IoU".format(self.cf.clustering))) handle.write('{},'.format("Merge-2D-to-3D IoU")) if hasattr(self.cf, "test_against_exact_gt"): handle.write('{},'.format('Exact GT')) for s in stats: assert "overall" in s['name'].split(" ")[0] if self.cf.class_dict[self.cf.patient_class_of_interest] in s['name']: for metric in metrics_to_score: if metric in s.keys() and not np.isnan(s[metric]): if metric=='ap': handle.write('{}_{} : {}_{},'.format(*s['name'].split(" ")[1:], metric, int(np.mean(self.cf.ap_match_ious)*100))) elif not "folds_std" in metric: handle.write('{}_{} : {},'.format(*s['name'].split(" 
")[1:], metric)) else: print("WARNING: skipped metric {} since not avail".format(metric)) handle.write('\n') #--- columns content--- handle.write('{},'.format(self.cf.exp_dir.split(os.sep)[-1])) handle.write('{},'.format(time.strftime("%d%b%y %H:%M:%S"))) handle.write('{},'.format(self.cf.num_epochs*self.cf.num_train_batches*self.cf.batch_size)) handle.write('{}D,'.format(self.cf.dim)) handle.write('{},'.format("x".join([str(self.cf.patch_size[i]) for i in range(self.cf.dim)]))) handle.write('{},'.format(str(self.test_df.fold.unique().tolist()).replace(",", ""))) handle.write('{},'.format(self.cf.clustering_iou if self.cf.clustering else str("N/A"))) handle.write('{},'.format(self.cf.merge_3D_iou if self.cf.merge_2D_to_3D_preds else str("N/A"))) if hasattr(self.cf, "test_against_exact_gt"): handle.write('{},'.format(self.cf.test_against_exact_gt)) for s in stats: if self.cf.class_dict[self.cf.patient_class_of_interest] in s['name']: for metric in metrics_to_score: if metric in s.keys() and not np.isnan(s[metric]): # needed as long as no dice on patient level possible if "folds_mean" in metric: handle.write('{:0.3f}\u00B1{:0.3f}, '.format(s[metric], s["_".join((*metric.split("_")[:-1], "std"))])) elif not "folds_std" in metric: handle.write('{:0.3f}, '.format(s[metric])) handle.write('\n') with open(os.path.join(self.cf.test_dir, 'results_extr_scores.txt'), 'w') as handle: handle.write('\n****************************\n') handle.write('\nextremal scores for fold {} \n'.format(self.cf.fold)) handle.write('\n****************************\n') # want: pid & fold (&other) of highest scoring tp & fp in test_df for cl in self.cf.class_dict.keys(): print("\nClass {}".format(self.cf.class_dict[cl]), file=handle) cl_df = self.test_df[self.test_df.pred_class == cl] #.dropna(axis=1) for det_type in ['det_tp', 'det_fp']: filtered_df = cl_df[cl_df.det_type==det_type] print("\nHighest scoring {} of class {}".format(det_type, self.cf.class_dict[cl]), file=handle) if 
len(filtered_df)>0: print(filtered_df.loc[filtered_df.pred_score.idxmax()], file=handle) else: print("No detections of type {} for class {} in this df".format(det_type, self.cf.class_dict[cl]), file=handle) handle.write('\n****************************\n') diff --git a/unittests.py b/unittests.py index 3b6c7a1..87c5f2c 100644 --- a/unittests.py +++ b/unittests.py @@ -1,625 +1,625 @@ #!/usr/bin/env python # Copyright 2019 Division of Medical Image Computing, German Cancer Research Center (DKFZ). # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== import unittest import os import pickle import time from multiprocessing import Pool import subprocess from pathlib import Path import numpy as np import pandas as pd import torch import torchvision as tv import tqdm import plotting as plg import utils.exp_utils as utils import utils.model_utils as mutils """ Note on unittests: run this file either in the way intended for unittests by starting the script with python -m unittest unittests.py or start it as a normal python file as python unittests.py. You can selective run single tests by calling python -m unittest unittests.TestClassOfYourChoice, where TestClassOfYourChoice is the name of the test defined below, e.g., CompareFoldSplits. """ def inspect_info_df(pp_dir): """ use your debugger to look into the info df of a pp dir. 
    :param pp_dir: preprocessed-data directory
    """
    info_df = pd.read_pickle(os.path.join(pp_dir, "info_df.pickle"))
    return

def generate_boxes(count, dim=2, h=100, w=100, d=20, normalize=False, on_grid=False, seed=0):
    """ generate boxes of format [y1, x1, y2, x2, (z1, z2)].
    :param count: nr of boxes
    :param dim: dimension of boxes (2 or 3)
    :param h, w, d: upper bounds of the coordinate space (height, width, depth).
    :param normalize: if True, divide coordinates by (h, w(, d)) so boxes lie in [0, 1].
    :param on_grid: if True, draw integer coordinates; else continuous uniform coordinates.
    :param seed: seed passed to np.random (NOTE: reseeds numpy's *global* RNG).
    :return: boxes in format (n_boxes, 4 or 6), scores
    """
    np.random.seed(seed)
    if on_grid:
        # integer coords: lower corner drawn from the lower half of each axis, upper corner from the upper half,
        # which guarantees non-degenerate boxes (y1 < y2, x1 < x2).
        lower_y = np.random.randint(0, h // 2, (count,))
        lower_x = np.random.randint(0, w // 2, (count,))
        upper_y = np.random.randint(h // 2, h, (count,))
        upper_x = np.random.randint(w // 2, w, (count,))
        if dim == 3:
            lower_z = np.random.randint(0, d // 2, (count,))
            upper_z = np.random.randint(d // 2, d, (count,))
    else:
        # continuous coords with the same half-space split
        lower_y = np.random.rand(count) * h / 2.
        lower_x = np.random.rand(count) * w / 2.
        upper_y = (np.random.rand(count) + 1.) * h / 2.
        upper_x = (np.random.rand(count) + 1.) * w / 2.
        if dim == 3:
            lower_z = np.random.rand(count) * d / 2.
            upper_z = (np.random.rand(count) + 1.) * d / 2.

    if dim == 3:
        boxes = np.array(list(zip(lower_y, lower_x, upper_y, upper_x, lower_z, upper_z)))
        # add an extreme box that tests the boundaries
        boxes = np.concatenate((boxes, np.array([[0., 0., h, w, 0, d]])))
    else:
        boxes = np.array(list(zip(lower_y, lower_x, upper_y, upper_x)))
        boxes = np.concatenate((boxes, np.array([[0., 0., h, w]])))

    # one score per box, incl. the appended extreme box
    scores = np.random.rand(count + 1)
    if normalize:
        divisor = np.array([h, w, h, w, d, d]) if dim == 3 else np.array([h, w, h, w])
        boxes = boxes / divisor

    return boxes, scores

#------- perform integrity checks on data set(s) -----------

class VerifyLIDCSAIntegrity(unittest.TestCase):
    """ Perform integrity checks on preprocessed single-annotator GTs of LIDC data set.
""" @staticmethod def check_patient_sa_gt(pid, pp_dir, check_meta_files, check_info_df): faulty_cases = pd.DataFrame(columns=['pid', 'rater', 'cl_targets', 'roi_ids']) all_segs = np.load(os.path.join(pp_dir, pid + "_rois.npz"), mmap_mode='r') all_segs = all_segs[list(all_segs.keys())[0]] all_roi_ids = np.unique(all_segs[all_segs > 0]) assert len(all_roi_ids) == np.max(all_segs), "roi ids not consecutive" if check_meta_files: meta_file = os.path.join(pp_dir, pid + "_meta_info.pickle") with open(meta_file, "rb") as handle: info = pickle.load(handle) assert info["pid"] == pid, "wrong pid in meta_file" all_cl_targets = info["class_target"] if check_info_df: info_df = pd.read_pickle(os.path.join(pp_dir, "info_df.pickle")) pid_info = info_df[info_df.pid == pid] assert len(pid_info) == 1, "found {} entries for pid {} in info df, expected exactly 1".format(len(pid_info), pid) if check_meta_files: assert pid_info[ "class_target"] == all_cl_targets, "meta_info and info_df class targets mismatch:\n{}\n{}".format( pid_info["class_target"], all_cl_targets) all_cl_targets = pid_info["class_target"].iloc[0] assert len(all_roi_ids) == len(all_cl_targets) for rater in range(4): seg = all_segs[rater] roi_ids = np.unique(seg[seg > 0]) cl_targs = np.array([roi[rater] for roi in all_cl_targets]) assert np.count_nonzero(cl_targs) == len(roi_ids), "rater {} has targs {} but roi ids {}".format(rater, cl_targs, roi_ids) assert len(cl_targs) >= len(roi_ids), "not all marked rois have a label" for zeroix_roi_id, rating in enumerate(cl_targs): if not ((rating > 0) == (np.any(seg == zeroix_roi_id + 1))): print("\n\nFAULTY CASE:", end=" ", ) print("pid {}, rater {}, cl_targs {}, ids {}\n".format(pid, rater, cl_targs, roi_ids)) faulty_cases = faulty_cases.append( {'pid': pid, 'rater': rater, 'cl_targets': cl_targs, 'roi_ids': roi_ids}, ignore_index=True) print("finished checking pid {}, {} faulty cases".format(pid, len(faulty_cases))) return faulty_cases def check_sa_gts(cf, pp_dir, 
pid_subset=None, check_meta_files=False, check_info_df=True, processes=os.cpu_count()): report_name = "verify_seg_label_pairings.csv" pids = {file_name.split("_")[0] for file_name in os.listdir(pp_dir) if file_name not in [report_name, "info_df.pickle"]} if pid_subset is not None: pids = [pid for pid in pids if pid in pid_subset] faulty_cases = pd.DataFrame(columns=['pid', 'rater', 'cl_targets', 'roi_ids']) p = Pool(processes=processes) mp_args = zip(pids, [pp_dir]*len(pids), [check_meta_files]*len(pids), [check_info_df]*len(pids)) patient_cases = p.starmap(self.check_patient_sa_gt, mp_args) p.close(); p.join() faulty_cases = faulty_cases.append(patient_cases, sort=False) print("\n\nfaulty case count {}".format(len(faulty_cases))) print(faulty_cases) findings_file = os.path.join(pp_dir, "verify_seg_label_pairings.csv") faulty_cases.to_csv(findings_file) assert len(faulty_cases)==0, "there was a faulty case in data set {}.\ncheck {}".format(pp_dir, findings_file) def test(self): pp_root = "/media/gregor/HDD2TB/Documents/data/" pp_dir = "lidc/pp_20190805" gt_dir = os.path.join(pp_root, pp_dir, "patient_gts_sa") self.check_sa_gts(gt_dir, check_meta_files=True, check_info_df=False, pid_subset=None) # ["0811a", "0812a"]) #------ compare segmentation gts of preprocessed data sets ------ class CompareSegGTs(unittest.TestCase): """ load and compare pre-processed gts by dice scores of segmentations. 
""" @staticmethod def group_seg_paths(ref_path, comp_paths): # not working recursively ref_files = [fn for fn in os.listdir(ref_path) if os.path.isfile(os.path.join(ref_path, fn)) and 'seg' in fn and fn.endswith('.npy')] comp_files = [[os.path.join(c_path, fn) for c_path in comp_paths] for fn in ref_files] ref_files = [os.path.join(ref_path, fn) for fn in ref_files] return zip(ref_files, comp_files) @staticmethod def load_calc_dice(paths): dices = [] ref_seg = np.load(paths[0])[np.newaxis, np.newaxis] n_classes = len(np.unique(ref_seg)) ref_seg = mutils.get_one_hot_encoding(ref_seg, n_classes) for c_file in paths[1]: c_seg = np.load(c_file)[np.newaxis, np.newaxis] assert n_classes == len(np.unique(c_seg)), "unequal nr of objects/classes betw segs {} {}".format(paths[0], c_file) c_seg = mutils.get_one_hot_encoding(c_seg, n_classes) dice = mutils.dice_per_batch_inst_and_class(c_seg, ref_seg, n_classes, convert_to_ohe=False) dices.append(dice) print("processed ref_path {}".format(paths[0])) return np.mean(dices), np.std(dices) def iterate_files(self, grouped_paths, processes=os.cpu_count()): p = Pool(processes) means_stds = np.array(p.map(self.load_calc_dice, grouped_paths)) p.close(); p.join() min_dice = np.min(means_stds[:, 0]) print("min mean dice {:.2f}, max std {:.4f}".format(min_dice, np.max(means_stds[:, 1]))) assert min_dice > 1-1e5, "compared seg gts have insufficient minimum mean dice overlap of {}".format(min_dice) def test(self): ref_path = '/media/gregor/HDD2TB/Documents/data/prostate/data_t2_250519_ps384_gs6071' comp_paths = ['/media/gregor/HDD2TB/Documents/data/prostate/data_t2_190419_ps384_gs6071', ] paths = self.group_seg_paths(ref_path, comp_paths) self.iterate_files(paths) #------- check if cross-validation fold splits of different experiments are identical ---------- class CompareFoldSplits(unittest.TestCase): """ Find evtl. differences in cross-val file splits across different experiments. 
""" @staticmethod def group_id_paths(ref_exp_dir, comp_exp_dirs): f_name = 'fold_ids.pickle' ref_paths = os.path.join(ref_exp_dir, f_name) assert os.path.isfile(ref_paths), "ref file {} does not exist.".format(ref_paths) ref_paths = [ref_paths for comp_ed in comp_exp_dirs] comp_paths = [os.path.join(comp_ed, f_name) for comp_ed in comp_exp_dirs] return zip(ref_paths, comp_paths) @staticmethod def comp_fold_ids(mp_input): fold_ids1, fold_ids2 = mp_input with open(fold_ids1, 'rb') as f: fold_ids1 = pickle.load(f) try: with open(fold_ids2, 'rb') as f: fold_ids2 = pickle.load(f) except FileNotFoundError: print("comp file {} does not exist.".format(fold_ids2)) return n_splits = len(fold_ids1) assert n_splits == len(fold_ids2), "mismatch n splits: ref has {}, comp {}".format(n_splits, len(fold_ids2)) split_diffs = [np.setdiff1d(fold_ids1[s], fold_ids2[s]) for s in range(n_splits)] all_equal = np.any(split_diffs) return (split_diffs, all_equal) def iterate_exp_dirs(self, ref_exp, comp_exps, processes=os.cpu_count()): grouped_paths = list(self.group_id_paths(ref_exp, comp_exps)) print("performing {} comparisons of cross-val file splits".format(len(grouped_paths))) p = Pool(processes) split_diffs = p.map(self.comp_fold_ids, grouped_paths) p.close(); p.join() df = pd.DataFrame(index=range(0,len(grouped_paths)), columns=["ref", "comp", "all_equal"])#, "diffs"]) for ix, (ref, comp) in enumerate(grouped_paths): df.iloc[ix] = [ref, comp, split_diffs[ix][1]]#, split_diffs[ix][0]] print("Any splits not equal?", df.all_equal.any()) assert not df.all_equal.any(), "a split set is different from reference split set, {}".format(df[~df.all_equal]) def test(self): exp_parent_dir = '/home/gregor/networkdrives/E132-Cluster-Projects/prostate/experiments/' ref_exp = '/home/gregor/networkdrives/E132-Cluster-Projects/prostate/experiments/gs6071_detfpn2d_cl_bs10' comp_exps = [os.path.join(exp_parent_dir, p) for p in os.listdir(exp_parent_dir)] comp_exps = [p for p in comp_exps if 
                     os.path.isdir(p) and p != ref_exp]
        self.iterate_exp_dirs(ref_exp, comp_exps)

#------- check if cross-validation fold splits of a single experiment are actually incongruent (as required) ----------

class VerifyFoldSplits(unittest.TestCase):
    """ Check, for a single fold_ids file, i.e., for a single experiment, if the assigned folds (assignment of data
        identifiers) is actually incongruent. No overlaps between folds are allowed for a correct cross validation.
    """
    @staticmethod
    def verify_fold_ids(splits):
        # pairwise check (j > i avoids self- and double-comparisons); any pid shared between folds is an error
        for i, split1 in enumerate(splits):
            for j, split2 in enumerate(splits):
                if j > i:
                    inter = np.intersect1d(split1, split2)
                    if len(inter) > 0:
                        raise Exception("Split {} and {} intersect by pids {}".format(i, j, inter))

    def test(self):
        exp_dir = "/home/gregor/Documents/medicaldetectiontoolkit/datasets/lidc/experiments/dev"
        check_file = os.path.join(exp_dir, 'fold_ids.pickle')
        with open(check_file, 'rb') as handle:
            splits = pickle.load(handle)
        self.verify_fold_ids(splits)

# -------- check own nms CUDA implement against own numpy implement ------

class CheckNMSImplementation(unittest.TestCase):

    @staticmethod
    def assert_res_equality(keep_ics1, keep_ics2, boxes, scores, tolerance=0, names=("res1", "res2")):
        """ Assert that two NMS results keep the same indices (up to 'tolerance' many differences).
        :param keep_ics1: keep indices (results), torch.Tensor of shape (n_ics,)
        :param keep_ics2: keep indices of the second implementation, same format.
        :return: None; raises AssertionError on mismatch.
        """
        keep_ics1, keep_ics2 = keep_ics1.cpu().numpy(), keep_ics2.cpu().numpy()
        discrepancies = np.setdiff1d(keep_ics1, keep_ics2)
        try:
            checks = np.array([
                len(discrepancies) <= tolerance
            ])
        except:
            # NOTE(review): bare except treats any failure as "check failed" — kept as-is (best-effort guard)
            checks = np.zeros((1,)).astype("bool")
        msgs = np.array([
            """{}: {} \n{}: {} \nboxes: {}\n {}\n""".format(names[0], keep_ics1, names[1], keep_ics2, boxes, scores)
        ])

        assert np.all(checks), "NMS: results mismatch: " + "\n".join(msgs[~checks])

    def single_case(self, count=20, dim=3, threshold=0.2, seed=0):
        boxes, scores = generate_boxes(count, dim, seed=seed, h=320, w=280,
                                       d=30)
        # numpy reference result
        keep_numpy = torch.tensor(mutils.nms_numpy(boxes, scores, threshold))

        # for some reason torchvision nms requires box coords as floats.
        boxes = torch.from_numpy(boxes).type(torch.float32)
        scores = torch.from_numpy(scores).type(torch.float32)
        if dim == 2:
            """need to wait until next pytorch release where they fixed nms on cpu (currently they have >= where it needs to be >. """
            keep_ops = tv.ops.nms(boxes, scores, threshold)
            # self.assert_res_equality(keep_numpy, keep_ops, boxes, scores, tolerance=0, names=["np", "ops"])
            pass

        # compare the custom CUDA extension against the numpy reference
        boxes = boxes.cuda()
        scores = scores.cuda()
        keep = self.nms_ext.nms(boxes, scores, threshold)
        self.assert_res_equality(keep_numpy, keep, boxes, scores, tolerance=0, names=["np", "cuda"])

    def test(self, n_cases=200, box_count=30, threshold=0.5):
        # dynamically import module so that it doesn't affect other tests if import fails
        self.nms_ext = utils.import_module("nms_ext", 'custom_extensions/nms/nms.py')
        # change seed to something fix if you want exactly reproducible test
        seed0 = np.random.randint(50)
        print("NMS test progress (done/total box configurations) 2D:", end="\n")
        for i in tqdm.tqdm(range(n_cases)):
            self.single_case(count=box_count, dim=2, threshold=threshold, seed=seed0+i)
        print("NMS test progress (done/total box configurations) 3D:", end="\n")
        for i in tqdm.tqdm(range(n_cases)):
            self.single_case(count=box_count, dim=3, threshold=threshold, seed=seed0+i)
        return

class CheckRoIAlignImplementation(unittest.TestCase):

    def prepare(self, dim=2):
        # build a random feature map (grad-enabled) and grid-aligned rois on the GPU;
        # returns (fmap, rois, pool_size) for the align checks below
        b, c, h, w = 1, 3, 50, 50
        # feature map, (b, c, h, w(, z))
        if dim == 2:
            fmap = torch.rand(b, c, h, w).cuda()
            # rois = torch.tensor([[
            #     [0.1, 0.1, 0.3, 0.3],
            #     [0.2, 0.2, 0.4, 0.7],
            #     [0.5, 0.7, 0.7, 0.9],
            # ]]).cuda()
            pool_size = (7, 7)
            rois = generate_boxes(5, dim=dim, h=h, w=w, on_grid=True, seed=np.random.randint(50))[0]
        elif dim == 3:
            d = 20
            fmap = torch.rand(b, c, h, w, d).cuda()
            # rois = torch.tensor([[
            #     [0.1, 0.1, 0.3, 0.3, 0.1, 0.1],
            #     [0.2, 0.2, 0.4, 0.7, 0.2, 0.4],
            #     [0.5, 0.0, 0.7, 1.0,
            #      0.4, 0.5],
            #     [0.0, 0.0, 0.9, 1.0, 0.0, 1.0],
            # ]]).cuda()
            pool_size = (7, 7, 3)
            rois = generate_boxes(5, dim=dim, h=h, w=w, d=d, on_grid=True, seed=np.random.randint(50), normalize=False)[0]
        else:
            raise ValueError("dim needs to be 2 or 3")
        rois = [torch.from_numpy(rois).type(dtype=torch.float32).cuda(), ]
        fmap.requires_grad_(True)
        return fmap, rois, pool_size

    def check_2d(self):
        """ check vs torchvision ops not possible as on purpose different approach. :return: """
        raise NotImplementedError
        # fmap, rois, pool_size = self.prepare(dim=2)
        # ra_object = self.ra_ext.RoIAlign(output_size=pool_size, spatial_scale=1., sampling_ratio=-1)
        # align_ext = ra_object(fmap, rois)
        # loss_ext = align_ext.sum()
        # loss_ext.backward()
        #
        # rois_swapped = [rois[0][:, [1,3,0,2]]]
        # align_ops = tv.ops.roi_align(fmap, rois_swapped, pool_size)
        # loss_ops = align_ops.sum()
        # loss_ops.backward()
        #
        # assert (loss_ops == loss_ext), "sum of roialign ops and extension 2D diverges"
        # assert (align_ops == align_ext).all(), "ROIAlign failed 2D test"

    def check_3d(self):
        # forward + backward through the CUDA extension, then compare the forward result vs the numpy reference
        fmap, rois, pool_size = self.prepare(dim=3)
        ra_object = self.ra_ext.RoIAlign(output_size=pool_size, spatial_scale=1., sampling_ratio=-1)
        align_ext = ra_object(fmap, rois)
        loss_ext = align_ext.sum()
        loss_ext.backward()

        align_np = mutils.roi_align_3d_numpy(fmap.cpu().detach().numpy(), [roi.cpu().numpy() for roi in rois], pool_size)
        align_np = np.squeeze(align_np)  # remove singleton batch dim

        align_ext = align_ext.cpu().detach().numpy()
        assert np.allclose(align_np, align_ext, rtol=1e-5, atol=1e-8), "RoIAlign differences in numpy and CUDA implement"

    def specific_example_check(self):
        # dummy input
        self.ra_ext = utils.import_module("ra_ext", 'custom_extensions/roi_align/roi_align.py')
        exp = 6
        pool_size = (2,2)
        fmap = torch.arange(exp**2).view(exp,exp).unsqueeze(0).unsqueeze(0).cuda().type(dtype=torch.float32)

        boxes = torch.tensor([[1., 1., 5., 5.]]).cuda()/exp
        ind = torch.tensor([0.]*len(boxes)).cuda().type(torch.float32)
        y_exp, x_exp = \
fmap.shape[2:] # exp = expansion boxes.mul_(torch.tensor([y_exp, x_exp, y_exp, x_exp], dtype=torch.float32).cuda()) boxes = torch.cat((ind.unsqueeze(1), boxes), dim=1) aligned_tv = tv.ops.roi_align(fmap, boxes, output_size=pool_size, sampling_ratio=-1) aligned = self.ra_ext.roi_align_2d(fmap, boxes, output_size=pool_size, sampling_ratio=-1) boxes_3d = torch.cat((boxes, torch.tensor([[-1.,1.]]*len(boxes)).cuda()), dim=1) fmap_3d = fmap.unsqueeze(dim=-1) pool_size = (*pool_size,1) ra_object = self.ra_ext.RoIAlign(output_size=pool_size, spatial_scale=1.,) aligned_3d = ra_object(fmap_3d, boxes_3d) # expected_res = torch.tensor([[[[10.5000, 12.5000], # this would be with an alternative grid-point setting # [22.5000, 24.5000]]]]).cuda() expected_res = torch.tensor([[[[14., 16.], [26., 28.]]]]).cuda() expected_res_3d = torch.tensor([[[[[14.],[16.]], [[26.],[28.]]]]]).cuda() assert torch.all(aligned==expected_res), "2D RoIAlign check vs. specific example failed. res: {}\n expected: {}\n".format(aligned, expected_res) assert torch.all(aligned_3d==expected_res_3d), "3D RoIAlign check vs. specific example failed. res: {}\n expected: {}\n".format(aligned_3d, expected_res_3d) def manual_check(self): """ print examples from a toy batch to file. :return: """ self.ra_ext = utils.import_module("ra_ext", 'custom_extensions/roi_align/roi_align.py') # actual mrcnn mask input from datasets.toy import configs cf = configs.Configs() cf.exp_dir = "datasets/toy/experiments/dev/" cf.plot_dir = cf.exp_dir + "plots" os.makedirs(cf.exp_dir, exist_ok=True) cf.fold = 0 cf.n_workers = 1 logger = utils.get_logger(cf.exp_dir) data_loader = utils.import_module('data_loader', os.path.join("datasets", "toy", 'data_loader.py')) batch_gen = data_loader.get_train_generators(cf, logger=logger) batch = next(batch_gen['train']) roi_mask = np.zeros((1, 320, 200)) bb_target = (np.array([50, 40, 90, 120])).astype("int") roi_mask[:, bb_target[0]+1:bb_target[2]+1, bb_target[1]+1:bb_target[3]+1] = 1. 
#batch = {"roi_masks": np.array([np.array([roi_mask, roi_mask]), np.array([roi_mask])]), "bb_target": [[bb_target, bb_target + 25], [bb_target-20]]} #batch_boxes_cor = [torch.tensor(batch_el_boxes).cuda().float() for batch_el_boxes in batch_cor["bb_target"]] batch_boxes = [torch.tensor(batch_el_boxes).cuda().float() for batch_el_boxes in batch["bb_target"]] #import IPython; IPython.embed() for b in range(len(batch_boxes)): roi_masks = batch["roi_masks"][b] #roi_masks_cor = batch_cor["roi_masks"][b] if roi_masks.sum()>0: boxes = batch_boxes[b] roi_masks = torch.tensor(roi_masks).cuda().type(dtype=torch.float32) box_ids = torch.arange(roi_masks.shape[0]).cuda().unsqueeze(1).type(dtype=torch.float32) masks = tv.ops.roi_align(roi_masks, [boxes], cf.mask_shape) masks = masks.squeeze(1) masks = torch.round(masks) masks_own = self.ra_ext.roi_align_2d(roi_masks, torch.cat((box_ids, boxes), dim=1), cf.mask_shape) boxes = boxes.type(torch.int) #print("check roi mask", roi_masks[0, 0, boxes[0][0]:boxes[0][2], boxes[0][1]:boxes[0][3]].sum(), (boxes[0][2]-boxes[0][0]) * (boxes[0][3]-boxes[0][1])) #print("batch masks", batch["roi_masks"]) masks_own = masks_own.squeeze(1) masks_own = torch.round(masks_own) #import IPython; IPython.embed() for mix, mask in enumerate(masks): fig = plg.plt.figure() ax = fig.add_subplot() ax.imshow(roi_masks[mix][0].cpu().numpy(), cmap="gray", vmin=0.) 
ax.axis("off") y1, x1, y2, x2 = boxes[mix] bbox = plg.mpatches.Rectangle((x1, y1), x2-x1, y2-y1, linewidth=0.9, edgecolor="c", facecolor='none') ax.add_patch(bbox) x1, y1, x2, y2 = boxes[mix] bbox = plg.mpatches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=0.9, edgecolor="r", facecolor='none') ax.add_patch(bbox) debug_dir = Path("/home/gregor/Documents/regrcnn/datasets/toy/experiments/debugroial") os.makedirs(debug_dir, exist_ok=True) plg.plt.savefig(debug_dir/"mask_b{}_{}.png".format(b, mix)) plg.plt.imsave(debug_dir/"mask_b{}_{}_pooled_tv.png".format(b, mix), mask.cpu().numpy(), cmap="gray", vmin=0.) plg.plt.imsave(debug_dir/"mask_b{}_{}_pooled_own.png".format(b, mix), masks_own[mix].cpu().numpy(), cmap="gray", vmin=0.) return def test(self): # dynamically import module so that it doesn't affect other tests if import fails self.ra_ext = utils.import_module("ra_ext", 'custom_extensions/roi_align/roi_align.py') self.specific_example_check() # 2d test #self.check_2d() # 3d test self.check_3d() return class CheckRuntimeErrors(unittest.TestCase): """ Check if minimal examples of the exec.py module finish without runtime errors. This check requires a working path to data in the toy-dataset configs. 
""" def test(self): cf = utils.import_module("toy_cf", 'datasets/toy/configs.py').Configs() exp_dir = "./unittesting/" #checks = {"retina_net": False, "mrcnn": False} #print("Testing for runtime errors with models {}".format(list(checks.keys()))) #for model in tqdm.tqdm(list(checks.keys())): # cf.model = model # cf.model_path = 'models/{}.py'.format(cf.model if not 'retina' in cf.model else 'retina_net') # cf.model_path = os.path.join(cf.source_dir, cf.model_path) # {'mrcnn': cf.add_mrcnn_configs, # 'retina_net': cf.add_mrcnn_configs, 'retina_unet': cf.add_mrcnn_configs, # 'detection_unet': cf.add_det_unet_configs, 'detection_fpn': cf.add_det_fpn_configs # }[model]() # todo change structure of configs-handling with exec.py so that its dynamically parseable instead of needing to # todo be changed in the file all the time. checks = {cf.model:False} completed_process = subprocess.run("python exec.py --dev --dataset_name toy -m train_test --exp_dir {}".format(exp_dir), shell=True, capture_output=True, text=True) if completed_process.returncode!=0: print("Runtime test of model {} failed due to\n{}".format(cf.model, completed_process.stderr)) else: checks[cf.model] = True subprocess.call("rm -rf {}".format(exp_dir), shell=True) assert all(checks.values()), "A runtime test crashed." 
class MulithreadedDataiterator(unittest.TestCase):
    # NOTE(review): class name has a typo (missing 't' in "Multithreaded"); kept
    # as-is since renaming would change how the test is addressed/discovered.

    def test(self):
        """Check that the multithreaded validation data pipeline yields each pid the
        expected number of times, and — with raise_stop_iteration=True — exactly once.

        Phase 1: val_sampling loader, batch_size 3, 8 workers; every pid must occur
        fewer than 3 times per epoch and no unknown pids may appear.
        Phase 2: a fresh non-sampling pipeline sized so one pass covers the dataset;
        every pid must occur exactly once and the full pid set must be covered.
        """
        print("Testing multithreaded iterator.")

        dataset = "toy"
        exp_dir = Path("datasets/{}/experiments/dev".format(dataset))
        cf_file = utils.import_module("cf_file", exp_dir/"configs.py")
        cf = cf_file.Configs()
        dloader = utils.import_module('data_loader', 'datasets/{}/data_loader.py'.format(dataset))
        cf.exp_dir = Path(exp_dir)
        cf.n_workers = 5
        cf.batch_size = 3
        cf.fold = 0
        cf.plot_dir = cf.exp_dir / "plots"
        logger = utils.get_logger(cf.exp_dir, cf.server_env, cf.sysmetrics_interval)
        cf.num_val_batches = "all"
        cf.val_mode = "val_sampling"
        cf.n_workers = 8  # overrides the 5 set above; effective worker count for phase 1
        batch_gens = dloader.get_train_generators(cf, logger, data_statistics=False)
        val_loader = batch_gens["val_sampling"]

        for epoch in range(4):
            produced_ids = []
            for i in range(batch_gens['n_val']):
                batch = next(val_loader)
                produced_ids.append(batch["pid"])
            uni, cts = np.unique(np.concatenate(produced_ids), return_counts=True)
            # NOTE(review): message says "batch size one ... exactly once" but
            # cf.batch_size is 3 and the check is cts < 3 — message is misleading.
            assert np.all(cts < 3), "with batch size one: every item should occur exactly once.\n uni {}, cts {}".format(
                uni[cts>2], cts[cts>2])
            #assert len(np.setdiff1d(val_loader.generator.dataset_pids, uni))==0, "not all val pids were shown."
            assert len(np.setdiff1d(uni, val_loader.generator.dataset_pids))==0, "pids shown that are not val set. impossible?"

        # phase 2: deterministic, non-replacing pipeline sized to cover the dataset in one pass
        cf.n_workers = os.cpu_count()
        cf.batch_size = int(val_loader.generator.dataset_length / cf.n_workers) + 2
        val_loader = dloader.create_data_gen_pipeline(cf, val_loader.generator._data, do_aug=False,
                                                      sample_pids_w_replace=False, max_batches=None,
                                                      raise_stop_iteration=True)
        for epoch in range(2):
            produced_ids = []
            for b, batch in enumerate(val_loader):
                produced_ids.append(batch["pid"])
            uni, cts = np.unique(np.concatenate(produced_ids), return_counts=True)
            assert np.all(cts == 1), "with batch size one: every item should occur exactly once.\n uni {}, cts {}".format(
                uni[cts>1], cts[cts>1])
            assert len(np.setdiff1d(val_loader.generator.dataset_pids, uni))==0, "not all val pids were shown."
            assert len(np.setdiff1d(uni, val_loader.generator.dataset_pids))==0, "pids shown that are not val set. impossible?"

        pass


if __name__=="__main__":
    stime = time.time()

    # NOTE(review): left-over debug entry point — runs only the manual RoIAlign
    # check instead of the full suite (unittest.main() is commented out below).
    t = CheckRoIAlignImplementation()
    t.manual_check()
    #unittest.main()

    # report total wall-clock runtime as h:mm:ss
    mins, secs = divmod((time.time() - stime), 60)
    h, mins = divmod(mins, 60)
    t = "{:d}h:{:02d}m:{:02d}s".format(int(h), int(mins), int(secs))
    print("{} total runtime: {}".format(os.path.split(__file__)[1], t))