diff --git a/evaluator.py b/evaluator.py index 8d41079..bc004f5 100644 --- a/evaluator.py +++ b/evaluator.py @@ -1,983 +1,993 @@ #!/usr/bin/env python # Copyright 2019 Division of Medical Image Computing, German Cancer Research Center (DKFZ). # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== import os from multiprocessing import Pool import pickle import time import numpy as np import pandas as pd from sklearn.metrics import roc_auc_score, average_precision_score from sklearn.metrics import roc_curve, precision_recall_curve from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score import torch import utils.model_utils as mutils import plotting as plg import warnings def get_roi_ap_from_df(inputs): ''' :param df: data frame. :param det_thresh: min_threshold for filtering out low confidence predictions. :param per_patient_ap: boolean flag. evaluate average precision per patient id and average over per-pid results, instead of computing one ap over whole data set. :return: average_precision (float) ''' df, det_thresh, per_patient_ap = inputs if per_patient_ap: pids_list = df.pid.unique() aps = [] for match_iou in df.match_iou.unique(): iou_df = df[df.match_iou == match_iou] for pid in pids_list: pid_df = iou_df[iou_df.pid == pid] all_p = len(pid_df[pid_df.class_label == 1]) pid_df = pid_df[(pid_df.det_type == 'det_fp') | (pid_df.det_type == 'det_tp')].sort_values('pred_score', ascending=False) pid_df = pid_df[pid_df.pred_score > det_thresh] if (len(pid_df) ==0 and all_p == 0): pass elif (len(pid_df) > 0 and all_p == 0): aps.append(0) else: aps.append(compute_roi_ap(pid_df, all_p)) return np.mean(aps) else: aps = [] for match_iou in df.match_iou.unique(): iou_df = df[df.match_iou == match_iou] # it's important to not apply the threshold before counting all_p in order to not lose the fn! all_p = len(iou_df[(iou_df.det_type == 'det_tp') | (iou_df.det_type == 'det_fn')]) # sorting out all entries that are not fp or tp or have confidence(=pred_score) <= detection_threshold iou_df = iou_df[(iou_df.det_type == 'det_fp') | (iou_df.det_type == 'det_tp')].sort_values('pred_score', ascending=False) iou_df = iou_df[iou_df.pred_score > det_thresh] if all_p>0: aps.append(compute_roi_ap(iou_df, all_p)) return np.mean(aps) def compute_roi_ap(df, all_p): """ adapted from: https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py :param df: dataframe containing class labels of predictions sorted in descending manner by their prediction score. :param all_p: number of all ground truth objects. (for denominator of recall.) :return: """ tp = df.class_label.values fp = (tp == 0) * 1 #recall thresholds, where precision will be measured R = np.linspace(0., 1., np.round((1. - 0.) / .01).astype(int) + 1, endpoint=True) tp_sum = np.cumsum(tp) fp_sum = np.cumsum(fp) n_dets = len(tp) rc = tp_sum / all_p pr = tp_sum / (fp_sum + tp_sum) # initialize precision array over recall steps (q=queries). q = [0. 
for _ in range(len(R))] # numpy is slow without cython optimization for accessing elements # python array gets significant speed improvement pr = pr.tolist() for i in range(n_dets - 1, 0, -1): if pr[i] > pr[i - 1]: pr[i - 1] = pr[i] #--> pr[i]<=pr[i-1] for all i since we want to consider the maximum #precision value for a queried interval # discretize empiric recall steps with given bins. assert np.all(rc[:-1]<=rc[1:]), "recall not sorted ascendingly" inds = np.searchsorted(rc, R, side='left') try: for rc_ix, pr_ix in enumerate(inds): q[rc_ix] = pr[pr_ix] except IndexError: #now q is filled with pr values up to first non-available index pass return np.mean(q) def roi_avp(inputs): ''' :param df: data frame. :param det_thresh: min_threshold for filtering out low confidence predictions. :param per_patient_ap: boolean flag. evaluate average precision per patient id and average over per-pid results, instead of computing one ap over whole data set. :return: average_precision (float) ''' df, det_thresh, per_patient_ap = inputs if per_patient_ap: pids_list = df.pid.unique() aps = [] for match_iou in df.match_iou.unique(): iou_df = df[df.match_iou == match_iou] for pid in pids_list: pid_df = iou_df[iou_df.pid == pid] all_p = len(pid_df[pid_df.class_label == 1]) mask = ((pid_df.rg_bins == pid_df.rg_bin_target) & (pid_df.det_type == 'det_tp')) | (pid_df.det_type == 'det_fp') pid_df = pid_df[mask].sort_values('pred_score', ascending=False) pid_df = pid_df[pid_df.pred_score > det_thresh] if (len(pid_df) ==0 and all_p == 0): pass elif (len(pid_df) > 0 and all_p == 0): aps.append(0) else: aps.append(compute_roi_ap(pid_df, all_p)) return np.mean(aps) else: aps = [] for match_iou in df.match_iou.unique(): iou_df = df[df.match_iou == match_iou] #it's important to not apply the threshold before counting all_positives! all_p = len(iou_df[(iou_df.det_type == 'det_tp') | (iou_df.det_type == 'det_fn')]) # filtering out tps which don't match rg_bin target at this point is same as reclassifying them as fn. # also sorting out all entries that are not fp or have confidence(=pred_score) <= detection_threshold mask = ((iou_df.rg_bins == iou_df.rg_bin_target) & (iou_df.det_type == 'det_tp')) | (iou_df.det_type == 'det_fp') iou_df = iou_df[mask].sort_values('pred_score', ascending=False) iou_df = iou_df[iou_df.pred_score > det_thresh] if all_p>0: aps.append(compute_roi_ap(iou_df, all_p)) return np.mean(aps) def compute_prc(df): """compute precision-recall curve with maximum precision per recall interval. :param df: :param all_p: # of all positive samples in data. :return: array: [precisions, recall query values] """ assert (df.class_label==1).any(), "cannot compute prc when no positives in data." all_p = len(df[(df.det_type == 'det_tp') | (df.det_type == 'det_fn')]) df = df[(df.det_type=="det_tp") | (df.det_type=="det_fp")] df = df.sort_values("pred_score", ascending=False) # recall thresholds, where precision will be measured scores = df.pred_score.values labels = df.class_label.values n_dets = len(scores) pr = np.zeros((n_dets,)) rc = pr.copy() for rank in range(n_dets): tp = np.count_nonzero(labels[:rank+1]==1) fp = np.count_nonzero(labels[:rank+1]==0) pr[rank] = tp/(tp+fp) rc[rank] = tp/all_p #after obj detection convention/ coco-dataset template: take maximum pr within intervals: # --> pr[i]<=pr[i-1] for all i since we want to consider the maximum # precision value for a queried interval for i in range(n_dets - 1, 0, -1): if pr[i] > pr[i - 1]: pr[i - 1] = pr[i] R = np.linspace(0., 1., np.round((1. - 0.) 
/ .01).astype(int) + 1, endpoint=True)#precision queried at R points inds = np.searchsorted(rc, R, side='left') queries = np.zeros((len(R),)) try: for q_ix, rank in enumerate(inds): queries[q_ix] = pr[rank] except IndexError: pass return np.array((queries, R)) def RMSE(y_true, y_pred, weights=None): if len(y_true)>0: return np.sqrt(mean_squared_error(y_true, y_pred, sample_weight=weights)) else: return np.nan def MAE_w_std(y_true, y_pred, weights=None): if len(y_true)>0: y_true, y_pred = np.array(y_true), np.array(y_pred) deltas = np.abs(y_true-y_pred) mae = np.average(deltas, weights=weights, axis=0).item() skmae = mean_absolute_error(y_true, y_pred, sample_weight=weights) assert np.allclose(mae, skmae, atol=1e-6), "mae {}, sklearn mae {}".format(mae, skmae) std = np.std(weights*deltas) return mae, std else: return np.nan, np.nan def MAE(y_true, y_pred, weights=None): if len(y_true)>0: return mean_absolute_error(y_true, y_pred, sample_weight=weights) else: return np.nan def accuracy(y_true, y_pred, weights=None): if len(y_true)>0: return accuracy_score(y_true, y_pred, sample_weight=weights) else: return np.nan # noinspection PyCallingNonCallable class Evaluator(): """ Evaluates given results dicts. Can return results as updated monitor_metrics. Can save test data frames to file. """ def __init__(self, cf, logger, mode='test'): """ :param mode: either 'train', 'val_sampling', 'val_patient' or 'test'. handles prediction lists of different forms. """ self.cf = cf self.logger = logger self.mode = mode self.regress_flag = any(['regression' in task for task in self.cf.prediction_tasks]) self.plot_dir = self.cf.test_dir if self.mode == "test" else self.cf.plot_dir if self.cf.plot_prediction_histograms: self.hist_dir = os.path.join(self.plot_dir, 'histograms') os.makedirs(self.hist_dir, exist_ok=True) if self.cf.plot_stat_curves: self.curves_dir = os.path.join(self.plot_dir, 'stat_curves') os.makedirs(self.curves_dir, exist_ok=True) def eval_losses(self, batch_res_dicts): if hasattr(self.cf, "losses_to_monitor"): loss_names = self.cf.losses_to_monitor else: loss_names = {name for b_res_dict in batch_res_dicts for name in b_res_dict if 'loss' in name} self.epoch_losses = {l_name: torch.tensor([b_res_dict[l_name] for b_res_dict in batch_res_dicts if l_name in b_res_dict.keys()]).mean().item() for l_name in loss_names} def eval_segmentations(self, batch_res_dicts, pid_list): batch_dices = [b_res_dict['batch_dices'] for b_res_dict in batch_res_dicts if 'batch_dices' in b_res_dict.keys()] # shape (n_batches, n_seg_classes) if len(batch_dices) > 0: batch_dices = np.array(batch_dices) # dims n_batches x 1 in sampling / n_test_epochs x n_classes assert batch_dices.shape[1] == self.cf.num_seg_classes, "bdices shp {}, n seg cl {}, pid lst len {}".format( batch_dices.shape, self.cf.num_seg_classes, len(pid_list)) self.seg_df = pd.DataFrame() for seg_id in range(batch_dices.shape[1]): self.seg_df[self.cf.seg_id2label[seg_id].name + "_dice"] = batch_dices[:, seg_id] # one row== one batch, one column== one class # self.seg_df[self.cf.seg_id2label[seg_id].name+"_dice"] = np.concatenate(batch_dices[:,:,seg_id]) self.seg_df['fold'] = self.cf.fold if self.mode == "val_patient" or self.mode == "test": # need to make it more conform between sampling and patient-mode self.seg_df["pid"] = [pid for pix, pid in enumerate(pid_list)] # for b_inst in batch_inst_boxes[pix]] else: self.seg_df["pid"] = np.nan def eval_boxes(self, batch_res_dicts, pid_list, obj_cl_dict, obj_cl_identifiers={"gt":'class_targets', 
"pred":'box_pred_class_id'}): """ :param batch_res_dicts: :param pid_list: [pid_0, pid_1, ...] :return: """ if self.mode == 'train' or self.mode == 'val_sampling': # one pid per batch element # batch_size > 1, with varying patients across batch: # [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...] # -> [results_0, results_1, ..] batch_inst_boxes = [b_res_dict['boxes'] for b_res_dict in batch_res_dicts] # len: nr of batches in epoch batch_inst_boxes = [[b_inst_boxes] for whole_batch_boxes in batch_inst_boxes for b_inst_boxes in whole_batch_boxes] # len: batch instances of whole epoch assert np.all(len(b_boxes_list) == self.cf.batch_size for b_boxes_list in batch_inst_boxes) elif self.mode == "val_patient" or self.mode == "test": # patient processing, one element per batch = one patient. # [[results_0, pid_0], [results_1, pid_1], ...] -> [results_0, results_1, ..] # in patientbatchiterator there is only one pid per batch batch_inst_boxes = [b_res_dict['boxes'] for b_res_dict in batch_res_dicts] # in patient mode not actually per batch instance, but per whole batch! if hasattr(self.cf, "eval_test_separately") and self.cf.eval_test_separately: """ you could write your own routines to add GTs to raw predictions for evaluation. implemented standard is: cf.eval_test_separately = False or not set --> GTs are saved at same time and in same file as raw prediction results. """ raise NotImplementedError assert len(batch_inst_boxes) == len(pid_list) df_list_preds = [] df_list_labels = [] df_list_class_preds = [] df_list_pids = [] df_list_type = [] df_list_match_iou = [] df_list_n_missing = [] df_list_regressions = [] df_list_rg_targets = [] df_list_rg_bins = [] df_list_rg_bin_targets = [] df_list_rg_uncs = [] for match_iou in self.cf.ap_match_ious: self.logger.info('evaluating with ap_match_iou: {}'.format(match_iou)) for cl in list(obj_cl_dict.keys()): for pix, pid in enumerate(pid_list): len_df_list_before_patient = len(df_list_pids) # input of each batch element is a list of boxes, where each box is a dictionary. 
for b_inst_ix, b_boxes_list in enumerate(batch_inst_boxes[pix]): b_tar_boxes = [] b_cand_boxes, b_cand_scores, b_cand_n_missing = [], [], [] if self.regress_flag: b_tar_regs, b_tar_rg_bins = [], [] b_cand_regs, b_cand_rg_bins, b_cand_rg_uncs = [], [], [] for box in b_boxes_list: # each box is either gt or detection or proposal/anchor # we need all gts in the same order & all dets in same order if box['box_type'] == 'gt' and box[obj_cl_identifiers["gt"]] == cl: b_tar_boxes.append(box["box_coords"]) if self.regress_flag: b_tar_regs.append(np.array(box['regression_targets'], dtype='float32')) b_tar_rg_bins.append(box['rg_bin_targets']) if box['box_type'] == 'det' and box[obj_cl_identifiers["pred"]] == cl: b_cand_boxes.append(box["box_coords"]) b_cand_scores.append(box["box_score"]) b_cand_n_missing.append(box["cluster_n_missing"] if 'cluster_n_missing' in box.keys() else np.nan) if self.regress_flag: b_cand_regs.append(box["regression"]) b_cand_rg_bins.append(box["rg_bin"]) b_cand_rg_uncs.append(box["rg_uncertainty"] if 'rg_uncertainty' in box.keys() else np.nan) b_tar_boxes = np.array(b_tar_boxes) b_cand_boxes, b_cand_scores, b_cand_n_missing = np.array(b_cand_boxes), np.array(b_cand_scores), np.array(b_cand_n_missing) if self.regress_flag: b_tar_regs, b_tar_rg_bins = np.array(b_tar_regs), np.array(b_tar_rg_bins) b_cand_regs, b_cand_rg_bins, b_cand_rg_uncs = np.array(b_cand_regs), np.array(b_cand_rg_bins), np.array(b_cand_rg_uncs) # check if predictions and ground truth boxes exist and match them according to match_iou. if not 0 in b_cand_boxes.shape and not 0 in b_tar_boxes.shape: assert np.all(np.round(b_cand_scores,6) <= 1.), "there is a box score>1: {}".format(b_cand_scores[~(b_cand_scores<=1.)]) #coords_check = np.array([len(coords)==self.cf.dim*2 for coords in b_cand_boxes]) #assert np.all(coords_check), "cand box with wrong bcoords dim: {}, mode: {}".format(b_cand_boxes[~coords_check], self.mode) expected_dim = len(b_cand_boxes[0]) assert np.all([len(coords) == expected_dim for coords in b_tar_boxes]), \ "gt/cand box coords mismatch, expected dim: {}.".format(expected_dim) # overlaps: shape len(cand_boxes) x len(tar_boxes) overlaps = mutils.compute_overlaps(b_cand_boxes, b_tar_boxes) # match_cand_ixs: shape (nr_of_matches,) # theses indices are the indices of b_cand_boxes match_cand_ixs = np.argwhere(np.max(overlaps, axis=1) > match_iou)[:, 0] non_match_cand_ixs = np.argwhere(np.max(overlaps, 1) <= match_iou)[:, 0] # the corresponding gt assigned to the pred boxes by highest iou overlap, # i.e., match_gt_ixs holds index into b_tar_boxes for each entry in match_cand_ixs, # i.e., gt_ixs and cand_ixs are paired via their position in their list # (cand_ixs[j] corresponds to gt_ixs[j]) match_gt_ixs = np.argmax(overlaps[match_cand_ixs, :], axis=1) if \ not 0 in match_cand_ixs.shape else np.array([]) assert len(match_gt_ixs)==len(match_cand_ixs) #match_gt_ixs: shape (nr_of_matches,) or 0 non_match_gt_ixs = np.array( [ii for ii in np.arange(b_tar_boxes.shape[0]) if ii not in match_gt_ixs]) unique, counts = np.unique(match_gt_ixs, return_counts=True) # check for double assignments, i.e. two predictions having been assigned to the same gt. # according to the COCO-metrics, only one prediction counts as true positive, the rest counts as # false positive. This case is supposed to be avoided by the model itself by, # e.g. using a low enough NMS threshold. 
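                        # added illustration (hypothetical scores): if two detections with scores 0.9 and 0.6 both overlap
                        # the same gt above match_iou, the block below keeps only the 0.9 detection as 'det_tp' and
                        # re-labels the 0.6 detection as 'det_fp', following the COCO convention described above.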
if np.any(counts > 1): double_match_gt_ixs = unique[np.argwhere(counts > 1)[:, 0]] keep_max = [] double_match_list = [] for dg in double_match_gt_ixs: double_match_cand_ixs = match_cand_ixs[np.argwhere(match_gt_ixs == dg)] keep_max.append(double_match_cand_ixs[np.argmax(b_cand_scores[double_match_cand_ixs])]) double_match_list += [ii for ii in double_match_cand_ixs] fp_ixs = np.array([ii for ii in match_cand_ixs if (ii in double_match_list and ii not in keep_max)]) # count as fp: boxes that match gt above match_iou threshold but have not highest class confidence score match_gt_ixs = np.array([gt_ix for ii, gt_ix in enumerate(match_gt_ixs) if match_cand_ixs[ii] not in fp_ixs]) match_cand_ixs = np.array([cand_ix for cand_ix in match_cand_ixs if cand_ix not in fp_ixs]) assert len(match_gt_ixs) == len(match_cand_ixs) df_list_preds += [ii for ii in b_cand_scores[fp_ixs]] df_list_labels += [0] * fp_ixs.shape[0] # means label==gt==0==bg for all these fp_ixs df_list_class_preds += [cl] * fp_ixs.shape[0] df_list_n_missing += [n for n in b_cand_n_missing[fp_ixs]] if self.regress_flag: df_list_regressions += [r for r in b_cand_regs[fp_ixs]] df_list_rg_bins += [r for r in b_cand_rg_bins[fp_ixs]] df_list_rg_uncs += [r for r in b_cand_rg_uncs[fp_ixs]] df_list_rg_targets += [[0.]*self.cf.regression_n_features] * fp_ixs.shape[0] df_list_rg_bin_targets += [0.] * fp_ixs.shape[0] df_list_pids += [pid] * fp_ixs.shape[0] df_list_type += ['det_fp'] * fp_ixs.shape[0] # matched/tp: if not 0 in match_cand_ixs.shape: df_list_preds += list(b_cand_scores[match_cand_ixs]) df_list_labels += [1] * match_cand_ixs.shape[0] df_list_class_preds += [cl] * match_cand_ixs.shape[0] df_list_n_missing += list(b_cand_n_missing[match_cand_ixs]) if self.regress_flag: df_list_regressions += list(b_cand_regs[match_cand_ixs]) df_list_rg_bins += list(b_cand_rg_bins[match_cand_ixs]) df_list_rg_uncs += list(b_cand_rg_uncs[match_cand_ixs]) assert len(match_cand_ixs)==len(match_gt_ixs) df_list_rg_targets += list(b_tar_regs[match_gt_ixs]) df_list_rg_bin_targets += list(b_tar_rg_bins[match_gt_ixs]) df_list_pids += [pid] * match_cand_ixs.shape[0] df_list_type += ['det_tp'] * match_cand_ixs.shape[0] # rest fp: if not 0 in non_match_cand_ixs.shape: df_list_preds += list(b_cand_scores[non_match_cand_ixs]) df_list_labels += [0] * non_match_cand_ixs.shape[0] df_list_class_preds += [cl] * non_match_cand_ixs.shape[0] df_list_n_missing += list(b_cand_n_missing[non_match_cand_ixs]) if self.regress_flag: df_list_regressions += list(b_cand_regs[non_match_cand_ixs]) df_list_rg_bins += list(b_cand_rg_bins[non_match_cand_ixs]) df_list_rg_uncs += list(b_cand_rg_uncs[non_match_cand_ixs]) df_list_rg_targets += [[0.]*self.cf.regression_n_features] * non_match_cand_ixs.shape[0] df_list_rg_bin_targets += [0.] * non_match_cand_ixs.shape[0] df_list_pids += [pid] * non_match_cand_ixs.shape[0] df_list_type += ['det_fp'] * non_match_cand_ixs.shape[0] # fn: if not 0 in non_match_gt_ixs.shape: df_list_preds += [0] * non_match_gt_ixs.shape[0] df_list_labels += [1] * non_match_gt_ixs.shape[0] df_list_class_preds += [cl] * non_match_gt_ixs.shape[0] df_list_n_missing += [np.nan] * non_match_gt_ixs.shape[0] if self.regress_flag: df_list_regressions += [[0.]*self.cf.regression_n_features] * non_match_gt_ixs.shape[0] df_list_rg_bins += [0.] 
* non_match_gt_ixs.shape[0] df_list_rg_uncs += [np.nan] * non_match_gt_ixs.shape[0] df_list_rg_targets += list(b_tar_regs[non_match_gt_ixs]) df_list_rg_bin_targets += list(b_tar_rg_bins[non_match_gt_ixs]) df_list_pids += [pid] * non_match_gt_ixs.shape[0] df_list_type += ['det_fn'] * non_match_gt_ixs.shape[0] # only fp: if not 0 in b_cand_boxes.shape and 0 in b_tar_boxes.shape: # means there is no gt in all samples! any preds have to be fp. df_list_preds += list(b_cand_scores) df_list_labels += [0] * b_cand_boxes.shape[0] df_list_class_preds += [cl] * b_cand_boxes.shape[0] df_list_n_missing += list(b_cand_n_missing) if self.regress_flag: df_list_regressions += list(b_cand_regs) df_list_rg_bins += list(b_cand_rg_bins) df_list_rg_uncs += list(b_cand_rg_uncs) df_list_rg_targets += [[0.]*self.cf.regression_n_features] * b_cand_boxes.shape[0] df_list_rg_bin_targets += [0.] * b_cand_boxes.shape[0] df_list_pids += [pid] * b_cand_boxes.shape[0] df_list_type += ['det_fp'] * b_cand_boxes.shape[0] # only fn: if 0 in b_cand_boxes.shape and not 0 in b_tar_boxes.shape: df_list_preds += [0] * b_tar_boxes.shape[0] df_list_labels += [1] * b_tar_boxes.shape[0] df_list_class_preds += [cl] * b_tar_boxes.shape[0] df_list_n_missing += [np.nan] * b_tar_boxes.shape[0] if self.regress_flag: df_list_regressions += [[0.]*self.cf.regression_n_features] * b_tar_boxes.shape[0] df_list_rg_bins += [0.] * b_tar_boxes.shape[0] df_list_rg_uncs += [np.nan] * b_tar_boxes.shape[0] df_list_rg_targets += list(b_tar_regs) df_list_rg_bin_targets += list(b_tar_rg_bins) df_list_pids += [pid] * b_tar_boxes.shape[0] df_list_type += ['det_fn'] * b_tar_boxes.shape[0] # empty patient with 0 detections needs empty patient score, in order to not disappear from stats. # filtered out for roi-level evaluation later. During training (and val_sampling), # tn are assigned per sample independently of associated patients. # i.e., patient_tn is also meant as sample_tn if a list of samples is evaluated instead of whole patient if len(df_list_pids) == len_df_list_before_patient: df_list_preds += [0] df_list_labels += [0] df_list_class_preds += [cl] df_list_n_missing += [np.nan] if self.regress_flag: df_list_regressions += [[0.]*self.cf.regression_n_features] df_list_rg_bins += [0.] df_list_rg_uncs += [np.nan] df_list_rg_targets += [[0.]*self.cf.regression_n_features] df_list_rg_bin_targets += [0.] df_list_pids += [pid] df_list_type += ['patient_tn'] # true negative: no ground truth boxes, no detections. df_list_match_iou += [match_iou] * (len(df_list_preds) - len(df_list_match_iou)) self.test_df = pd.DataFrame() self.test_df['pred_score'] = df_list_preds self.test_df['class_label'] = df_list_labels # class labels are gt, 0,1, only indicate neg/pos (or bg/fg) remapped from all classes self.test_df['pred_class'] = df_list_class_preds # can be diff than 0,1 self.test_df['pid'] = df_list_pids self.test_df['det_type'] = df_list_type self.test_df['fold'] = self.cf.fold self.test_df['match_iou'] = df_list_match_iou self.test_df['cluster_n_missing'] = df_list_n_missing if self.regress_flag: self.test_df['regressions'] = df_list_regressions self.test_df['rg_targets'] = df_list_rg_targets self.test_df['rg_uncertainties'] = df_list_rg_uncs self.test_df['rg_bins'] = df_list_rg_bins # super weird error: pandas does not properly add an attribute if column is named "rg_bin_targets" ... ?!? 
self.test_df['rg_bin_target'] = df_list_rg_bin_targets assert hasattr(self.test_df, "rg_bin_target") #fn_df = self.test_df[self.test_df["det_type"] == "det_fn"] pass def evaluate_predictions(self, results_list, monitor_metrics=None): """ Performs the matching of predicted boxes and ground truth boxes. Loops over list of matching IoUs and foreground classes. Resulting info of each prediction is stored as one line in an internal dataframe, with the keys: det_type: 'tp' (true positive), 'fp' (false positive), 'fn' (false negative), 'tn' (true negative) pred_class: foreground class which the object predicts. pid: corresponding patient-id. pred_score: confidence score [0, 1] fold: corresponding fold of CV. match_iou: utilized IoU for matching. :param results_list: list of model predictions. Either from train/val_sampling (patch processing) for monitoring with form: [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...] Or from val_patient/testing (patient processing), with form: [[results_0, pid_0], [results_1, pid_1], ...]) :param monitor_metrics (optional): dict of dicts with all metrics of previous epochs. :return monitor_metrics: if provided (during training), return monitor_metrics now including results of current epoch. """ # gets results_list = [[batch_instances_box_lists], [batch_instances_pids]]*n_batches # we want to evaluate one batch_instance (= 2D or 3D image) at a time. self.logger.info('evaluating in mode {}'.format(self.mode)) batch_res_dicts = [batch[0] for batch in results_list] # len: nr of batches in epoch if self.mode == 'train' or self.mode=='val_sampling': # one pid per batch element # [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...] # -> [pid_0, pid_1, ...] # additional list wrapping to make conform with below per-patient batches, where one pid is linked to more than one batch instance pid_list = [batch_instance_pid for batch in results_list for batch_instance_pid in batch[1]] elif self.mode == "val_patient" or self.mode=="test": # [[results_0, pid_0], [results_1, pid_1], ...] -> [pid_0, pid_1, ...] # in patientbatchiterator there is only one pid per batch pid_list = [np.unique(batch[1]) for batch in results_list] assert np.all([len(pid)==1 for pid in pid_list]), "pid list in patient-eval mode, should only contain a single scalar per patient: {}".format(pid_list) pid_list = [pid[0] for pid in pid_list] else: raise Exception("undefined run mode encountered") self.eval_losses(batch_res_dicts) self.eval_segmentations(batch_res_dicts, pid_list) self.eval_boxes(batch_res_dicts, pid_list, self.cf.class_dict) if monitor_metrics is not None: # return all_stats, updated monitor_metrics return self.return_metrics(self.test_df, self.cf.class_dict, monitor_metrics) def return_metrics(self, df, obj_cl_dict, monitor_metrics=None, boxes_only=False): """ Calculates metric scores for internal data frame. Called directly from evaluate_predictions during training for monitoring, or from score_test_df during inference (for single folds or aggregated test set). Loops over foreground classes and score_levels ('roi' and/or 'patient'), gets scores and stores them. Optionally creates plots of prediction histograms and ROC/PR curves. :param df: Data frame that holds evaluated predictions. :param obj_cl_dict: Dict linking object-class ids to object-class names. E.g., {1: "bikes", 2 : "cars"}. Set in configs as cf.class_dict. :param monitor_metrics: dict of dicts with all metrics of previous epochs. 
This function adds metrics for current epoch and returns the same object. :param boxes_only: whether to produce metrics only for the boxes, not the segmentations. :return: all_stats: list. Contains dicts with resulting scores for each combination of foreground class and score_level. :return: monitor_metrics """ # -------------- monitoring independent of class, score level ------------ if monitor_metrics is not None: for l_name in self.epoch_losses: monitor_metrics[l_name] = [self.epoch_losses[l_name]] # -------------- metrics calc dependent on class, score level ------------ all_stats = [] # all_stats: one entry per score_level per class for cl in list(obj_cl_dict.keys()): # bg eval is neglected cl_name = obj_cl_dict[cl] cl_df = df[df.pred_class == cl] if hasattr(self, "seg_df") and not boxes_only: dice_col = self.cf.seg_id2label[cl].name+"_dice" seg_cl_df = self.seg_df.loc[:,['pid', dice_col, 'fold']] for score_level in self.cf.report_score_level: stats_dict = {} stats_dict['name'] = 'fold_{} {} {}'.format(self.cf.fold, score_level, cl_name) # -------------- RoI-based ----------------- if score_level == 'rois': stats_dict['auc'] = np.nan stats_dict['roc'] = np.nan if monitor_metrics is not None: tn = len(cl_df[cl_df.det_type == "patient_tn"]) tp = len(cl_df[(cl_df.det_type == "det_tp")&(cl_df.pred_score>self.cf.min_det_thresh)]) fp = len(cl_df[(cl_df.det_type == "det_fp")&(cl_df.pred_score>self.cf.min_det_thresh)]) fn = len(cl_df[cl_df.det_type == "det_fn"]) sens = np.divide(tp, (fn + tp)) monitor_metrics.update({"Bin_Stats/" + cl_name + "_fp": [fp], "Bin_Stats/" + cl_name + "_tp": [tp], "Bin_Stats/" + cl_name + "_fn": [fn], "Bin_Stats/" + cl_name + "_tn": [tn], "Bin_Stats/" + cl_name + "_sensitivity": [sens]}) # list wrapping only needed bc other metrics are recorded over all epochs; spec_df = cl_df[cl_df.det_type != 'patient_tn'] if self.regress_flag: # filter false negatives out for regression-only eval since regressor didn't predict truncd_df = spec_df[(((spec_df.det_type == "det_fp") | ( spec_df.det_type == "det_tp")) & spec_df.pred_score > self.cf.min_det_thresh)] truncd_df_tp = truncd_df[truncd_df.det_type == "det_tp"] weights, weights_tp = truncd_df.pred_score.tolist(), truncd_df_tp.pred_score.tolist() y_true, y_pred = truncd_df.rg_targets.tolist(), truncd_df.regressions.tolist() stats_dict["rg_RMSE"] = RMSE(y_true, y_pred) stats_dict["rg_MAE"] = MAE(y_true, y_pred) stats_dict["rg_RMSE_weighted"] = RMSE(y_true, y_pred, weights) stats_dict["rg_MAE_weighted"] = MAE(y_true, y_pred, weights) y_true, y_pred = truncd_df_tp.rg_targets.tolist(), truncd_df_tp.regressions.tolist() stats_dict["rg_MAE_weighted_tp"] = MAE(y_true, y_pred, weights_tp) stats_dict["rg_MAE_w_std_weighted_tp"] = MAE_w_std(y_true, y_pred, weights_tp) y_true, y_pred = truncd_df.rg_bin_target.tolist(), truncd_df.rg_bins.tolist() stats_dict["rg_bin_accuracy"] = accuracy(y_true, y_pred) stats_dict["rg_bin_accuracy_weighted"] = accuracy(y_true, y_pred, weights) y_true, y_pred = truncd_df_tp.rg_bin_target.tolist(), truncd_df_tp.rg_bins.tolist() stats_dict["rg_bin_accuracy_weighted_tp"] = accuracy(y_true, y_pred, weights_tp) if np.any(~truncd_df.rg_uncertainties.isna()): # det_fn are expected to be NaN so they drop out in means stats_dict.update({"rg_uncertainty": truncd_df.rg_uncertainties.mean(), "rg_uncertainty_tp": truncd_df_tp.rg_uncertainties.mean(), "rg_uncertainty_tp_weighted": (truncd_df_tp.rg_uncertainties * truncd_df_tp.pred_score).sum() / truncd_df_tp.pred_score.sum() }) if (spec_df.class_label==1).any(): 
stats_dict['ap'] = get_roi_ap_from_df((spec_df, self.cf.min_det_thresh, self.cf.per_patient_ap)) stats_dict['prc'] = precision_recall_curve(spec_df.class_label.tolist(), spec_df.pred_score.tolist()) if self.regress_flag: stats_dict['avp'] = roi_avp((spec_df, self.cf.min_det_thresh, self.cf.per_patient_ap)) else: stats_dict['ap'] = np.nan stats_dict['prc'] = np.nan stats_dict['avp'] = np.nan # np.nan is formattable by __format__ as a float, None-type is not if hasattr(self, "seg_df") and not boxes_only: stats_dict["dice"] = seg_cl_df.loc[:,dice_col].mean() # mean per all rois in this epoch stats_dict["dice_std"] = seg_cl_df.loc[:,dice_col].std() # for the aggregated test set case, additionally get the scores of averaging over fold results. if self.cf.evaluate_fold_means and len(df.fold.unique()) > 1: aps = [] for fold in df.fold.unique(): fold_df = spec_df[spec_df.fold == fold] if (fold_df.class_label==1).any(): aps.append(get_roi_ap_from_df((fold_df, self.cf.min_det_thresh, self.cf.per_patient_ap))) stats_dict['ap_folds_mean'] = np.mean(aps) if len(aps)>0 else np.nan stats_dict['ap_folds_std'] = np.std(aps) if len(aps)>0 else np.nan stats_dict['auc_folds_mean'] = np.nan stats_dict['auc_folds_std'] = np.nan if self.regress_flag: avps, accuracies, MAEs = [], [], [] for fold in df.fold.unique(): fold_df = spec_df[spec_df.fold == fold] if (fold_df.class_label == 1).any(): avps.append(roi_avp((fold_df, self.cf.min_det_thresh, self.cf.per_patient_ap))) truncd_df_tp = fold_df[((fold_df.det_type == "det_tp") & fold_df.pred_score > self.cf.min_det_thresh)] weights_tp = truncd_df_tp.pred_score.tolist() y_true, y_pred = truncd_df_tp.rg_bin_target.tolist(), truncd_df_tp.rg_bins.tolist() accuracies.append(accuracy(y_true, y_pred, weights_tp)) y_true, y_pred = truncd_df_tp.rg_targets.tolist(), truncd_df_tp.regressions.tolist() MAEs.append(MAE_w_std(y_true, y_pred, weights_tp)) stats_dict['avp_folds_mean'] = np.mean(avps) if len(avps) > 0 else np.nan stats_dict['avp_folds_std'] = np.std(avps) if len(avps) > 0 else np.nan stats_dict['rg_bin_accuracy_weighted_tp_folds_mean'] = np.mean(accuracies) if len(accuracies) > 0 else np.nan stats_dict['rg_bin_accuracy_weighted_tp_folds_std'] = np.std(accuracies) if len(accuracies) > 0 else np.nan stats_dict['rg_MAE_w_std_weighted_tp_folds_mean'] = np.mean(MAEs, axis=0) if len(MAEs) > 0 else np.nan stats_dict['rg_MAE_w_std_weighted_tp_folds_std'] = np.std(MAEs, axis=0) if len(MAEs) > 0 else np.nan if hasattr(self, "seg_df") and not boxes_only and self.cf.evaluate_fold_means and len(seg_cl_df.fold.unique()) > 1: fold_means = seg_cl_df.groupby(['fold'], as_index=True).agg({dice_col:"mean"}) stats_dict["dice_folds_mean"] = float(fold_means.mean()) stats_dict["dice_folds_std"] = float(fold_means.std()) # -------------- patient-based ----------------- # on patient level, aggregate predictions per patient (pid): The patient predicted score is the highest # confidence prediction for this class. The patient class label is 1 if roi of this class exists in patient, else 0. if score_level == 'patient': #this is the critical part in patient scoring: only the max gt and max pred score are taken per patient! #--> does mix up values from separate detections spec_df = cl_df.groupby(['pid'], as_index=False) agg_args = {'class_label': 'max', 'pred_score': 'max', 'fold': 'first'} if self.regress_flag: # pandas throws error if aggregated value is np.array, not if is list. 
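                    # added illustration (hypothetical values): a patient whose rois carry pred_scores [0.3, 0.8] and
                    # class_labels [0, 1] collapses to a single row with pred_score 0.8 and class_label 1; the lambdas
                    # below keep the regression vector of the roi with the largest norm and return it as a list for the
                    # reason stated above.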
agg_args.update({'regressions': lambda series: list(series.iloc[np.argmax(series.apply(np.linalg.norm).values)]), 'rg_targets': lambda series: list(series.iloc[np.argmax(series.apply(np.linalg.norm).values)]), 'rg_bins': 'max', 'rg_bin_target': 'max', 'rg_uncertainties': 'max' }) if hasattr(cl_df, "cluster_n_missing"): agg_args.update({'cluster_n_missing': 'mean'}) spec_df = spec_df.agg(agg_args) if len(spec_df.class_label.unique()) > 1: stats_dict['auc'] = roc_auc_score(spec_df.class_label.tolist(), spec_df.pred_score.tolist()) stats_dict['roc'] = roc_curve(spec_df.class_label.tolist(), spec_df.pred_score.tolist()) else: stats_dict['auc'] = np.nan stats_dict['roc'] = np.nan if (spec_df.class_label == 1).any(): patient_cl_labels = spec_df.class_label.tolist() stats_dict['ap'] = average_precision_score(patient_cl_labels, spec_df.pred_score.tolist()) stats_dict['prc'] = precision_recall_curve(patient_cl_labels, spec_df.pred_score.tolist()) if self.regress_flag: avp_scores = spec_df[spec_df.rg_bins == spec_df.rg_bin_target].pred_score.tolist() avp_scores += [0.] * (len(patient_cl_labels) - len(avp_scores)) stats_dict['avp'] = average_precision_score(patient_cl_labels, avp_scores) else: stats_dict['ap'] = np.nan stats_dict['prc'] = np.nan stats_dict['avp'] = np.nan if self.regress_flag: y_true, y_pred = spec_df.rg_targets.tolist(), spec_df.regressions.tolist() stats_dict["rg_RMSE"] = RMSE(y_true, y_pred) stats_dict["rg_MAE"] = MAE(y_true, y_pred) stats_dict["rg_bin_accuracy"] = accuracy(spec_df.rg_bin_target.tolist(), spec_df.rg_bins.tolist()) stats_dict["rg_uncertainty"] = spec_df.rg_uncertainties.mean() if hasattr(self, "seg_df") and not boxes_only: seg_cl_df = seg_cl_df.groupby(['pid'], as_index=False).agg( {dice_col: "mean", "fold": "first"}) # mean of all rois per patient in this epoch stats_dict["dice"] = seg_cl_df.loc[:,dice_col].mean() #mean of all patients stats_dict["dice_std"] = seg_cl_df.loc[:, dice_col].std() # for the aggregated test set case, additionally get the scores for averaging over fold results. if self.cf.evaluate_fold_means and len(df.fold.unique()) > 1 and self.mode in ["test", "analysis"]: aucs = [] aps = [] for fold in df.fold.unique(): fold_df = spec_df[spec_df.fold == fold] if (fold_df.class_label==1).any(): aps.append( average_precision_score(fold_df.class_label.tolist(), fold_df.pred_score.tolist())) if len(fold_df.class_label.unique())>1: aucs.append(roc_auc_score(fold_df.class_label.tolist(), fold_df.pred_score.tolist())) stats_dict['auc_folds_mean'] = np.mean(aucs) stats_dict['auc_folds_std'] = np.std(aucs) stats_dict['ap_folds_mean'] = np.mean(aps) stats_dict['ap_folds_std'] = np.std(aps) if hasattr(self, "seg_df") and not boxes_only and self.cf.evaluate_fold_means and len(seg_cl_df.fold.unique()) > 1: fold_means = seg_cl_df.groupby(['fold'], as_index=True).agg({dice_col:"mean"}) stats_dict["dice_folds_mean"] = float(fold_means.mean()) stats_dict["dice_folds_std"] = float(fold_means.std()) all_stats.append(stats_dict) # -------------- monitoring, visualisation ----------------- # fill new results into monitor_metrics dict. for simplicity, only one class (of interest) is monitored on patient level. 
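                # added illustration (hypothetical config): with cf.class_dict = {1: "bikes", 2: "cars"},
                # cf.metrics = ['ap', 'auc'] and patient_class_of_interest = 2, the loop below appends to keys such as
                # 'cars_ap', 'cars_auc' (roi level) and 'patient_cars_ap', 'patient_cars_auc' (patient level).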
patient_interests = [self.cf.class_dict[self.cf.patient_class_of_interest],] if hasattr(self.cf, "bin_dict"): patient_interests += [self.cf.bin_dict[self.cf.patient_bin_of_interest]] if monitor_metrics is not None and (score_level != 'patient' or cl_name in patient_interests): name = 'patient_'+cl_name if score_level == 'patient' else cl_name for metric in self.cf.metrics: if metric in stats_dict.keys(): monitor_metrics[name + '_'+metric].append(stats_dict[metric]) else: print("WARNING: skipped monitor metric {}_{} since not avail".format(name, metric)) # histograms if self.cf.plot_prediction_histograms: out_filename = os.path.join(self.hist_dir, 'pred_hist_{}_{}_{}_{}'.format( self.cf.fold, self.mode, score_level, cl_name)) plg.plot_prediction_hist(self.cf, spec_df, out_filename) # analysis of the hyper-parameter cf.min_det_thresh, for optimization on validation set. if self.cf.scan_det_thresh and "val" in self.mode: conf_threshs = list(np.arange(0.8, 1, 0.02)) pool = Pool(processes=self.cf.n_workers) mp_inputs = [[spec_df, ii, self.cf.per_patient_ap] for ii in conf_threshs] aps = pool.map(get_roi_ap_from_df, mp_inputs, chunksize=1) pool.close() pool.join() self.logger.info('results from scanning over det_threshs: {}'.format([[i, j] for i, j in zip(conf_threshs, aps)])) class_means = pd.DataFrame(columns=self.cf.report_score_level) for slevel in self.cf.report_score_level: level_stats = pd.DataFrame([stats for stats in all_stats if slevel in stats["name"]])[self.cf.metrics] class_means.loc[:, slevel] = level_stats.mean() all_stats.extend([{"name": 'fold_{} {} {}'.format(self.cf.fold, slevel, "class_means"), **level_means} for slevel, level_means in class_means.to_dict().items()]) if self.cf.plot_stat_curves: out_filename = os.path.join(self.curves_dir, '{}_{}_stat_curves'.format(self.cf.fold, self.mode)) plg.plot_stat_curves(self.cf, all_stats, out_filename) if self.cf.plot_prediction_histograms and hasattr(df, "cluster_n_missing") and df.cluster_n_missing.notna().any(): out_filename = os.path.join(self.hist_dir, 'n_missing_hist_{}_{}.png'.format(self.cf.fold, self.mode)) plg.plot_wbc_n_missing(self.cf, df, outfile=out_filename) return all_stats, monitor_metrics + def write_to_results_table(self, stats, metrics_to_score): + """Write overall results to a common inter-experiment table. 
+        :param stats: list of stats dicts as produced by return_metrics (one entry per score level and class, plus class means).
+        :param metrics_to_score: names of the metrics to write out.
+        :return:
+        """
+        results_table_path = os.path.join(self.cf.test_dir, "../../", 'results_table.csv')
+        with open(results_table_path, 'a') as handle:
+            # ---column headers---
+            handle.write('\n{},'.format("Experiment Name"))
+            handle.write('{},'.format("Time Stamp"))
+            handle.write('{},'.format("Samples Seen"))
+            handle.write('{},'.format("Spatial Dim"))
+            handle.write('{},'.format("Patch Size"))
+            handle.write('{},'.format("CV Folds"))
+            handle.write('{},'.format("{}-clustering IoU".format(self.cf.clustering)))
+            handle.write('{},'.format("Merge-2D-to-3D IoU"))
+            if hasattr(self.cf, "test_against_exact_gt"):
+                handle.write('{},'.format('Exact GT'))
+            for s in stats:
+                if self.cf.class_dict[self.cf.patient_class_of_interest] in s['name'] or "mean" in s["name"]:
+                    for metric in metrics_to_score:
+                        if metric in s.keys() and not np.isnan(s[metric]):
+                            if metric == 'ap':
+                                handle.write('{}_{} : {}_{},'.format(*s['name'].split(" ")[1:], metric,
+                                                                     int(np.mean(self.cf.ap_match_ious) * 100)))
+                            elif not "folds_std" in metric:
+                                handle.write('{}_{} : {},'.format(*s['name'].split(" ")[1:], metric))
+                        else:
+                            print("WARNING: skipped metric {} since not avail".format(metric))
+            handle.write('\n')
+
+            # --- columns content---
+            handle.write('{},'.format(self.cf.exp_dir.split(os.sep)[-1]))
+            handle.write('{},'.format(time.strftime("%d%b%y %H:%M:%S")))
+            handle.write('{},'.format(self.cf.num_epochs * self.cf.num_train_batches * self.cf.batch_size))
+            handle.write('{}D,'.format(self.cf.dim))
+            handle.write('{},'.format("x".join([str(self.cf.patch_size[i]) for i in range(self.cf.dim)])))
+            handle.write('{},'.format(str(self.test_df.fold.unique().tolist()).replace(",", "")))
+            handle.write('{},'.format(self.cf.clustering_iou if self.cf.clustering else str("N/A")))
+            handle.write('{},'.format(self.cf.merge_3D_iou if self.cf.merge_2D_to_3D_preds else str("N/A")))
+            if hasattr(self.cf, "test_against_exact_gt"):
+                handle.write('{},'.format(self.cf.test_against_exact_gt))
+            for s in stats:
+                if self.cf.class_dict[self.cf.patient_class_of_interest] in s['name'] or "mean" in s["name"]:
+                    for metric in metrics_to_score:
+                        if metric in s.keys() and not np.isnan(
+                                s[metric]):  # needed as long as no dice on patient level possible
+                            if "folds_mean" in metric:
+                                handle.write('{:0.3f}\u00B1{:0.3f}, '.format(s[metric],
+                                                                             s["_".join((*metric.split("_")[:-1], "std"))]))
+                            elif not "folds_std" in metric:
+                                handle.write('{:0.3f}, '.format(s[metric]))
+
+            handle.write('\n')

    def score_test_df(self, max_fold=None, internal_df=True):
        """
        Writes out resulting scores to text files: First checks for class-internal-df (typically current) fold,
        gets resulting scores, writes them to a text file and pickles the data frame. Also checks if data-frame
        pickles of all folds of cross-validation exist in exp_dir. If true, loads all dataframes, aggregates test
        sets over folds, and calculates and writes out overall metrics.
        """
        # this should maybe be extended to auc, ap stds.
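        # added illustration (hypothetical config): with cf.metrics = ['ap', 'auc'] and cf.evaluate_fold_means = True,
        # the 'overall' run further below additionally scores 'ap_folds_mean', 'ap_folds_std', 'auc_folds_mean',
        # 'auc_folds_std'.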
metrics_to_score = self.cf.metrics.copy() # + [ m+ext for m in self.cf.metrics if "dice" in m for ext in ["_std"]] if internal_df: self.test_df.to_pickle(os.path.join(self.cf.test_dir, '{}_test_df.pkl'.format(self.cf.fold))) if hasattr(self, "seg_df"): self.seg_df.to_pickle(os.path.join(self.cf.test_dir, '{}_test_seg_df.pkl'.format(self.cf.fold))) stats, _ = self.return_metrics(self.test_df, self.cf.class_dict) with open(os.path.join(self.cf.test_dir, 'results.txt'), 'a') as handle: handle.write('\n****************************\n') handle.write('\nresults for fold {}, {} \n'.format(self.cf.fold, time.strftime("%d/%m/%y %H:%M:%S"))) handle.write('\n****************************\n') handle.write('\nfold df shape {}\n \n'.format(self.test_df.shape)) for s in stats: for metric in metrics_to_score: if metric in s.keys(): #needed as long as no dice on patient level poss if "accuracy" in metric: handle.write('{} {:0.4f} '.format(metric, s[metric])) else: handle.write('{} {:0.3f} '.format(metric, s[metric])) else: print("WARNING: skipped metric {} since not avail".format(metric)) handle.write('{} \n'.format(s['name'])) if max_fold is None: max_fold = self.cf.n_cv_splits-1 if self.cf.fold == max_fold: print("max fold/overall stats triggered") self.cf.fold = 'overall' if self.cf.evaluate_fold_means: metrics_to_score += [m + ext for m in self.cf.metrics for ext in ("_folds_mean", "_folds_std")] if not self.cf.hold_out_test_set or not self.cf.ensemble_folds: - fold_df_paths = sorted([ii for ii in os.listdir(self.cf.test_dir) if 'test_df.pkl' in ii]) - fold_seg_df_paths = sorted([ii for ii in os.listdir(self.cf.test_dir) if 'test_seg_df.pkl' in ii]) + fold_df_paths = sorted([ii for ii in os.listdir(self.cf.test_dir) + if 'test_df.pkl' in ii and not "overall" in ii]) + fold_seg_df_paths = sorted([ii for ii in os.listdir(self.cf.test_dir) + if 'test_seg_df.pkl' in ii and not "overall" in ii]) for paths in [fold_df_paths, fold_seg_df_paths]: assert len(paths) <= self.cf.n_cv_splits, "found {} > nr of cv splits results dfs in {}".format( len(paths), self.cf.test_dir) with open(os.path.join(self.cf.test_dir, 'results.txt'), 'a') as handle: - - dfs_list = [pd.read_pickle(os.path.join(self.cf.test_dir, ii)) for ii in fold_df_paths] seg_dfs_list = [pd.read_pickle(os.path.join(self.cf.test_dir, ii)) for ii in fold_seg_df_paths] self.test_df = pd.concat(dfs_list, sort=True) if len(seg_dfs_list)>0: self.seg_df = pd.concat(seg_dfs_list, sort=True) stats, _ = self.return_metrics(self.test_df, self.cf.class_dict) handle.write('\n****************************\n') handle.write('\nOVERALL RESULTS \n') handle.write('\n****************************\n') handle.write('\ndf shape \n \n'.format(self.test_df.shape)) for s in stats: for metric in metrics_to_score: if metric in s.keys(): handle.write('{} {:0.3f} '.format(metric, s[metric])) handle.write('{} \n'.format(s['name'])) - results_table_path = os.path.join(self.cf.test_dir,"../../", 'results_table.csv') - with open(results_table_path, 'a') as handle: - #---column headers--- - handle.write('\n{},'.format("Experiment Name")) - handle.write('{},'.format("Time Stamp")) - handle.write('{},'.format("Samples Seen")) - handle.write('{},'.format("Spatial Dim")) - handle.write('{},'.format("Patch Size")) - handle.write('{},'.format("CV Folds")) - handle.write('{},'.format("{}-clustering IoU".format(self.cf.clustering))) - handle.write('{},'.format("Merge-2D-to-3D IoU")) - if hasattr(self.cf, "test_against_exact_gt"): - handle.write('{},'.format('Exact GT')) - for s in stats: - if 
self.cf.class_dict[self.cf.patient_class_of_interest] in s['name'] or "mean" in s["name"]: - for metric in metrics_to_score: - if metric in s.keys() and not np.isnan(s[metric]): - if metric=='ap': - handle.write('{}_{} : {}_{},'.format(*s['name'].split(" ")[1:], metric, int(np.mean(self.cf.ap_match_ious)*100))) - elif not "folds_std" in metric: - handle.write('{}_{} : {},'.format(*s['name'].split(" ")[1:], metric)) - else: - print("WARNING: skipped metric {} since not avail".format(metric)) - handle.write('\n') - - #--- columns content--- - handle.write('{},'.format(self.cf.exp_dir.split(os.sep)[-1])) - handle.write('{},'.format(time.strftime("%d%b%y %H:%M:%S"))) - handle.write('{},'.format(self.cf.num_epochs*self.cf.num_train_batches*self.cf.batch_size)) - handle.write('{}D,'.format(self.cf.dim)) - handle.write('{},'.format("x".join([str(self.cf.patch_size[i]) for i in range(self.cf.dim)]))) - handle.write('{},'.format(str(self.test_df.fold.unique().tolist()).replace(",", ""))) - handle.write('{},'.format(self.cf.clustering_iou if self.cf.clustering else str("N/A"))) - handle.write('{},'.format(self.cf.merge_3D_iou if self.cf.merge_2D_to_3D_preds else str("N/A"))) - if hasattr(self.cf, "test_against_exact_gt"): - handle.write('{},'.format(self.cf.test_against_exact_gt)) - for s in stats: - if self.cf.class_dict[self.cf.patient_class_of_interest] in s['name'] or "mean" in s["name"]: - for metric in metrics_to_score: - if metric in s.keys() and not np.isnan(s[metric]): # needed as long as no dice on patient level possible - if "folds_mean" in metric: - handle.write('{:0.3f}\u00B1{:0.3f}, '.format(s[metric], s["_".join((*metric.split("_")[:-1], "std"))])) - elif not "folds_std" in metric: - handle.write('{:0.3f}, '.format(s[metric])) - - handle.write('\n') + self.write_to_results_table(stats, metrics_to_score) with open(os.path.join(self.cf.test_dir, 'results_extr_scores.txt'), 'w') as handle: handle.write('\n****************************\n') handle.write('\nextremal scores for fold {} \n'.format(self.cf.fold)) handle.write('\n****************************\n') # want: pid & fold (&other) of highest scoring tp & fp in test_df for cl in self.cf.class_dict.keys(): print("\nClass {}".format(self.cf.class_dict[cl]), file=handle) cl_df = self.test_df[self.test_df.pred_class == cl] #.dropna(axis=1) for det_type in ['det_tp', 'det_fp']: filtered_df = cl_df[cl_df.det_type==det_type] print("\nHighest scoring {} of class {}".format(det_type, self.cf.class_dict[cl]), file=handle) if len(filtered_df)>0: print(filtered_df.loc[filtered_df.pred_score.idxmax()], file=handle) else: print("No detections of type {} for class {} in this df".format(det_type, self.cf.class_dict[cl]), file=handle) handle.write('\n****************************\n') diff --git a/utils/exp_utils.py b/utils/exp_utils.py index fc22592..698e1d0 100644 --- a/utils/exp_utils.py +++ b/utils/exp_utils.py @@ -1,727 +1,727 @@ #!/usr/bin/env python # Copyright 2019 Division of Medical Image Computing, German Cancer Research Center (DKFZ). # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== from typing import Union, Iterable import sys import os import subprocess from multiprocessing import Process import threading import pickle import importlib.util import psutil import time import nvidia_smi import logging from torch.utils.tensorboard import SummaryWriter from collections import OrderedDict import numpy as np import pandas as pd import torch def import_module(name, path): """ correct way of importing a module dynamically in python 3. :param name: name given to module instance. :param path: path to module. :return: module: returned module instance. """ spec = importlib.util.spec_from_file_location(name, path) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) return module def save_obj(obj, name): """Pickle a python object.""" with open(name + '.pkl', 'wb') as f: pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) def load_obj(file_path): with open(file_path, 'rb') as handle: return pickle.load(handle) def IO_safe(func, *args, _tries=5, _raise=True, **kwargs): """ Wrapper calling function func with arguments args and keyword arguments kwargs to catch input/output errors on cluster. :param func: function to execute (intended to be read/write operation to a problematic cluster drive, but can be any function). :param args: positional args of func. :param kwargs: kw args of func. :param _tries: how many attempts to make executing func. """ for _try in range(_tries): try: return func(*args, **kwargs) except OSError as e: # to catch cluster issues with network drives if _raise: raise e else: print("After attempting execution {} time{}, following error occurred:\n{}".format(_try + 1, "" if _try == 0 else "s", e)) continue def split_off_process(target, *args, daemon=False, **kwargs): """Start a process that won't block parent script. No join(), no return value. If daemon=False: before parent exits, it waits for this to finish. """ p = Process(target=target, args=tuple(args), kwargs=kwargs, daemon=daemon) p.start() return p def query_nvidia_gpu(device_id, d_keyword=None, no_units=False): """ :param device_id: :param d_keyword: -d, --display argument (keyword(s) for selective display), all are selected if None :return: dict of gpu-info items """ cmd = ['nvidia-smi', '-i', str(device_id), '-q'] if d_keyword is not None: cmd += ['-d', d_keyword] outp = subprocess.check_output(cmd).strip().decode('utf-8').split("\n") outp = [x for x in outp if len(x) > 0] headers = [ix for ix, item in enumerate(outp) if len(item.split(":")) == 1] + [len(outp)] out_dict = {} for lix, hix in enumerate(headers[:-1]): head = outp[hix].strip().replace(" ", "_").lower() out_dict[head] = {} for lix2 in range(hix, headers[lix + 1]): try: key, val = [x.strip().lower() for x in outp[lix2].split(":")] if no_units: val = val.split()[0] out_dict[head][key] = val except: pass return out_dict class _AnsiColorizer(object): """ A colorizer is an object that loosely wraps around a stream, allowing callers to write text to the stream in a particular color. Colorizer classes must implement C{supported()} and C{write(text, color)}. 
""" _colors = dict(black=30, red=31, green=32, yellow=33, blue=34, magenta=35, cyan=36, white=37, default=39) def __init__(self, stream): self.stream = stream @classmethod def supported(cls, stream=sys.stdout): """ A class method that returns True if the current platform supports coloring terminal output using this method. Returns False otherwise. """ if not stream.isatty(): return False # auto color only on TTYs try: import curses except ImportError: return False else: try: try: return curses.tigetnum("colors") > 2 except curses.error: curses.setupterm() return curses.tigetnum("colors") > 2 except: raise # guess false in case of error return False def write(self, text, color): """ Write the given text to the stream in the given color. @param text: Text to be written to the stream. @param color: A string label for a color. e.g. 'red', 'white'. """ color = self._colors[color] self.stream.write('\x1b[%sm%s\x1b[0m' % (color, text)) class ColorHandler(logging.StreamHandler): def __init__(self, stream=sys.stdout): super(ColorHandler, self).__init__(_AnsiColorizer(stream)) def emit(self, record): msg_colors = { logging.DEBUG: "green", logging.INFO: "default", logging.WARNING: "red", logging.ERROR: "red" } color = msg_colors.get(record.levelno, "blue") self.stream.write(record.msg + "\n", color) class CombinedPrinter(object): """combined print function. prints to logger and/or file if given, to normal print if non given. """ def __init__(self, logger=None, file=None): if logger is None and file is None: self.out = [print] elif logger is None: self.out = [file.write] elif file is None: self.out = [logger.info] else: self.out = [logger.info, file.write] def __call__(self, string): for fct in self.out: fct(string) class Nvidia_GPU_Logger(object): def __init__(self): self.count = None def get_vals(self): # cmd = ['nvidia-settings', '-t', '-q', 'GPUUtilization'] # gpu_util = subprocess.check_output(cmd).strip().decode('utf-8').split(",") # gpu_util = dict([f.strip().split("=") for f in gpu_util]) # cmd[-1] = 'UsedDedicatedGPUMemory' # gpu_used_mem = subprocess.check_output(cmd).strip().decode('utf-8') nvidia_smi.nvmlInit() # card id 0 hardcoded here, there is also a call to get all available card ids, so we could iterate self.gpu_handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0) util_res = nvidia_smi.nvmlDeviceGetUtilizationRates(self.gpu_handle) #mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(self.gpu_handle) # current_vals = {"gpu_mem_alloc": mem_res.used / (1024**2), "gpu_graphics_util": int(gpu_util['graphics']), # "gpu_mem_util": gpu_util['memory'], "time": time.time()} current_vals = {"gpu_graphics_util": float(util_res.gpu), "time": time.time()} return current_vals def loop(self, interval): i = 0 while True: current_vals = self.get_vals() self.log["time"].append(time.time()) self.log["gpu_util"].append(current_vals["gpu_graphics_util"]) if self.count is not None: i += 1 if i == self.count: exit(0) time.sleep(self.interval) def start(self, interval=1.): self.interval = interval self.start_time = time.time() self.log = {"time": [], "gpu_util": []} if self.interval is not None: thread = threading.Thread(target=self.loop) thread.daemon = True thread.start() class CombinedLogger(object): """Combine console and tensorboard logger and record system metrics. 
""" def __init__(self, name, log_dir, server_env=True, fold="all", sysmetrics_interval=2): self.pylogger = logging.getLogger(name) self.tboard = SummaryWriter(log_dir=os.path.join(log_dir, "tboard")) self.times = {} self.log_dir = log_dir self.fold = str(fold) self.server_env = server_env self.pylogger.setLevel(logging.DEBUG) self.log_file = os.path.join(log_dir, "fold_"+self.fold, 'exec.log') os.makedirs(os.path.dirname(self.log_file), exist_ok=True) self.pylogger.addHandler(logging.FileHandler(self.log_file)) if not server_env: self.pylogger.addHandler(ColorHandler()) else: self.pylogger.addHandler(logging.StreamHandler()) self.pylogger.propagate = False # monitor system metrics (cpu, mem, ...) if not server_env and sysmetrics_interval > 0: self.sysmetrics = pd.DataFrame( columns=["global_step", "rel_time", r"CPU (%)", "mem_used (GB)", r"mem_used (%)", r"swap_used (GB)", r"gpu_utilization (%)"], dtype="float16") for device in range(torch.cuda.device_count()): self.sysmetrics[ "mem_allocd (GB) by torch on {:10s}".format(torch.cuda.get_device_name(device))] = np.nan self.sysmetrics[ "mem_cached (GB) by torch on {:10s}".format(torch.cuda.get_device_name(device))] = np.nan self.sysmetrics_start(sysmetrics_interval) pass else: print("NOT logging sysmetrics") def __getattr__(self, attr): """delegate all undefined method requests to objects of this class in order pylogger, tboard (first find first serve). E.g., combinedlogger.add_scalars(...) should trigger self.tboard.add_scalars(...) """ for obj in [self.pylogger, self.tboard]: if attr in dir(obj): return getattr(obj, attr) print("logger attr not found") #raise AttributeError("CombinedLogger has no attribute {}".format(attr)) def set_logfile(self, fold=None, log_file=None): if fold is not None: self.fold = str(fold) if log_file is None: self.log_file = os.path.join(self.log_dir, "fold_"+self.fold, 'exec.log') else: self.log_file = log_file os.makedirs(os.path.dirname(self.log_file), exist_ok=True) for hdlr in self.pylogger.handlers: hdlr.close() self.pylogger.handlers = [] self.pylogger.addHandler(logging.FileHandler(self.log_file)) if not self.server_env: self.pylogger.addHandler(ColorHandler()) else: self.pylogger.addHandler(logging.StreamHandler()) def time(self, name, toggle=None): """record time-spans as with a stopwatch. :param name: :param toggle: True^=On: start time recording, False^=Off: halt rec. if None determine from current status. 
        :return: either start-time or last recorded interval
        """
        if toggle is None:
            if name in self.times.keys():
                toggle = not self.times[name]["toggle"]
            else:
                toggle = True

        if toggle:
            if not name in self.times.keys():
                self.times[name] = {"total": 0, "last": 0}
            elif self.times[name]["toggle"] == toggle:
                self.info("restarting running stopwatch")
            self.times[name]["last"] = time.time()
            self.times[name]["toggle"] = toggle
            return time.time()
        else:
            if toggle == self.times[name]["toggle"]:
                self.info("WARNING: tried to stop stopped stop watch: {}.".format(name))
            self.times[name]["last"] = time.time() - self.times[name]["last"]
            self.times[name]["total"] += self.times[name]["last"]
            self.times[name]["toggle"] = toggle
            return self.times[name]["last"]

    def get_time(self, name=None, kind="total", format=None, reset=False):
        """
        :param name:
        :param kind: 'total' or 'last'
        :param format: None for float, "hms"/"ms" for (hours), mins, secs as string
        :param reset: reset time after retrieving
        :return:
        """
        if name is None:
            times = self.times
            if reset:
                self.reset_time()
            return times
        else:
            if self.times[name]["toggle"]:
                self.time(name, toggle=False)
            time = self.times[name][kind]
            if format == "hms":
                m, s = divmod(time, 60)
                h, m = divmod(m, 60)
                time = "{:d}h:{:02d}m:{:02d}s".format(int(h), int(m), int(s))
            elif format == "ms":
                m, s = divmod(time, 60)
                time = "{:02d}m:{:02d}s".format(int(m), int(s))
            if reset:
                self.reset_time(name)
            return time

    def reset_time(self, name=None):
        if name is None:
            self.times = {}
        else:
            del self.times[name]

    def sysmetrics_update(self, global_step=None):
        if global_step is None:
            global_step = time.strftime("%x_%X")
        mem = psutil.virtual_memory()
        mem_used = (mem.total - mem.available)
        gpu_vals = self.gpu_logger.get_vals()
        rel_time = time.time() - self.sysmetrics_start_time
        self.sysmetrics.loc[len(self.sysmetrics)] = [global_step, rel_time,
                                                     psutil.cpu_percent(), mem_used / 1024 ** 3,
                                                     mem_used / mem.total * 100,
                                                     psutil.swap_memory().used / 1024 ** 3,
                                                     int(gpu_vals['gpu_graphics_util']),
                                                     *[torch.cuda.memory_allocated(d) / 1024 ** 3
                                                       for d in range(torch.cuda.device_count())],
                                                     *[torch.cuda.memory_cached(d) / 1024 ** 3
                                                       for d in range(torch.cuda.device_count())]
                                                     ]
        return self.sysmetrics.loc[len(self.sysmetrics) - 1].to_dict()

    def sysmetrics2tboard(self, metrics=None, global_step=None, suptitle=None):
        tag = "per_time"
        if metrics is None:
            metrics = self.sysmetrics_update(global_step=global_step)
            tag = "per_epoch"
        if suptitle is not None:
            suptitle = str(suptitle)
        elif self.fold != "":
            suptitle = "Fold_" + str(self.fold)
        if suptitle is not None:
            self.tboard.add_scalars(suptitle + "/System_Metrics/" + tag,
                                    {k: v for (k, v) in metrics.items() if (k != "global_step"
                                                                            and k != "rel_time")}, global_step)

    def sysmetrics_loop(self):
        try:
            os.nice(-19)
            self.info("Logging system metrics with superior process priority.")
        except:
            self.info("Logging system metrics without superior process priority.")
        while True:
            metrics = self.sysmetrics_update()
            self.sysmetrics2tboard(metrics, global_step=metrics["rel_time"])
            # print("thread alive", self.thread.is_alive())
            time.sleep(self.sysmetrics_interval)

    def sysmetrics_start(self, interval):
        if interval is not None and interval > 0:
            self.sysmetrics_interval = interval
            self.gpu_logger = Nvidia_GPU_Logger()
            self.sysmetrics_start_time = time.time()
            self.sys_metrics_process = split_off_process(target=self.sysmetrics_loop, daemon=True)
            # self.thread = threading.Thread(target=self.sysmetrics_loop)
            # self.thread.daemon = True
            # self.thread.start()

    def sysmetrics_save(self, out_file):
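        """Persist the collected system-metrics dataframe as a pickle file at out_file."""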
        self.sysmetrics.to_pickle(out_file)

    def metrics2tboard(self, metrics, global_step=None, suptitle=None):
        """
        :param metrics: {'train': dataframe, 'val':df}, df as produced in
            evaluator.py.evaluate_predictions
        """
        # print("metrics", metrics)
        if global_step is None:
            global_step = len(metrics['train'][list(metrics['train'].keys())[0]]) - 1
        if suptitle is not None:
            suptitle = str(suptitle)
        else:
            suptitle = "Fold_" + str(self.fold)

        for key in ['train', 'val']:
            # series = {k:np.array(v[-1]) for (k,v) in metrics[key].items() if not np.isnan(v[-1]) and not 'Bin_Stats' in k}
            loss_series = {}
            unc_series = {}
            bin_stat_series = {}
            mon_met_series = {}
            for tag, val in metrics[key].items():
                val = val[-1]  # maybe remove list wrapping, recording in evaluator?
                if 'bin_stats' in tag.lower() and not np.isnan(val):
                    bin_stat_series["{}".format(tag.split("/")[-1])] = val
                elif 'uncertainty' in tag.lower() and not np.isnan(val):
                    unc_series["{}".format(tag)] = val
                elif 'loss' in tag.lower() and not np.isnan(val):
                    loss_series["{}".format(tag)] = val
                elif not np.isnan(val):
                    mon_met_series["{}".format(tag)] = val
            self.tboard.add_scalars(suptitle + "/Binary_Statistics/{}".format(key), bin_stat_series, global_step)
            self.tboard.add_scalars(suptitle + "/Uncertainties/{}".format(key), unc_series, global_step)
            self.tboard.add_scalars(suptitle + "/Losses/{}".format(key), loss_series, global_step)
            self.tboard.add_scalars(suptitle + "/Monitor_Metrics/{}".format(key), mon_met_series, global_step)
        self.tboard.add_scalars(suptitle + "/Learning_Rate", metrics["lr"], global_step)
        return

    def batchImgs2tboard(self, batch, results_dict, cmap, boxtype2color, img_bg=False, global_step=None):
        raise NotImplementedError("not up-to-date, problem with importing plotting-file, torchvision dependency.")

        if len(batch["seg"].shape) == 5:  # 3D imgs
            slice_ix = np.random.randint(batch["seg"].shape[-1])
            seg_gt = plg.to_rgb(batch['seg'][:, 0, :, :, slice_ix], cmap)
            seg_pred = plg.to_rgb(results_dict['seg_preds'][:, 0, :, :, slice_ix], cmap)
            mod_img = plg.mod_to_rgb(batch["data"][:, 0, :, :, slice_ix]) if img_bg else None
        elif len(batch["seg"].shape) == 4:
            seg_gt = plg.to_rgb(batch['seg'][:, 0, :, :], cmap)
            seg_pred = plg.to_rgb(results_dict['seg_preds'][:, 0, :, :], cmap)
            mod_img = plg.mod_to_rgb(batch["data"][:, 0]) if img_bg else None
        else:
            raise Exception("batch content has wrong format: {}".format(batch["seg"].shape))

        # from here on only works in 2D
        seg_gt = np.transpose(seg_gt, axes=(0, 3, 1, 2))  # previous shp: b,x,y,c
        seg_pred = np.transpose(seg_pred, axes=(0, 3, 1, 2))

        seg = np.concatenate((seg_gt, seg_pred), axis=0)
        # todo replace torchvision (tv) dependency
        seg = tv.utils.make_grid(torch.from_numpy(seg), nrow=2)
        self.tboard.add_image("Batch seg, 1st col: gt, 2nd: pred.", seg, global_step=global_step)

        if img_bg:
            bg_img = np.transpose(mod_img, axes=(0, 3, 1, 2))
        else:
            bg_img = seg_gt
        box_imgs = plg.draw_boxes_into_batch(bg_img, results_dict["boxes"], boxtype2color)
        box_imgs = tv.utils.make_grid(torch.from_numpy(box_imgs), nrow=4)
        self.tboard.add_image("Batch bboxes", box_imgs, global_step=global_step)

        return

    def __del__(self):  # otherwise might produce multiple prints e.g. in ipython console
        # self.sys_metrics_process.terminate()
        for hdlr in self.pylogger.handlers:
            hdlr.close()
        self.pylogger.handlers = []
        del self.pylogger
        self.tboard.flush()
        # close holds up main script exit. maybe revise this issue with a later pytorch version.
        # self.tboard.close()


def get_logger(exp_dir, server_env=False, sysmetrics_interval=2):
    log_dir = os.path.join(exp_dir, "logs")
    logger = CombinedLogger('Reg R-CNN', log_dir, server_env=server_env,
                            sysmetrics_interval=sysmetrics_interval)
    print("logging to {}".format(logger.log_file))
    return logger


def prepare_monitoring(cf):
    """ creates dictionaries, where train/val metrics are stored.
    """
    metrics = {}
    # first entry for loss dict accounts for epoch starting at 1.
    metrics['train'] = OrderedDict()  # [(l_name, [np.nan]) for l_name in cf.losses_to_monitor] )
    metrics['val'] = OrderedDict()  # [(l_name, [np.nan]) for l_name in cf.losses_to_monitor] )
    metric_classes = []
    if 'rois' in cf.report_score_level:
        metric_classes.extend([v for k, v in cf.class_dict.items()])
        if hasattr(cf, "eval_bins_separately") and cf.eval_bins_separately:
            metric_classes.extend([v for k, v in cf.bin_dict.items()])
    if 'patient' in cf.report_score_level:
        metric_classes.extend(['patient_' + cf.class_dict[cf.patient_class_of_interest]])
        if hasattr(cf, "eval_bins_separately") and cf.eval_bins_separately:
            metric_classes.extend(['patient_' + cf.bin_dict[cf.patient_bin_of_interest]])
    for cl in metric_classes:
        for m in cf.metrics:
            metrics['train'][cl + '_' + m] = [np.nan]
            metrics['val'][cl + '_' + m] = [np.nan]

    return metrics


class ModelSelector:
    '''
    saves a checkpoint after each epoch as 'last_state' (can be loaded to continue interrupted training).
    saves the top-k (k=cf.save_n_models) ranked epochs. In inference, predictions of multiple epochs can be ensembled
    to improve performance.
    '''

    def __init__(self, cf, logger):
        self.cf = cf
        self.logger = logger

        self.model_index = pd.DataFrame(columns=["rank", "score", "criteria_values", "file_name"],
                                        index=pd.RangeIndex(self.cf.min_save_thresh, self.cf.num_epochs, name="epoch"))

    def run_model_selection(self, net, optimizer, monitor_metrics, epoch):
        """rank epoch via weighted mean from self.cf.model_selection_criteria: {criterion : weight}
        :param net:
        :param optimizer:
        :param monitor_metrics:
        :param epoch:
        :return:
        """
        crita = self.cf.model_selection_criteria  # shorter alias
        metrics = monitor_metrics['val']

        epoch_score = np.sum([metrics[criterion][-1] * weight for criterion, weight in crita.items()
                              if not np.isnan(metrics[criterion][-1])])
        if not self.cf.resume:
            epoch_score_check = np.sum([metrics[criterion][epoch] * weight for criterion, weight in crita.items()
                                        if not np.isnan(metrics[criterion][epoch])])
            assert np.all(epoch_score == epoch_score_check)

        self.model_index.loc[epoch, ["score", "criteria_values"]] = epoch_score, {cr: metrics[cr][-1] for cr in crita.keys()}

        nonna_ics = self.model_index["score"].dropna(axis=0).index
        order = np.argsort(self.model_index.loc[nonna_ics, "score"].to_numpy(), kind="stable")[::-1]
        self.model_index.loc[nonna_ics, "rank"] = np.argsort(order) + 1  # no zero-indexing for ranks (best rank is 1).
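        # this epoch's rank among all epochs scored so far (rank 1 = highest weighted score);
        # only the cf.save_n_models best-ranked epochs keep a parameter checkpoint on disk.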
        rank = int(self.model_index.loc[epoch, "rank"])
        if rank <= self.cf.save_n_models:
            name = '{}_best_params.pth'.format(epoch)
            if self.cf.server_env:
                IO_safe(torch.save, net.state_dict(), os.path.join(self.cf.fold_dir, name))
            else:
                torch.save(net.state_dict(), os.path.join(self.cf.fold_dir, name))
            self.model_index.loc[epoch, "file_name"] = name
            self.logger.info("saved current epoch {} at rank {}".format(epoch, rank))

            clean_up = self.model_index.dropna(axis=0, subset=["file_name"])
            clean_up = clean_up[clean_up["rank"] > self.cf.save_n_models]
            if clean_up.size > 0:
                file_name = clean_up["file_name"].to_numpy().item()
                subprocess.call("rm {}".format(os.path.join(self.cf.fold_dir, file_name)), shell=True)
                self.logger.info("removed outranked epoch {} at {}".format(clean_up.index.values.item(),
                                                                           os.path.join(self.cf.fold_dir, file_name)))
                self.model_index.loc[clean_up.index, "file_name"] = np.nan

        state = {
            'epoch': epoch,
            'state_dict': net.state_dict(),
            'optimizer': optimizer.state_dict(),
            'model_index': self.model_index,
        }

        if self.cf.server_env:
            IO_safe(torch.save, state, os.path.join(self.cf.fold_dir, 'last_state.pth'))
        else:
            torch.save(state, os.path.join(self.cf.fold_dir, 'last_state.pth'))


def parse_params_for_optim(net: torch.nn.Module, weight_decay: float = 0., exclude_from_wd: Iterable = ("norm",)):
    """Format network parameters for the optimizer.
    Convenience function to include options for group-specific settings like weight decay.
    :param net:
    :param weight_decay:
    :param exclude_from_wd: List of strings of parameter-group names to exclude from weight decay. Options: "norm", "bias".
    :return:
    """
    # pytorch implements parameter groups as dicts {'params': ...} and
    # weight decay as p.data.mul_(1 - group['lr'] * group['weight_decay'])
    norm_types = [torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d,
                  torch.nn.InstanceNorm1d, torch.nn.InstanceNorm2d, torch.nn.InstanceNorm3d,
                  torch.nn.LayerNorm, torch.nn.GroupNorm, torch.nn.SyncBatchNorm, torch.nn.LocalResponseNorm
                  ]
    level_map = {"bias": "weight", "norm": "module"}
    type_map = {"norm": norm_types}

    exclude_from_wd = [str(name).lower() for name in exclude_from_wd]
    exclude_weight_names = [k for k, v in level_map.items() if k in exclude_from_wd and v == "weight"]
    exclude_module_types = tuple([type_ for k, v in level_map.items() if (k in exclude_from_wd and v == "module")
                                  for type_ in type_map[k]])

    if exclude_from_wd:
        print("excluding {} from weight decay.".format(exclude_from_wd))

    with_dec, no_dec = [], []
    for name, module in net.named_modules():
        if isinstance(module, exclude_module_types):
            no_dec.extend(module.parameters())
        else:
-            for param_name, param in module.named_parameters():
+            for param_name, param in module.named_parameters(recurse=False):
                if np.any([ename in param_name for ename in exclude_weight_names]):
                    no_dec.append(param)
                else:
                    with_dec.append(param)
    groups = [{'params': gr, 'weight_decay': wd} for gr, wd in [(no_dec, 0.), (with_dec, weight_decay)] if len(gr) > 0]
    return groups


def load_checkpoint(checkpoint_path, net, optimizer, model_selector):
    checkpoint = torch.load(checkpoint_path)
    net.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    model_selector.model_index = checkpoint["model_index"]
    return checkpoint['epoch'] + 1, net, optimizer, model_selector


def prep_exp(dataset_path, exp_path, server_env, use_stored_settings=True, is_training=True):
    """
    I/O handling, creation of experiment folder structure. Also creates a snapshot of configs/model scripts and copies
    them to the exp_dir.
    This way the exp_dir contains all info needed to conduct an experiment, independent of changes in the actual
    source code. Thus, training/inference of this experiment can be started at any time. For that purpose, the model
    and backbone scripts are copied into the exp_dir and loaded from there.
    Provides robust structure for cloud deployment.
    :param dataset_path: path to source code for specific data set. (e.g. medicaldetectiontoolkit/lidc_exp)
    :param exp_path: path to experiment directory.
    :param server_env: boolean flag. pass to configs script for cloud deployment.
    :param use_stored_settings: boolean flag. When starting training: If True, starts training from snapshot in existing
        experiment directory, else creates experiment directory on the fly using configs/model scripts from source code.
    :param is_training: boolean flag. distinguishes train vs. inference mode.
    :return: configs object.
    """

    if is_training:
        if use_stored_settings:
            cf_file = import_module('cf', os.path.join(exp_path, 'configs.py'))
            cf = cf_file.Configs(server_env)
            # in this mode, previously saved model and backbone need to be found in exp dir.
            if not os.path.isfile(os.path.join(exp_path, 'model.py')) or \
                    not os.path.isfile(os.path.join(exp_path, 'backbone.py')):
                raise Exception(
                    "Selected use_stored_settings option but no model and/or backbone source files exist in exp dir.")
            cf.model_path = os.path.join(exp_path, 'model.py')
            cf.backbone_path = os.path.join(exp_path, 'backbone.py')
        else:  # this case overwrites settings files in exp dir, i.e., default_configs, configs, backbone, model
            os.makedirs(exp_path, exist_ok=True)
            # run training with source code info and copy snapshot of model to exp_dir for later testing
            # (overwrite scripts if exp_dir already exists.)
            subprocess.call('cp {} {}'.format('default_configs.py', os.path.join(exp_path, 'default_configs.py')),
                            shell=True)
            subprocess.call(
                'cp {} {}'.format(os.path.join(dataset_path, 'configs.py'), os.path.join(exp_path, 'configs.py')),
                shell=True)
            cf_file = import_module('cf_file', os.path.join(dataset_path, 'configs.py'))
            cf = cf_file.Configs(server_env)
            subprocess.call('cp {} {}'.format(cf.model_path, os.path.join(exp_path, 'model.py')), shell=True)
            subprocess.call('cp {} {}'.format(cf.backbone_path, os.path.join(exp_path, 'backbone.py')), shell=True)
            if os.path.isfile(os.path.join(exp_path, "fold_ids.pickle")):
                subprocess.call('rm {}'.format(os.path.join(exp_path, "fold_ids.pickle")), shell=True)

    else:  # testing, use model and backbone stored in exp dir.
        cf_file = import_module('cf', os.path.join(exp_path, 'configs.py'))
        cf = cf_file.Configs(server_env)
        cf.model_path = os.path.join(exp_path, 'model.py')
        cf.backbone_path = os.path.join(exp_path, 'backbone.py')

    cf.exp_dir = exp_path
    cf.test_dir = os.path.join(cf.exp_dir, 'test')
    cf.plot_dir = os.path.join(cf.exp_dir, 'plots')
    if not os.path.exists(cf.test_dir):
        os.mkdir(cf.test_dir)
    if not os.path.exists(cf.plot_dir):
        os.mkdir(cf.plot_dir)
    cf.experiment_name = exp_path.split("/")[-1]
    cf.dataset_name = dataset_path
    cf.server_env = server_env
    cf.created_fold_id_pickle = False

    return cf
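

# ----------------------------------------------------------------------------------------------------------
# Minimal end-to-end usage sketch of the utilities above, kept as comments so that importing this module stays
# side-effect free. The paths, the config attributes read from `cf`, and the network class `MyNet` are
# hypothetical placeholders; the real values come from the dataset-specific configs.py and the chosen model.
#
#   cf = prep_exp("datasets/my_dataset", "experiments/my_exp", server_env=False,
#                 use_stored_settings=False, is_training=True)
#   logger = get_logger(cf.exp_dir, cf.server_env)
#   monitor_metrics = prepare_monitoring(cf)
#   model_selector = ModelSelector(cf, logger)
#
#   net = MyNet(cf, logger).cuda()                                      # hypothetical model class
#   optimizer = torch.optim.AdamW(parse_params_for_optim(net, weight_decay=1e-5, exclude_from_wd=("norm",)),
#                                 lr=1e-4)
#   if cf.resume:
#       starting_epoch, net, optimizer, model_selector = load_checkpoint(
#           os.path.join(cf.fold_dir, "last_state.pth"), net, optimizer, model_selector)
#
#   for epoch in range(1, cf.num_epochs + 1):
#       # ... train and evaluate; append new values to monitor_metrics['train'/'val'] and set
#       # monitor_metrics['lr'] to a dict of current learning rates (used by metrics2tboard) ...
#       logger.metrics2tboard(monitor_metrics, global_step=epoch)
#       model_selector.run_model_selection(net, optimizer, monitor_metrics, epoch)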