class DefaultConfigs:
    """Default settings shared by all experiments.

    Experiment configs are expected to build on these values so that a general
    setting can be changed in one place instead of in every experiment.

    :param model: model name; used to derive ``model_path`` ('models/<model>.py').
    :param server_env: if truthy, ``source_dir`` is replaced by the hard-coded cluster path.
    :param dim: stored as ``self.dim``.
    """

    def __init__(self, model, server_env=None, dim=2):
        self.server_env = server_env

        #########################
        #         I/O           #
        #########################

        self.model = model
        self.dim = dim
        # int [0 < dataset_size]. select n patients from dataset for prototyping.
        self.select_prototype_subset = None

        # some default paths.
        self.backbone_path = 'models/backbone.py'
        self.source_dir = os.path.dirname(os.path.realpath(__file__))  # current dir.
        self.input_df_name = 'info_df.pickle'
        self.model_path = 'models/{}.py'.format(self.model)

        if server_env:
            self.source_dir = '/home/jaegerp/code/mamma_code/medicaldetectiontoolkit'

        #########################
        #      Data Loader      #
        #########################

        # random seed for fold_generator and batch_generator.
        self.seed = 0

        # number of threads for multithreaded batch generation: all cores but one.
        # os.cpu_count() may return None (undeterminable) and is 1 on single-core hosts;
        # always keep at least one worker instead of raising TypeError / ending up with 0.
        self.n_workers = max(1, (os.cpu_count() or 1) - 1)

        # if True, segmentation losses learn all categories, else only foreground vs. background.
        self.class_specific_seg_flag = False

        #########################
        #      Architecture     #
        #########################

        self.weight_decay = 0.0

        # nonlinearity to be applied after convs with nonlinearity. one of 'relu' or 'leaky_relu'
        self.relu = 'relu'

        # if True initializes weights as specified in model script. else use default Pytorch init.
        self.custom_init = False

        # if True adds high-res decoder levels to feature pyramid: P1 + P0. (e.g. set to true in retina_unet configs)
        self.operate_stride1 = False

        #########################
        #       Schedule        #
        #########################

        # number of folds in cross validation.
        self.n_cv_splits = 5

        # number of probabilistic samples in validation.
        self.n_probabilistic_samples = None

        #########################
        #   Testing / Plotting  #
        #########################

        # perform mirroring at test time. (only XY. Z not done to not blow up predictions times).
        self.test_aug = True

        # if True, test data lies in a separate folder and is not part of the cross validation.
        self.hold_out_test_set = False

        # if hold_out_test_set provided, ensemble predictions over models of all trained cv-folds.
        # implications for hold-out test sets: if True, evaluate folds separately on the test set, aggregate only the
        # evaluations. if False, aggregate the raw predictions across all folds, then evaluate.
        # NOTE(review): the first comment line and the "implications" lines read as contradictory.
        # Evaluator.score_test_df only concatenates the per-fold data frames when hold_out_test_set
        # or ensemble_folds is False - confirm which description of True/False is the intended one.
        self.ensemble_folds = False

        # color specifications for all box_types in prediction_plot.
        self.box_color_palette = {'det': 'b', 'gt': 'r', 'neg_class': 'purple',
                                  'prop': 'w', 'pos_class': 'g', 'pos_anchor': 'c', 'neg_anchor': 'c'}

        # scan over confidence score in evaluation to optimize it on the validation set.
        self.scan_det_thresh = False

        # plots roc-curves / prc-curves in evaluation.
        self.plot_stat_curves = False

        # evaluates average precision per image and averages over images. instead computing one ap over data set.
        self.per_patient_ap = False

        # threshold for clustering 2D box predictions to 3D Cubes. Overlap is computed in XY.
        self.merge_3D_iou = 0.1

        # monitor any value from training.
        self.n_monitoring_figures = 1
        # dict to assign specific plot_values to monitor_figures > 0. {1: ['class_loss'], 2: ['kl_loss', 'kl_sigmas']}
        self.assign_values_to_extra_figure = {}

        # save predictions to csv file in experiment dir.
        self.save_preds_to_csv = True

        # select a maximum number of patient cases to test. number or "all" for all
        self.max_test_patients = "all"

        #########################
        #         MRCNN         #
        #########################

        # if True, mask loss is not applied. used for data sets, where no pixel-wise annotations are provided.
        self.frcnn_mode = False

        # if True, unmolds masks in Mask R-CNN to full-res for plotting/monitoring.
        self.return_masks_in_val = False
        self.return_masks_in_test = False  # needed if doing instance segmentation. evaluation not yet implemented.

        # add P6 to Feature Pyramid Network.
        self.sixth_pooling = False

        # for probabilistic detection
        self.n_latent_dims = 0
class Evaluator():
    """Matches predicted boxes against ground truth boxes and derives AP / AUC statistics.

    The per-box matching result is kept in ``self.test_df`` (one row per prediction, missed
    ground truth box, or dummy entry for a patient without any entry). Scores are computed
    from that data frame by ``return_metrics`` and written to disk by ``score_test_df``.
    """

    def __init__(self, cf, logger, mode='test'):
        """
        :param cf: config object. attributes are read lazily by the individual methods.
        :param logger: logger exposing ``info``.
        :param mode: either 'train', 'val_sampling', 'val_patient' or 'test'. handles prediction lists of
            different forms.
        """
        self.cf = cf
        self.logger = logger
        self.mode = mode

        # test results go to the test dir, monitoring plots of training/validation to the plot dir.
        self.plot_dir = self.cf.test_dir if self.mode == "test" else self.cf.plot_dir
        if self.cf.plot_prediction_histograms:
            self.hist_dir = os.path.join(self.plot_dir, 'histograms')
            os.makedirs(self.hist_dir, exist_ok=True)
        if self.cf.plot_stat_curves:
            self.curves_dir = os.path.join(self.plot_dir, 'stat_curves')
            os.makedirs(self.curves_dir, exist_ok=True)

    def eval_losses(self, batch_res_dicts):
        """
        Averages each monitored loss over all batches of the epoch and stores the result in
        ``self.epoch_losses`` ({loss_name: mean value}).
        :param batch_res_dicts: list of per-batch result dicts.
        Monitored names are ``cf.losses_to_monitor`` if the config defines it, else every key
        containing 'loss' found in any batch dict. Batches lacking a name are left out of its mean.
        """
        if hasattr(self.cf, "losses_to_monitor"):
            loss_names = self.cf.losses_to_monitor
        else:
            loss_names = {name for b_res_dict in batch_res_dicts for name in b_res_dict if 'loss' in name}
        self.epoch_losses = {
            l_name: torch.tensor([b_res_dict[l_name] for b_res_dict in batch_res_dicts
                                  if l_name in b_res_dict.keys()]).mean().item()
            for l_name in loss_names}

    def eval_boxes(self, batch_res_dicts, pid_list):
        """
        Matches detections to ground truth boxes per matching IoU, class and patient, and stores
        one row per resulting entry in ``self.test_df``.
        :param batch_res_dicts: list of per-batch result dicts. each needs the key 'boxes'.
        :param pid_list: one patient id per entry of the (flattened) box list.
        """
        df_list_preds = []
        df_list_labels = []
        df_list_class_preds = []
        df_list_pids = []
        df_list_type = []
        df_list_match_iou = []

        if self.mode == 'train' or self.mode == 'val_sampling':
            # one pid per batch element
            # batch_size > 1, with varying patients across batch:
            # [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
            # -> [results_0, results_1, ..]
            batch_inst_boxes = [b_res_dict['boxes'] for b_res_dict in batch_res_dicts]  # len: nr of batches in epoch
            batch_inst_boxes = [[b_inst_boxes] for whole_batch_boxes in batch_inst_boxes
                                for b_inst_boxes in whole_batch_boxes]
        else:
            # patient processing, one element per batch = one patient.
            # [[results_0, pid_0], [results_1, pid_1], ...] -> [results_0, results_1, ..]
            batch_inst_boxes = [b_res_dict['boxes'] for b_res_dict in batch_res_dicts]

        assert len(batch_inst_boxes) == len(pid_list)

        for match_iou in self.cf.ap_match_ious:
            self.logger.info('evaluating with match_iou: {}'.format(match_iou))
            for cl in list(self.cf.class_dict.keys()):
                for pix, pid in enumerate(pid_list):
                    len_df_list_before_patient = len(df_list_pids)
                    # input of each batch element is a list of boxes, where each box is a dictionary.
                    for bix, b_boxes_list in enumerate(batch_inst_boxes[pix]):

                        b_tar_boxes = np.array([box['box_coords'] for box in b_boxes_list if
                                                (box['box_type'] == 'gt' and box['box_label'] == cl)])
                        b_cand_boxes = np.array([box['box_coords'] for box in b_boxes_list if
                                                 (box['box_type'] == 'det' and box['box_pred_class_id'] == cl)])
                        b_cand_scores = np.array([box['box_score'] for box in b_boxes_list if
                                                  (box['box_type'] == 'det' and box['box_pred_class_id'] == cl)])

                        # check if predictions and ground truth boxes exist and match them according to match_iou.
                        if 0 not in b_cand_boxes.shape and 0 not in b_tar_boxes.shape:
                            overlaps = mutils.compute_overlaps(b_cand_boxes, b_tar_boxes)
                            match_cand_ixs = np.argwhere(np.max(overlaps, 1) > match_iou)[:, 0]
                            non_match_cand_ixs = np.argwhere(np.max(overlaps, 1) <= match_iou)[:, 0]
                            match_gt_ixs = np.argmax(overlaps[match_cand_ixs, :], 1) \
                                if 0 not in match_cand_ixs.shape else np.array([])
                            non_match_gt_ixs = np.array(
                                [ii for ii in np.arange(b_tar_boxes.shape[0]) if ii not in match_gt_ixs])
                            unique, counts = np.unique(match_gt_ixs, return_counts=True)

                            # check for double assignments, i.e. two predictions having been assigned to the same gt.
                            # according to the COCO-metrics, only one prediction counts as true positive, the rest
                            # counts as false positive. This case is supposed to be avoided by the model itself by,
                            # e.g. using a low enough NMS threshold.
                            if np.any(counts > 1):
                                double_match_gt_ixs = unique[np.argwhere(counts > 1)[:, 0]]
                                keep_max = []
                                double_match_list = []
                                for dg in double_match_gt_ixs:
                                    double_match_cand_ixs = match_cand_ixs[np.argwhere(match_gt_ixs == dg)]
                                    keep_max.append(
                                        double_match_cand_ixs[np.argmax(b_cand_scores[double_match_cand_ixs])])
                                    double_match_list += [ii for ii in double_match_cand_ixs]

                                fp_ixs = np.array([ii for ii in match_cand_ixs if
                                                   (ii in double_match_list and ii not in keep_max)])

                                match_cand_ixs = np.array([ii for ii in match_cand_ixs if ii not in fp_ixs])

                                df_list_preds += [ii for ii in b_cand_scores[fp_ixs]]
                                df_list_labels += [0] * fp_ixs.shape[0]
                                df_list_class_preds += [cl] * fp_ixs.shape[0]
                                df_list_pids += [pid] * fp_ixs.shape[0]
                                df_list_type += ['det_fp'] * fp_ixs.shape[0]

                            # matched:
                            if 0 not in match_cand_ixs.shape:
                                df_list_preds += [ii for ii in b_cand_scores[match_cand_ixs]]
                                df_list_labels += [1] * match_cand_ixs.shape[0]
                                df_list_class_preds += [cl] * match_cand_ixs.shape[0]
                                df_list_pids += [pid] * match_cand_ixs.shape[0]
                                df_list_type += ['det_tp'] * match_cand_ixs.shape[0]
                            # rest fp:
                            if 0 not in non_match_cand_ixs.shape:
                                df_list_preds += [ii for ii in b_cand_scores[non_match_cand_ixs]]
                                df_list_labels += [0] * non_match_cand_ixs.shape[0]
                                df_list_class_preds += [cl] * non_match_cand_ixs.shape[0]
                                df_list_pids += [pid] * non_match_cand_ixs.shape[0]
                                df_list_type += ['det_fp'] * non_match_cand_ixs.shape[0]
                            # rest fn:
                            if 0 not in non_match_gt_ixs.shape:
                                df_list_preds += [0] * non_match_gt_ixs.shape[0]
                                df_list_labels += [1] * non_match_gt_ixs.shape[0]
                                df_list_class_preds += [cl] * non_match_gt_ixs.shape[0]
                                df_list_pids += [pid] * non_match_gt_ixs.shape[0]
                                df_list_type += ['det_fn'] * non_match_gt_ixs.shape[0]
                        # only fp:
                        if 0 not in b_cand_boxes.shape and 0 in b_tar_boxes.shape:
                            df_list_preds += [ii for ii in b_cand_scores]
                            df_list_labels += [0] * b_cand_scores.shape[0]
                            df_list_class_preds += [cl] * b_cand_scores.shape[0]
                            df_list_pids += [pid] * b_cand_scores.shape[0]
                            df_list_type += ['det_fp'] * b_cand_scores.shape[0]
                        # only fn:
                        if 0 in b_cand_boxes.shape and 0 not in b_tar_boxes.shape:
                            df_list_preds += [0] * b_tar_boxes.shape[0]
                            df_list_labels += [1] * b_tar_boxes.shape[0]
                            df_list_class_preds += [cl] * b_tar_boxes.shape[0]
                            df_list_pids += [pid] * b_tar_boxes.shape[0]
                            df_list_type += ['det_fn'] * b_tar_boxes.shape[0]

                    # empty patient with 0 detections needs patient dummy score, in order to not disappear from stats.
                    # filtered out for roi-level evaluation later. During training (and val_sampling),
                    # tn are assigned per sample independently of associated patients.
                    if len(df_list_pids) == len_df_list_before_patient:
                        df_list_preds += [0] * 1
                        df_list_labels += [0] * 1
                        df_list_class_preds += [cl] * 1
                        df_list_pids += [pid] * 1
                        df_list_type += ['patient_tn'] * 1  # true negative: no ground truth boxes, no detections.

            # label every row added for this matching IoU.
            df_list_match_iou += [match_iou] * (len(df_list_preds) - len(df_list_match_iou))

        self.test_df = pd.DataFrame()
        self.test_df['pred_score'] = df_list_preds
        self.test_df['class_label'] = df_list_labels
        self.test_df['pred_class'] = df_list_class_preds
        self.test_df['pid'] = df_list_pids
        self.test_df['det_type'] = df_list_type
        self.test_df['fold'] = self.cf.fold
        self.test_df['match_iou'] = df_list_match_iou

    def evaluate_predictions(self, results_list, monitor_metrics=None):
        """
        Performs the matching of predicted boxes and ground truth boxes. Loops over list of matching IoUs and
        foreground classes. Resulting info of each prediction is stored as one line in an internal dataframe,
        with the keys:
        det_type: 'tp' (true positive), 'fp' (false positive), 'fn' (false negative), 'tn' (true negative)
        pred_class: foreground class which the object predicts.
        pid: corresponding patient-id.
        pred_score: confidence score [0, 1]
        fold: corresponding fold of CV.
        match_iou: utilized IoU for matching.
        :param results_list: list of model predictions. Either from train/val_sampling (patch processing) for
        monitoring with form: [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
        Or from val_patient/testing (patient processing), with form: [[results_0, pid_0], [results_1, pid_1], ...])
        :param monitor_metrics (optional): dict of dicts with all metrics of previous epochs.
        :return monitor_metrics: if provided (during training), return monitor_metrics now including results of
        current epoch. Returns None if monitor_metrics is not provided.
        """
        self.logger.info('evaluating in mode {}'.format(self.mode))

        batch_res_dicts = [batch[0] for batch in results_list]  # len: nr of batches in epoch
        if self.mode == 'train' or self.mode == 'val_sampling':
            # one pid per batch element
            # [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
            # -> [pid_0, pid_1, ...]
            # additional list wrapping to make conform with below per-patient batches, where one pid is linked to
            # more than one batch instance
            pid_list = [batch_instance_pid for batch in results_list for batch_instance_pid in batch[1]]
        elif self.mode == "val_patient" or self.mode == "test":
            # [[results_0, pid_0], [results_1, pid_1], ...] -> [pid_0, pid_1, ...]
            # in patientbatchiterator there is only one pid per batch
            pid_list = [np.unique(batch[1]) for batch in results_list]
            assert np.all([len(pid) == 1 for pid in pid_list]), \
                "pid list in patient-eval mode, should only contain a single scalar per patient: {}".format(
                    pid_list)
            pid_list = [pid[0] for pid in pid_list]
            # todo remove assert
            pid_list_orig = [item[1] for item in results_list]
            assert np.all(pid_list == pid_list_orig)
        else:
            raise Exception("undefined run mode encountered")

        self.eval_losses(batch_res_dicts)
        self.eval_boxes(batch_res_dicts, pid_list)

        if monitor_metrics is not None:
            # return all_stats, updated monitor_metrics
            return self.return_metrics(monitor_metrics)

    def return_metrics(self, monitor_metrics=None):
        """
        calculates AP/AUC scores for internal dataframe. called directly from evaluate_predictions during training
        for monitoring, or from score_test_df during inference (for single folds or aggregated test set).
        Loops over foreground classes and score_levels (typically 'roi' and 'patient'), gets scores and stores them.
        Optionally creates plots of prediction histograms and roc/prc curves.
        :param monitor_metrics: dict of dicts with all metrics of previous epochs.
        this function adds metrics for current epoch and returns the same object.
        :return: all_stats: list. Contains dicts with resulting scores for each combination of foreground class
        and score_level.
        :return: monitor_metrics
        """
        # -------------- monitoring independent of class, score level ------------
        if monitor_metrics is not None:
            for l_name in self.epoch_losses:
                monitor_metrics[l_name] = [self.epoch_losses[l_name]]

        df = self.test_df

        all_stats = []
        for cl in list(self.cf.class_dict.keys()):
            cl_df = df[df.pred_class == cl]

            for score_level in self.cf.report_score_level:
                stats_dict = {}
                stats_dict['name'] = 'fold_{} {} cl_{}'.format(self.cf.fold, score_level, cl)

                if score_level == 'rois':
                    # kick out dummy entries for true negative patients. not needed on roi-level.
                    spec_df = cl_df[cl_df.det_type != 'patient_tn']
                    stats_dict['ap'] = get_roi_ap_from_df([spec_df, self.cf.min_det_thresh, self.cf.per_patient_ap])
                    # AUC not sensible on roi-level, since true negative box predictions do not exist. Would reward
                    # higher amounts of low confidence false positives.
                    stats_dict['auc'] = np.nan
                    stats_dict['roc'] = np.nan
                    stats_dict['prc'] = np.nan

                    # for the aggregated test set case, additionally get the scores for averaging over fold results.
                    if len(df.fold.unique()) > 1:
                        aps = []
                        for fold in df.fold.unique():
                            fold_df = spec_df[spec_df.fold == fold]
                            aps.append(get_roi_ap_from_df([fold_df, self.cf.min_det_thresh, self.cf.per_patient_ap]))
                        stats_dict['mean_ap'] = np.mean(aps)
                        stats_dict['mean_auc'] = 0

                # on patient level, aggregate predictions per patient (pid): The patient predicted score is the
                # highest confidence prediction for this class. The patient class label is 1 if roi of this class
                # exists in patient, else 0.
                if score_level == 'patient':
                    spec_df = cl_df.groupby(['pid'], as_index=False).agg(
                        {'class_label': 'max', 'pred_score': 'max', 'fold': 'first'})

                    if len(spec_df.class_label.unique()) > 1:
                        stats_dict['auc'] = roc_auc_score(spec_df.class_label.tolist(), spec_df.pred_score.tolist())
                        stats_dict['roc'] = roc_curve(spec_df.class_label.tolist(), spec_df.pred_score.tolist())
                    else:
                        stats_dict['auc'] = np.nan
                        stats_dict['roc'] = np.nan

                    if (spec_df.class_label == 1).any():
                        stats_dict['ap'] = average_precision_score(spec_df.class_label.tolist(),
                                                                   spec_df.pred_score.tolist())
                        stats_dict['prc'] = precision_recall_curve(spec_df.class_label.tolist(),
                                                                   spec_df.pred_score.tolist())
                    else:
                        stats_dict['ap'] = np.nan
                        stats_dict['prc'] = np.nan

                    # for the aggregated test set case, additionally get the scores for averaging over fold results.
                    if len(df.fold.unique()) > 1:
                        aucs = []
                        aps = []
                        for fold in df.fold.unique():
                            fold_df = spec_df[spec_df.fold == fold]
                            if len(fold_df.class_label.unique()) > 1:
                                aucs.append(roc_auc_score(fold_df.class_label.tolist(), fold_df.pred_score.tolist()))
                            if (fold_df.class_label == 1).any():
                                aps.append(average_precision_score(fold_df.class_label.tolist(),
                                                                   fold_df.pred_score.tolist()))
                        stats_dict['mean_auc'] = np.mean(aucs)
                        stats_dict['mean_ap'] = np.mean(aps)

                # fill new results into monitor_metrics dict. for simplicity, only one class (of interest) is
                # monitored on patient level.
                if monitor_metrics is not None and not (
                        score_level == 'patient' and cl != self.cf.patient_class_of_interest):
                    score_level_name = 'patient' if score_level == 'patient' else self.cf.class_dict[cl]
                    monitor_metrics[score_level_name + '_ap'].append(
                        stats_dict['ap'] if stats_dict['ap'] > 0 else np.nan)
                    if score_level == 'patient':
                        monitor_metrics[score_level_name + '_auc'].append(
                            stats_dict['auc'] if stats_dict['auc'] > 0 else np.nan)

                if self.cf.plot_prediction_histograms:
                    out_filename = os.path.join(self.hist_dir, 'pred_hist_{}_{}_{}_cl{}'.format(
                        self.cf.fold, 'val' if 'val' in self.mode else self.mode, score_level, cl))
                    type_list = None if score_level == 'patient' else spec_df.det_type.tolist()
                    plotting.plot_prediction_hist(spec_df.class_label.tolist(), spec_df.pred_score.tolist(),
                                                  type_list, out_filename)

                all_stats.append(stats_dict)

                # analysis of the hyper-parameter cf.min_det_thresh, for optimization on validation set.
                # restricted to the roi-level frame: get_roi_ap_from_df needs the det_type / match_iou columns,
                # which the patient-level aggregate does not carry.
                if self.cf.scan_det_thresh and score_level == 'rois':
                    conf_threshs = list(np.arange(0.9, 1, 0.01))
                    mp_inputs = [[spec_df, ii, self.cf.per_patient_ap] for ii in conf_threshs]
                    # context manager releases the worker processes even if a worker raises.
                    with Pool(processes=10) as pool:
                        aps = pool.map(get_roi_ap_from_df, mp_inputs, chunksize=1)
                    # format explicitly: extra positional logging args without a placeholder break the record.
                    self.logger.info('results from scanning over det_threshs: {}'.format(
                        [[i, j] for i, j in zip(conf_threshs, aps)]))

        if self.cf.plot_stat_curves:
            out_filename = os.path.join(self.curves_dir, '{}_{}_stat_curves'.format(self.cf.fold, self.mode))
            plotting.plot_stat_curves(all_stats, out_filename)

        # get average stats over foreground classes on roi level.
        avg_ap = np.mean([d['ap'] for d in all_stats if 'rois' in d['name']])
        all_stats.append({'name': 'average_foreground_roi', 'auc': 0, 'ap': avg_ap})
        if len(df.fold.unique()) > 1:
            avg_mean_ap = np.mean([d['mean_ap'] for d in all_stats if 'rois' in d['name']])
            all_stats[-1]['mean_ap'] = avg_mean_ap
            all_stats[-1]['mean_auc'] = 0

        # in small data sets, values of model_selection_criterion can be identical across epochs, which breaks the
        # ranking of model_selector. Thus, perturb identical values by a negligible random term.
        if monitor_metrics is not None and 'val' in self.mode:
            for sc in self.cf.model_selection_criteria:
                if monitor_metrics[sc].count(monitor_metrics[sc][-1]) > 1 and monitor_metrics[sc][-1] is not None:
                    monitor_metrics[sc][-1] += 1e-6 * np.random.rand()

        return all_stats, monitor_metrics

    def score_test_df(self, internal_df=True):
        """
        Writes out resulting scores to text files: First checks for class-internal-df (typically current) fold,
        gets resulting scores, writes them to a text file and pickles data frame. Also checks if data-frame pickles
        of all folds of cross-validation exist in test_dir. If true (and the current fold is the last one), writes
        overall results: unless both cf.hold_out_test_set and cf.ensemble_folds are set, the per-fold data frames
        are loaded and concatenated and metrics are computed on the aggregate; otherwise only the current
        stats are appended to the results table.
        :param internal_df: if True, pickle self.test_df and append its scores to results.txt first.
        """
        stats = None
        if internal_df:
            self.test_df.to_pickle(os.path.join(self.cf.test_dir, '{}_test_df.pickle'.format(self.cf.fold)))
            stats, _ = self.return_metrics()

            with open(os.path.join(self.cf.test_dir, 'results.txt'), 'a') as handle:
                handle.write('\n****************************\n')
                handle.write('\nresults for fold {} \n'.format(self.cf.fold))
                handle.write('\n****************************\n')
                handle.write('\nfold df shape {}\n \n'.format(self.test_df.shape))
                for s in stats:
                    handle.write('AUC {:0.4f} AP {:0.4f} {} \n'.format(s['auc'], s['ap'], s['name']))

        fold_df_paths = [ii for ii in os.listdir(self.cf.test_dir)
                         if ('test_df.pickle' in ii and 'overall' not in ii)]
        if len(fold_df_paths) == self.cf.n_cv_splits and self.cf.fold == self.cf.n_cv_splits - 1:
            # results table lives next to (one level above) the experiment dir.
            results_table_path = os.path.join(("/").join(self.cf.exp_dir.split("/")[:-1]), 'results_table.txt')

            if not self.cf.hold_out_test_set or not self.cf.ensemble_folds:
                with open(os.path.join(self.cf.test_dir, 'results.txt'), 'a') as handle:
                    self.cf.fold = 'overall'
                    dfs_list = [pd.read_pickle(os.path.join(self.cf.test_dir, ii)) for ii in fold_df_paths]
                    for ix, df in enumerate(dfs_list):
                        df['fold'] = ix
                    self.test_df = pd.concat(dfs_list)
                    stats, _ = self.return_metrics()
                    handle.write('\n****************************\n')
                    handle.write('\nOVERALL RESULTS \n')
                    handle.write('\n****************************\n')
                    # placeholder was missing, so the shape never reached the file.
                    handle.write('\ndf shape {}\n \n'.format(self.test_df.shape))
                    for s in stats:
                        # mean_* entries only exist if more than one fold was aggregated.
                        handle.write('\nAUC {:0.4f} (mu {:0.4f}) AP {:0.4f} (mu {:0.4f}) {}\n '
                                     .format(s['auc'], s.get('mean_auc', np.nan), s['ap'],
                                             s.get('mean_ap', np.nan), s['name']))
                with open(results_table_path, 'a') as handle2:
                    for s in stats:
                        handle2.write('\nAUC {:0.4f} (mu {:0.4f})\t AP {:0.4f} (mu {:0.4f})\t {} {}'
                                      .format(s['auc'], s.get('mean_auc', np.nan), s['ap'],
                                              s.get('mean_ap', np.nan), s['name'],
                                              self.cf.exp_dir.split('/')[-1]))
                    handle2.write('\n')
            else:
                if stats is None:
                    # internal_df=False: no stats were computed above, derive them from the current data frame.
                    stats, _ = self.return_metrics()
                with open(results_table_path, 'a') as handle2:
                    for s in stats:
                        handle2.write('\nAUC {:0.4f} \t\t\t AP {:0.4f} \t\t\t {} {}'
                                      .format(s['auc'], s['ap'], s['name'], self.cf.exp_dir.split('/')[-1]))
                    handle2.write('\n')
def _thresholded_detections(frame, det_thresh):
    """Return the detection rows (tp / fp) of frame scoring above det_thresh, best score first."""
    dets = frame[(frame.det_type == 'det_fp') | (frame.det_type == 'det_tp')]
    dets = dets.sort_values('pred_score', ascending=False)
    return dets[dets.pred_score > det_thresh]


def get_roi_ap_from_df(inputs):
    '''
    Computes roi-level average precision, averaged over all matching IoUs found in the data frame.
    Takes a single packed argument so it can be used directly with Pool.map.
    :param inputs: sequence (df, det_thresh, per_patient_ap) with
        df: data frame with the columns pid, match_iou, class_label, det_type, pred_score.
        det_thresh: min_threshold for filtering out low confidence predictions.
        per_patient_ap: boolean flag. evaluate average precision per image and average over images,
        instead of computing one ap over data set.
    :return: average_precision (float). nan if there is nothing to average (no ground truth objects).
    '''
    df, det_thresh, per_patient_ap = inputs

    aps = []
    if per_patient_ap:
        pids_list = df.pid.unique()
        for match_iou in df.match_iou.unique():
            iou_df = df[df.match_iou == match_iou]
            for pid in pids_list:
                pid_df = iou_df[iou_df.pid == pid]
                # positives are counted before filtering, so that missed objects (fn) lower the recall.
                all_p = len(pid_df[pid_df.class_label == 1])
                pid_df = _thresholded_detections(pid_df, det_thresh)
                if all_p == 0:
                    # no objects in this patient: any detection is a false positive (ap 0),
                    # a patient without detections does not contribute at all.
                    if len(pid_df) > 0:
                        aps.append(0)
                else:
                    aps.append(compute_roi_ap(pid_df, all_p))
    else:
        for match_iou in df.match_iou.unique():
            iou_df = df[df.match_iou == match_iou]
            all_p = len(iou_df[iou_df.class_label == 1])
            iou_df = _thresholded_detections(iou_df, det_thresh)
            if all_p > 0:
                aps.append(compute_roi_ap(iou_df, all_p))

    # same value np.mean([]) would yield, without the "Mean of empty slice" warning.
    if len(aps) == 0:
        return np.nan
    return np.mean(aps)


def compute_roi_ap(df, all_p):
    """
    adapted from: https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py
    Precision is made monotonically decreasing and then sampled at 101 recall thresholds in [0, 1];
    recall levels that are never reached contribute a precision of 0.
    :param df: dataframe containing class labels of predictions sorted in descending manner by their
    prediction score.
    :param all_p: number of all ground truth objects. (for denominator of recall.) callers in this file only
    pass values > 0.
    :return: average precision (float): mean of the sampled precisions.
    """
    tp = df.class_label.values
    nd = len(tp)
    # recall thresholds, where precision will be measured
    R = np.linspace(.0, 1, 101, endpoint=True)
    if nd == 0:
        # no detections: no recall level is reached.
        return 0.0

    fp = (tp == 0) * 1
    tp_sum = np.cumsum(tp)
    fp_sum = np.cumsum(fp)
    rc = tp_sum / all_p
    pr = tp_sum / (fp_sum + tp_sum)

    # precision envelope: each entry becomes the maximum precision found at this or any later rank.
    pr = np.maximum.accumulate(pr[::-1])[::-1]

    # discretize empiric recall steps with given bins. an index equal to nd marks a recall threshold
    # above the highest recall reached; its precision stays 0.
    inds = np.searchsorted(rc, R, side='left')
    reached = inds < nd
    q = np.zeros((len(R),))
    q[reached] = pr[inds[reached]]

    return np.mean(q)
def train(logger):
    """
    perform the training routine for a given fold. saves plots and selected parameters to the experiment dir
    specified in the configs.

    Reads the module-level globals cf, model and data_loader, which the __main__ section assigns before
    calling this function.
    :param logger: logger used for messages and tensorboard metrics; also handed to the net, the evaluators,
    the model selector and the data loader.
    """
    logger.info('performing training in {}D over fold {} on experiment {} with model {}'.format(
        cf.dim, cf.fold, cf.exp_dir, cf.model))

    net = model.net(cf, logger).cuda()
    optimizer = torch.optim.AdamW(net.parameters(), lr=cf.learning_rate[0], weight_decay=cf.weight_decay)
    if cf.dynamic_lr_scheduling:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode=cf.scheduling_mode, factor=cf.lr_decay_factor,
                                                               patience=cf.scheduling_patience)

    model_selector = utils.ModelSelector(cf, logger)
    train_evaluator = Evaluator(cf, logger, mode='train')
    val_evaluator = Evaluator(cf, logger, mode=cf.val_mode)

    starting_epoch = 1

    # prepare monitoring
    monitor_metrics = utils.prepare_monitoring(cf)

    if cf.resume:
        # continue from the last checkpoint of this fold: replaces net, optimizer and monitoring state.
        checkpoint_path = os.path.join(cf.fold_dir, "last_checkpoint")
        starting_epoch, net, optimizer, monitor_metrics = \
            utils.load_checkpoint(checkpoint_path, net, optimizer)
        logger.info('resumed from checkpoint {} to epoch {}'.format(checkpoint_path, starting_epoch))

    logger.info('loading dataset and initializing batch generators...')
    batch_gen = data_loader.get_train_generators(cf, logger)

    for epoch in range(starting_epoch, cf.num_epochs + 1):

        logger.info('starting training epoch {}'.format(epoch))
        start_time = time.time()

        net.train()
        train_results_list = []
        for bix in range(cf.num_train_batches):
            batch = next(batch_gen['train'])
            tic_fw = time.time()
            results_dict = net.train_forward(batch)
            tic_bw = time.time()
            optimizer.zero_grad()
            results_dict['torch_loss'].backward()
            optimizer.step()
            # '\r' + end="" keep the progress report on a single console line.
            print('\rtr. batch {0}/{1} (ep. {2}) fw {3:.2f}s / bw {4:.2f} s / total {5:.2f} s || '.format(
                bix + 1, cf.num_train_batches, epoch, tic_bw - tic_fw, time.time() - tic_bw,
                time.time() - tic_fw) + results_dict['logger_string'], flush=True, end="")
            # keep everything but the segmentation predictions for the epoch-level evaluation.
            train_results_list.append(({k:v for k,v in results_dict.items() if k != "seg_preds"}, batch["pid"]))
        print()

        _, monitor_metrics['train'] = train_evaluator.evaluate_predictions(train_results_list, monitor_metrics['train'])

        # uses the last training batch of the epoch and its results.
        logger.info('generating training example plot.')
        plot_batch_prediction(batch, results_dict, cf, outfile=os.path.join(
            cf.plot_dir, 'pred_example_{}_train.png'.format(cf.fold)))

        train_time = time.time() - start_time

        logger.info('starting validation in mode {}.'.format(cf.val_mode))
        with torch.no_grad():
            net.eval()
            if cf.do_validation:
                val_results_list = []
                val_predictor = Predictor(cf, net, logger, mode='val')
                for _ in range(batch_gen['n_val']):
                    batch = next(batch_gen[cf.val_mode])
                    if cf.val_mode == 'val_patient':
                        results_dict = val_predictor.predict_patient(batch)
                    elif cf.val_mode == 'val_sampling':
                        results_dict = net.train_forward(batch, is_validation=True)
                    #val_results_list.append([results_dict['boxes'], batch['pid']])
                    val_results_list.append(({k:v for k,v in results_dict.items() if k != "seg_preds"}, batch["pid"]))

                _, monitor_metrics['val'] = val_evaluator.evaluate_predictions(val_results_list, monitor_metrics['val'])
                model_selector.run_model_selection(net, optimizer, monitor_metrics, epoch)

            # update monitoring and prediction plots
            monitor_metrics.update({"lr":
                                        {str(g): group['lr'] for (g, group) in enumerate(optimizer.param_groups)}})
            logger.metrics2tboard(monitor_metrics, global_step=epoch)

            epoch_time = time.time() - start_time
            logger.info('trained epoch {}: took {:.2f} s ({:.2f} s train / {:.2f} s val)'.format(
                epoch, epoch_time, train_time, epoch_time-train_time))
            # the example plot always uses a 'val_sampling' batch, independent of cf.val_mode.
            batch = next(batch_gen['val_sampling'])
            results_dict = net.train_forward(batch, is_validation=True)
            logger.info('generating validation-sampling example plot.')
            plot_batch_prediction(batch, results_dict, cf, outfile=os.path.join(
                cf.plot_dir, 'pred_example_{}_val.png'.format(cf.fold)))

        # -------------- scheduling -----------------
        if cf.dynamic_lr_scheduling:
            scheduler.step(monitor_metrics["val"][cf.scheduling_criterion][-1])
        else:
            # NOTE(review): this runs at the end of epoch `epoch`, so cf.learning_rate[epoch-1] only takes
            # effect from the following epoch on - confirm that this one-epoch lag is intended.
            for param_group in optimizer.param_groups:
                param_group['lr'] = cf.learning_rate[epoch-1]


def test(logger):
    """
    perform testing for a given fold (or hold out set). save stats in evaluator.

    Reads the module-level globals cf, model and data_loader, like train().
    :param logger: logger handed to the net, predictor, evaluator and data loader.
    """
    logger.info('starting testing model of fold {} in exp {}'.format(cf.fold, cf.exp_dir))
    net = model.net(cf, logger).cuda()
    test_predictor = Predictor(cf, net, logger, mode='test')
    test_evaluator = Evaluator(cf, logger, mode='test')
    batch_gen = data_loader.get_test_generator(cf, logger)
    test_results_list = test_predictor.predict_test_set(batch_gen, return_results=True)
    # fills the evaluator's internal data frame; scores are written out by score_test_df.
    test_evaluator.evaluate_predictions(test_results_list)
    test_evaluator.score_test_df()
useful in job scheduler environment, ' 'where source code might change before the job actually runs.') parser.add_argument('--resume', action="store_true", default=False, help='if given, resume from checkpoint(s) of the specified folds.') parser.add_argument('--exp_source', type=str, default='experiments/toy_exp', help='specifies, from which source experiment to load configs and data_loader.') parser.add_argument('--no_benchmark', action='store_true', help="Do not use cudnn.benchmark.") parser.add_argument('-d', '--dev', default=False, action='store_true', help="development mode: shorten everything") args = parser.parse_args() folds = args.folds torch.backends.cudnn.benchmark = not args.no_benchmark if args.mode == 'train' or args.mode == 'train_test': cf = utils.prep_exp(args.exp_source, args.exp_dir, args.server_env, args.use_stored_settings) if args.dev: folds = [0,1] - cf.batch_size, cf.num_epochs, cf.min_save_thresh, cf.save_n_models = 3 if cf.dim==2 else 1, 1, 0, 1 + cf.batch_size, cf.num_epochs, cf.min_save_thresh, cf.save_n_models = 3 if cf.dim==2 else 1, 1, 0, 2 cf.num_train_batches, cf.num_val_batches, cf.max_val_patients = 5, 1, 1 cf.test_n_epochs = cf.save_n_models - cf.max_test_patients = 1 + cf.max_test_patients = 2 cf.data_dest = args.data_dest logger = utils.get_logger(cf.exp_dir, cf.server_env) logger.info("cudnn benchmark: {}, deterministic: {}.".format(torch.backends.cudnn.benchmark, torch.backends.cudnn.deterministic)) data_loader = utils.import_module('dl', os.path.join(args.exp_source, 'data_loader.py')) model = utils.import_module('model', cf.model_path) logger.info("loaded model from {}".format(cf.model_path)) if folds is None: folds = range(cf.n_cv_splits) for fold in folds: cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(fold)) cf.fold = fold cf.resume = args.resume if not os.path.exists(cf.fold_dir): os.mkdir(cf.fold_dir) logger.set_logfile(fold=fold) train(logger) cf.resume = False if args.mode == 'train_test': test(logger) elif 
args.mode == 'test': cf = utils.prep_exp(args.exp_source, args.exp_dir, args.server_env, is_training=False, use_stored_settings=True) if args.dev: folds = [0,1] - cf.test_n_epochs = 1; cf.max_test_patients = 1 + cf.test_n_epochs = 2; cf.max_test_patients = 2 cf.data_dest = args.data_dest logger = utils.get_logger(cf.exp_dir, cf.server_env) data_loader = utils.import_module('dl', os.path.join(args.exp_source, 'data_loader.py')) model = utils.import_module('model', cf.model_path) logger.info("loaded model from {}".format(cf.model_path)) if folds is None: folds = range(cf.n_cv_splits) for fold in folds: cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(fold)) cf.fold = fold logger.set_logfile(fold=fold) test(logger) # load raw predictions saved by predictor during testing, run aggregation algorithms and evaluation. elif args.mode == 'analysis': cf = utils.prep_exp(args.exp_source, args.exp_dir, args.server_env, is_training=False, use_stored_settings=True) logger = utils.get_logger(cf.exp_dir, cf.server_env) - if cf.hold_out_test_set: - cf.folds = args.folds + if args.dev: + cf.test_n_epochs = 2 + + if cf.hold_out_test_set and cf.ensemble_folds: + # create and save (unevaluated) predictions across all folds predictor = Predictor(cf, net=None, logger=logger, mode='analysis') results_list = predictor.load_saved_predictions(apply_wbc=True) utils.create_csv_output([(res_dict["boxes"], pid) for res_dict, pid in results_list], cf, logger) - logger.info('starting evaluation...') - cf.fold = "overall" + cf.fold = 'overall_hold_out' evaluator = Evaluator(cf, logger, mode='test') evaluator.evaluate_predictions(results_list) evaluator.score_test_df() + else: fold_dirs = sorted([os.path.join(cf.exp_dir, f) for f in os.listdir(cf.exp_dir) if os.path.isdir(os.path.join(cf.exp_dir, f)) and f.startswith("fold")]) if folds is None: folds = range(cf.n_cv_splits) for fold in folds: cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(fold)) cf.fold = fold 
logger.set_logfile(fold=fold) if cf.fold_dir in fold_dirs: predictor = Predictor(cf, net=None, logger=logger, mode='analysis') results_list = predictor.load_saved_predictions(apply_wbc=True) logger.info('starting evaluation...') evaluator = Evaluator(cf, logger, mode='test') evaluator.evaluate_predictions(results_list) evaluator.score_test_df() else: logger.info("Skipping fold {} since no model parameters found.".format(fold)) # create experiment folder and copy scripts without starting job. # useful for cloud deployment where configs might change before job actually runs. elif args.mode == 'create_exp': cf = utils.prep_exp(args.exp_source, args.exp_dir, args.server_env, use_stored_settings=False) logger = utils.get_logger(cf.exp_dir) logger.info('created experiment directory at {}'.format(cf.exp_dir)) else: raise RuntimeError('mode specified in args is not implemented...') mins, secs = divmod((time.time() - stime), 60) h, mins = divmod(mins, 60) t = "{:d}h:{:02d}m:{:02d}s".format(int(h), int(mins), int(secs)) logger.info("{} total runtime: {}".format(os.path.split(__file__)[1], t)) del logger \ No newline at end of file diff --git a/experiments/lidc_exp/configs.py b/experiments/lidc_exp/configs.py index 1bf3237..42d8d74 100644 --- a/experiments/lidc_exp/configs.py +++ b/experiments/lidc_exp/configs.py @@ -1,341 +1,341 @@ #!/usr/bin/env python # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ). # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== import sys import os sys.path.append(os.path.dirname(os.path.realpath(__file__))) import numpy as np from default_configs import DefaultConfigs class configs(DefaultConfigs): def __init__(self, server_env=None): ######################### # Preprocessing # ######################### self.root_dir = '/home/gregor/networkdrives/E130-Personal/Goetz/Datenkollektive/Lungendaten/Nodules_LIDC_IDRI' self.raw_data_dir = '{}/new_nrrd'.format(self.root_dir) self.pp_dir = '/media/gregor/HDD2TB/data/lidc/lidc_mdt' self.target_spacing = (0.7, 0.7, 1.25) ######################### # I/O # ######################### # one out of [2, 3]. dimension the model operates in. - self.dim = 2 + self.dim = 3 # one out of ['mrcnn', 'retina_net', 'retina_unet', 'detection_unet', 'ufrcnn']. self.model = 'retina_unet' DefaultConfigs.__init__(self, self.model, server_env, self.dim) # int [0 < dataset_size]. select n patients from dataset for prototyping. If None, all data is used. self.select_prototype_subset = None # path to preprocessed data. self.pp_name = 'lidc_mdt' self.input_df_name = 'info_df.pickle' self.pp_data_path = '/media/gregor/HDD2TB/data/lidc/{}'.format(self.pp_name) self.pp_test_data_path = self.pp_data_path #change if test_data in separate folder. # settings for deployment in cloud. if server_env: # path to preprocessed data. self.pp_name = 'lidc_mdt_npz' self.crop_name = 'pp_fg_slices_packed' self.pp_data_path = '/datasets/datasets_ramien/lidc_exp/data/{}'.format(self.pp_name) self.pp_test_data_path = self.pp_data_path self.select_prototype_subset = None ######################### # Data Loader # ######################### # select modalities from preprocessed data self.channels = [0] self.n_channels = len(self.channels) # patch_size to be used for training. pre_crop_size is the patch_size before data augmentation. 
self.pre_crop_size_2D = [300, 300] self.patch_size_2D = [288, 288] self.pre_crop_size_3D = [156, 156, 96] self.patch_size_3D = [128, 128, 64] self.patch_size = self.patch_size_2D if self.dim == 2 else self.patch_size_3D self.pre_crop_size = self.pre_crop_size_2D if self.dim == 2 else self.pre_crop_size_3D # ratio of free sampled batch elements before class balancing is triggered # (>0 to include "empty"/background patches.) self.batch_sample_slack = 0.2 # set 2D network to operate in 3D images. self.merge_2D_to_3D_preds = self.dim == 2 # feed +/- n neighbouring slices into channel dimension. set to None for no context. self.n_3D_context = None if self.n_3D_context is not None and self.dim == 2: self.n_channels *= (self.n_3D_context * 2 + 1) ######################### # Architecture # ######################### self.start_filts = 48 if self.dim == 2 else 18 self.end_filts = self.start_filts * 4 if self.dim == 2 else self.start_filts * 2 self.res_architecture = 'resnet50' # 'resnet101' , 'resnet50' self.norm = None # one of None, 'instance_norm', 'batch_norm' self.weight_decay = 1e-5 # one of 'xavier_uniform', 'xavier_normal', or 'kaiming_normal', None (=default = 'kaiming_uniform') self.weight_init = None ######################### # Schedule / Selection # ######################### self.num_epochs = 100 self.num_train_batches = 200 if self.dim == 2 else 300 self.batch_size = 20 if self.dim == 2 else 8 self.do_validation = True # decide whether to validate on entire patient volumes (like testing) or sampled patches (like training) # the former is morge accurate, while the latter is faster (depending on volume size) self.val_mode = 'val_sampling' # one of 'val_sampling' , 'val_patient' if self.val_mode == 'val_patient': self.max_val_patients = 50 # if 'None' iterates over entire val_set once. if self.val_mode == 'val_sampling': self.num_val_batches = 50 # set dynamic_lr_scheduling to True to apply LR scheduling with below settings. 
self.dynamic_lr_scheduling = True self.lr_decay_factor = 0.5 self.scheduling_patience = np.ceil(6000 / (self.num_train_batches * self.batch_size)) self.scheduling_criterion = 'malignant_ap' self.scheduling_mode = 'min' if "loss" in self.scheduling_criterion else 'max' ######################### # Testing / Plotting # ######################### # set the top-n-epochs to be saved for temporal averaging in testing. self.save_n_models = 5 self.test_n_epochs = 5 # set a minimum epoch number for saving in case of instabilities in the first phase of training. self.min_save_thresh = 1 if self.dim == 2 else 1 self.report_score_level = ['patient', 'rois'] # choose list from 'patient', 'rois' self.class_dict = {1: 'benign', 2: 'malignant'} # 0 is background. self.patient_class_of_interest = 2 # patient metrics are only plotted for one class. self.ap_match_ious = [0.1] # list of ious to be evaluated for ap-scoring. self.model_selection_criteria = ['malignant_ap', 'benign_ap'] # criteria to average over for saving epochs. self.min_det_thresh = 0.1 # minimum confidence value to select predictions for evaluation. # threshold for clustering predictions together (wcs = weighted cluster scoring). # needs to be >= the expected overlap of predictions coming from one model (typically NMS threshold). # if too high, preds of the same object are separate clusters. self.wcs_iou = 1e-5 self.plot_prediction_histograms = True self.plot_stat_curves = False ######################### # Data Augmentation # ######################### self.da_kwargs={ 'do_elastic_deform': True, 'alpha':(0., 1500.), 'sigma':(30., 50.), 'do_rotation':True, 'angle_x': (0., 2 * np.pi), 'angle_y': (0., 0), 'angle_z': (0., 0), 'do_scale': True, 'scale':(0.8, 1.1), 'random_crop':False, 'rand_crop_dist': (self.patch_size[0] / 2. - 3, self.patch_size[1] / 2. 
- 3), 'border_mode_data': 'constant', 'border_cval_data': 0, 'order_data': 1 } if self.dim == 3: self.da_kwargs['do_elastic_deform'] = False self.da_kwargs['angle_x'] = (0, 0.0) self.da_kwargs['angle_y'] = (0, 0.0) #must be 0!! self.da_kwargs['angle_z'] = (0., 2 * np.pi) ######################### # Add model specifics # ######################### {'detection_unet': self.add_det_unet_configs, 'mrcnn': self.add_mrcnn_configs, 'ufrcnn': self.add_mrcnn_configs, 'retina_net': self.add_mrcnn_configs, 'retina_unet': self.add_mrcnn_configs, }[self.model]() def add_det_unet_configs(self): self.learning_rate = [1e-4] * self.num_epochs # aggregation from pixel perdiction to object scores (connected component). One of ['max', 'median'] self.aggregation_operation = 'max' # max number of roi candidates to identify per batch element and class. self.n_roi_candidates = 10 if self.dim == 2 else 30 # loss mode: either weighted cross entropy ('wce'), batch-wise dice loss ('dice), or the sum of both ('dice_wce') self.seg_loss_mode = 'dice_wce' # if <1, false positive predictions in foreground are penalized less. self.fp_dice_weight = 1 if self.dim == 2 else 1 self.wce_weights = [0.3, 1, 1] self.detection_min_confidence = self.min_det_thresh # if 'True', loss distinguishes all classes, else only foreground vs. background (class agnostic). self.class_specific_seg_flag = True self.num_seg_classes = 3 if self.class_specific_seg_flag else 2 self.head_classes = self.num_seg_classes def add_mrcnn_configs(self): # learning rate is a list with one entry per epoch. self.learning_rate = [3e-4] * self.num_epochs # disable the re-sampling of mask proposals to original size for speed-up. # since evaluation is detection-driven (box-matching) and not instance segmentation-driven (iou-matching), # mask-outputs are optional. self.return_masks_in_val = True self.return_masks_in_test = False # set number of proposal boxes to plot after each epoch. 
self.n_plot_rpn_props = 5 if self.dim == 2 else 30 # number of classes for head networks: n_foreground_classes + 1 (background) self.head_classes = 3 # seg_classes hier refers to the first stage classifier (RPN) self.num_seg_classes = 2 # foreground vs. background # feature map strides per pyramid level are inferred from architecture. self.backbone_strides = {'xy': [4, 8, 16, 32], 'z': [1, 2, 4, 8]} # anchor scales are chosen according to expected object sizes in data set. Default uses only one anchor scale # per pyramid level. (outer list are pyramid levels (corresponding to BACKBONE_STRIDES), inner list are scales per level.) self.rpn_anchor_scales = {'xy': [[8], [16], [32], [64]], 'z': [[2], [4], [8], [16]]} # choose which pyramid levels to extract features from: P2: 0, P3: 1, P4: 2, P5: 3. self.pyramid_levels = [0, 1, 2, 3] # number of feature maps in rpn. typically lowered in 3D to save gpu-memory. self.n_rpn_features = 512 if self.dim == 2 else 128 # anchor ratios and strides per position in feature maps. self.rpn_anchor_ratios = [0.5, 1, 2] self.rpn_anchor_stride = 1 # Threshold for first stage (RPN) non-maximum suppression (NMS): LOWER == HARDER SELECTION self.rpn_nms_threshold = 0.7 if self.dim == 2 else 0.7 # loss sampling settings. self.rpn_train_anchors_per_image = 32 #per batch element self.train_rois_per_image = 6 #per batch element self.roi_positive_ratio = 0.5 self.anchor_matching_iou = 0.7 # factor of top-k candidates to draw from per negative sample (stochastic-hard-example-mining). # poolsize to draw top-k candidates from will be shem_poolsize * n_negative_samples. 
self.shem_poolsize = 10 self.pool_size = (7, 7) if self.dim == 2 else (7, 7, 3) self.mask_pool_size = (14, 14) if self.dim == 2 else (14, 14, 5) self.mask_shape = (28, 28) if self.dim == 2 else (28, 28, 10) self.rpn_bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2]) self.bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2]) self.window = np.array([0, 0, self.patch_size[0], self.patch_size[1], 0, self.patch_size_3D[2]]) self.scale = np.array([self.patch_size[0], self.patch_size[1], self.patch_size[0], self.patch_size[1], self.patch_size_3D[2], self.patch_size_3D[2]]) if self.dim == 2: self.rpn_bbox_std_dev = self.rpn_bbox_std_dev[:4] self.bbox_std_dev = self.bbox_std_dev[:4] self.window = self.window[:4] self.scale = self.scale[:4] # pre-selection in proposal-layer (stage 1) for NMS-speedup. applied per batch element. self.pre_nms_limit = 3000 if self.dim == 2 else 6000 # n_proposals to be selected after NMS per batch element. too high numbers blow up memory if "detect_while_training" is True, # since proposals of the entire batch are forwarded through second stage in as one "batch". self.roi_chunk_size = 2500 if self.dim == 2 else 600 self.post_nms_rois_training = 500 if self.dim == 2 else 75 self.post_nms_rois_inference = 500 # Final selection of detections (refine_detections) self.model_max_instances_per_batch_element = 10 if self.dim == 2 else 30 # per batch element and class. self.detection_nms_threshold = 1e-5 # needs to be > 0, otherwise all predictions are one cluster. 
self.model_min_confidence = 0.1 if self.dim == 2: self.backbone_shapes = np.array( [[int(np.ceil(self.patch_size[0] / stride)), int(np.ceil(self.patch_size[1] / stride))] for stride in self.backbone_strides['xy']]) else: self.backbone_shapes = np.array( [[int(np.ceil(self.patch_size[0] / stride)), int(np.ceil(self.patch_size[1] / stride)), int(np.ceil(self.patch_size[2] / stride_z))] for stride, stride_z in zip(self.backbone_strides['xy'], self.backbone_strides['z'] )]) if self.model == 'ufrcnn': self.operate_stride1 = True self.class_specific_seg_flag = True self.num_seg_classes = 3 if self.class_specific_seg_flag else 2 self.frcnn_mode = True if self.model == 'retina_net' or self.model == 'retina_unet' or self.model == 'prob_detector': # implement extra anchor-scales according to retina-net publication. self.rpn_anchor_scales['xy'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in self.rpn_anchor_scales['xy']] self.rpn_anchor_scales['z'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in self.rpn_anchor_scales['z']] self.n_anchors_per_pos = len(self.rpn_anchor_ratios) * 3 self.n_rpn_features = 256 if self.dim == 2 else 64 # pre-selection of detections for NMS-speedup. per entire batch. self.pre_nms_limit = 10000 if self.dim == 2 else 50000 # anchor matching iou is lower than in Mask R-CNN according to https://arxiv.org/abs/1708.02002 self.anchor_matching_iou = 0.5 # if 'True', seg loss distinguishes all classes, else only foreground vs. background (class agnostic). self.num_seg_classes = 3 if self.class_specific_seg_flag else 2 if self.model == 'retina_unet': self.operate_stride1 = True diff --git a/experiments/toy_exp/configs.py b/experiments/toy_exp/configs.py index 807cf1c..f907da1 100644 --- a/experiments/toy_exp/configs.py +++ b/experiments/toy_exp/configs.py @@ -1,351 +1,350 @@ #!/usr/bin/env python # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ). 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== import sys import os sys.path.append(os.path.dirname(os.path.realpath(__file__))) import numpy as np from default_configs import DefaultConfigs class configs(DefaultConfigs): def __init__(self, server_env=None): ######################### # Preprocessing # ######################### self.root_dir = '/home/gregor/datasets/toy_mdt' ######################### # I/O # ######################### - # one out of [2, 3]. dimension the model operates in. self.dim = 2 # one out of ['mrcnn', 'retina_net', 'retina_unet', 'detection_unet', 'ufrcnn']. self.model = 'mrcnn' DefaultConfigs.__init__(self, self.model, server_env, self.dim) # int [0 < dataset_size]. select n patients from dataset for prototyping. self.select_prototype_subset = None self.hold_out_test_set = True # including val set. will be 3/4 train, 1/4 val. self.n_train_val_data = 2500 # choose one of the 3 toy experiments described in https://arxiv.org/pdf/1811.08661.pdf # one of ['donuts_shape', 'donuts_pattern', 'circles_scale']. toy_mode = 'donuts_shape_noise' # path to preprocessed data. self.input_df_name = 'info_df.pickle' self.pp_name = os.path.join(toy_mode, 'train') self.pp_data_path = os.path.join(self.root_dir, self.pp_name) self.pp_test_name = os.path.join(toy_mode, 'test') self.pp_test_data_path = os.path.join(self.root_dir, self.pp_test_name) # settings for deployment in cloud. 
if server_env: # path to preprocessed data. pp_root_dir = '/datasets/datasets_ramien/toy_exp/data' self.pp_name = os.path.join(toy_mode, 'train') self.pp_data_path = os.path.join(pp_root_dir, self.pp_name) self.pp_test_name = os.path.join(toy_mode, 'test') self.pp_test_data_path = os.path.join(pp_root_dir, self.pp_test_name) self.select_prototype_subset = None ######################### # Data Loader # ######################### # select modalities from preprocessed data self.channels = [0] self.n_channels = len(self.channels) # patch_size to be used for training. pre_crop_size is the patch_size before data augmentation. self.pre_crop_size_2D = [320, 320] self.patch_size_2D = [320, 320] self.patch_size = self.patch_size_2D if self.dim == 2 else self.patch_size_3D self.pre_crop_size = self.pre_crop_size_2D if self.dim == 2 else self.pre_crop_size_3D # ratio of free sampled batch elements before class balancing is triggered # (>0 to include "empty"/background patches.) self.batch_sample_slack = 0.2 # set 2D network to operate in 3D images. self.merge_2D_to_3D_preds = False # feed +/- n neighbouring slices into channel dimension. set to None for no context. 
self.n_3D_context = None if self.n_3D_context is not None and self.dim == 2: self.n_channels *= (self.n_3D_context * 2 + 1) ######################### # Architecture # ######################### self.start_filts = 48 if self.dim == 2 else 18 self.end_filts = self.start_filts * 4 if self.dim == 2 else self.start_filts * 2 self.res_architecture = 'resnet50' # 'resnet101' , 'resnet50' self.norm = None # one of None, 'instance_norm', 'batch_norm' self.weight_decay = 3e-5 # one of 'xavier_uniform', 'xavier_normal', or 'kaiming_normal', None (=default = 'kaiming_uniform') self.weight_init = None ######################### # Schedule / Selection # ######################### self.num_epochs = 24 self.num_train_batches = 100 if self.dim == 2 else 200 self.batch_size = 20 if self.dim == 2 else 8 self.do_validation = True # decide whether to validate on entire patient volumes (like testing) or sampled patches (like training) # the former is morge accurate, while the latter is faster (depending on volume size) self.val_mode = 'val_patient' # one of 'val_sampling' , 'val_patient' if self.val_mode == 'val_patient': self.max_val_patients = None # if 'None' iterates over entire val_set once. if self.val_mode == 'val_sampling': self.num_val_batches = 50 # set dynamic_lr_scheduling to True to apply LR scheduling with below settings. self.dynamic_lr_scheduling = True self.lr_decay_factor = 0.5 self.scheduling_patience = np.ceil(3600 / (self.num_train_batches * self.batch_size)) self.scheduling_criterion = 'malignant_ap' self.scheduling_mode = 'min' if "loss" in self.scheduling_criterion else 'max' ######################### # Testing / Plotting # ######################### # set the top-n-epochs to be saved for temporal averaging in testing. self.save_n_models = 5 self.test_n_epochs = 5 # set a minimum epoch number for saving in case of instabilities in the first phase of training. 
self.min_save_thresh = 0 if self.dim == 2 else 0 self.report_score_level = ['patient', 'rois'] # choose list from 'patient', 'rois' self.class_dict = {1: 'benign', 2: 'malignant'} # 0 is background. self.patient_class_of_interest = 2 # patient metrics are only plotted for one class. self.ap_match_ious = [0.1] # list of ious to be evaluated for ap-scoring. self.model_selection_criteria = ['benign_ap', 'malignant_ap'] # criteria to average over for saving epochs. self.min_det_thresh = 0.1 # minimum confidence value to select predictions for evaluation. # threshold for clustering predictions together (wcs = weighted cluster scoring). # needs to be >= the expected overlap of predictions coming from one model (typically NMS threshold). # if too high, preds of the same object are separate clusters. self.wcs_iou = 1e-5 self.plot_prediction_histograms = True self.plot_stat_curves = False ######################### # Data Augmentation # ######################### self.da_kwargs={ 'do_elastic_deform': True, 'alpha':(0., 1500.), 'sigma':(30., 50.), 'do_rotation':True, 'angle_x': (0., 2 * np.pi), 'angle_y': (0., 0), 'angle_z': (0., 0), 'do_scale': True, 'scale':(0.8, 1.1), 'random_crop':False, 'rand_crop_dist': (self.patch_size[0] / 2. - 3, self.patch_size[1] / 2. - 3), 'border_mode_data': 'constant', 'border_cval_data': 0, 'order_data': 1 } if self.dim == 3: self.da_kwargs['do_elastic_deform'] = False self.da_kwargs['angle_x'] = (0, 0.0) self.da_kwargs['angle_y'] = (0, 0.0) #must be 0!! 
self.da_kwargs['angle_z'] = (0., 2 * np.pi) ######################### # Add model specifics # ######################### {'detection_unet': self.add_det_unet_configs, 'mrcnn': self.add_mrcnn_configs, 'ufrcnn': self.add_mrcnn_configs, 'ufrcnn_surrounding': self.add_mrcnn_configs, 'retina_net': self.add_mrcnn_configs, 'retina_unet': self.add_mrcnn_configs, 'prob_detector': self.add_mrcnn_configs, }[self.model]() def add_det_unet_configs(self): self.learning_rate = [1e-4] * self.num_epochs # aggregation from pixel perdiction to object scores (connected component). One of ['max', 'median'] self.aggregation_operation = 'max' # max number of roi candidates to identify per image (slice in 2D, volume in 3D) self.n_roi_candidates = 3 if self.dim == 2 else 8 # loss mode: either weighted cross entropy ('wce'), batch-wise dice loss ('dice), or the sum of both ('dice_wce') self.seg_loss_mode = 'dice_wce' # if <1, false positive predictions in foreground are penalized less. self.fp_dice_weight = 1 if self.dim == 2 else 1 self.wce_weights = [0.3, 1, 1] self.detection_min_confidence = self.min_det_thresh # if 'True', loss distinguishes all classes, else only foreground vs. background (class agnostic). self.class_specific_seg_flag = True self.num_seg_classes = 3 if self.class_specific_seg_flag else 2 self.head_classes = self.num_seg_classes def add_mrcnn_configs(self): # learning rate is a list with one entry per epoch. self.learning_rate = [3e-4] * self.num_epochs # disable mask head loss. (e.g. if no pixelwise annotations available) self.frcnn_mode = False # disable the re-sampling of mask proposals to original size for speed-up. # since evaluation is detection-driven (box-matching) and not instance segmentation-driven (iou-matching), # mask-outputs are optional. self.return_masks_in_val = True self.return_masks_in_test = False # set number of proposal boxes to plot after each epoch. 
self.n_plot_rpn_props = 0 if self.dim == 2 else 0 # number of classes for head networks: n_foreground_classes + 1 (background) self.head_classes = 3 # seg_classes hier refers to the first stage classifier (RPN) self.num_seg_classes = 2 # foreground vs. background # feature map strides per pyramid level are inferred from architecture. self.backbone_strides = {'xy': [4, 8, 16, 32], 'z': [1, 2, 4, 8]} # anchor scales are chosen according to expected object sizes in data set. Default uses only one anchor scale # per pyramid level. (outer list are pyramid levels (corresponding to BACKBONE_STRIDES), inner list are scales per level.) self.rpn_anchor_scales = {'xy': [[8], [16], [32], [64]], 'z': [[2], [4], [8], [16]]} # choose which pyramid levels to extract features from: P2: 0, P3: 1, P4: 2, P5: 3. self.pyramid_levels = [0, 1, 2, 3] # number of feature maps in rpn. typically lowered in 3D to save gpu-memory. self.n_rpn_features = 512 if self.dim == 2 else 128 # anchor ratios and strides per position in feature maps. self.rpn_anchor_ratios = [0.5, 1., 2.] self.rpn_anchor_stride = 1 # Threshold for first stage (RPN) non-maximum suppression (NMS): LOWER == HARDER SELECTION self.rpn_nms_threshold = 0.7 if self.dim == 2 else 0.7 # loss sampling settings. self.rpn_train_anchors_per_image = 64 #per batch element self.train_rois_per_image = 2 #per batch element self.roi_positive_ratio = 0.5 self.anchor_matching_iou = 0.7 # factor of top-k candidates to draw from per negative sample (stochastic-hard-example-mining). # poolsize to draw top-k candidates from will be shem_poolsize * n_negative_samples. 
self.shem_poolsize = 10 self.pool_size = (7, 7) if self.dim == 2 else (7, 7, 3) self.mask_pool_size = (14, 14) if self.dim == 2 else (14, 14, 5) self.mask_shape = (28, 28) if self.dim == 2 else (28, 28, 10) self.rpn_bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2]) self.bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2]) self.window = np.array([0, 0, self.patch_size[0], self.patch_size[1]]) self.scale = np.array([self.patch_size[0], self.patch_size[1], self.patch_size[0], self.patch_size[1]]) if self.dim == 2: self.rpn_bbox_std_dev = self.rpn_bbox_std_dev[:4] self.bbox_std_dev = self.bbox_std_dev[:4] self.window = self.window[:4] self.scale = self.scale[:4] # pre-selection in proposal-layer (stage 1) for NMS-speedup. applied per batch element. self.pre_nms_limit = 3000 if self.dim == 2 else 6000 # n_proposals to be selected after NMS per batch element. too high numbers blow up memory if "detect_while_training" is True, # since proposals of the entire batch are forwarded through second stage in as one "batch". self.roi_chunk_size = 800 if self.dim == 2 else 600 self.post_nms_rois_training = 500 if self.dim == 2 else 75 self.post_nms_rois_inference = 500 # Final selection of detections (refine_detections) self.model_max_instances_per_batch_element = 10 if self.dim == 2 else 30 # per batch element and class. self.detection_nms_threshold = 1e-5 # needs to be > 0, otherwise all predictions are one cluster. 
self.model_min_confidence = 0.1 if self.dim == 2: self.backbone_shapes = np.array( [[int(np.ceil(self.patch_size[0] / stride)), int(np.ceil(self.patch_size[1] / stride))] for stride in self.backbone_strides['xy']]) else: self.backbone_shapes = np.array( [[int(np.ceil(self.patch_size[0] / stride)), int(np.ceil(self.patch_size[1] / stride)), int(np.ceil(self.patch_size[2] / stride_z))] for stride, stride_z in zip(self.backbone_strides['xy'], self.backbone_strides['z'] )]) if self.model == 'ufrcnn': self.operate_stride1 = True self.class_specific_seg_flag = True self.num_seg_classes = 3 if self.class_specific_seg_flag else 2 self.frcnn_mode = True if self.model == 'retina_net' or self.model == 'retina_unet' or self.model == 'prob_detector': # implement extra anchor-scales according to retina-net publication. self.rpn_anchor_scales['xy'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in self.rpn_anchor_scales['xy']] self.rpn_anchor_scales['z'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in self.rpn_anchor_scales['z']] self.n_anchors_per_pos = len(self.rpn_anchor_ratios) * 3 self.n_rpn_features = 256 if self.dim == 2 else 64 # pre-selection of detections for NMS-speedup. per entire batch. self.pre_nms_limit = 10000 if self.dim == 2 else 50000 # anchor matching iou is lower than in Mask R-CNN according to https://arxiv.org/abs/1708.02002 self.anchor_matching_iou = 0.5 # if 'True', seg loss distinguishes all classes, else only foreground vs. background (class agnostic). self.num_seg_classes = 3 if self.class_specific_seg_flag else 2 if self.model == 'retina_unet': self.operate_stride1 = True diff --git a/predictor.py b/predictor.py index 695cb10..7908353 100644 --- a/predictor.py +++ b/predictor.py @@ -1,868 +1,869 @@ #!/usr/bin/env python # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ). 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== import os import numpy as np import torch from scipy.stats import norm from collections import OrderedDict from multiprocessing import Pool import pickle import pandas as pd from plotting import plot_batch_prediction class Predictor: """ Prediction pipeline: - receives a patched patient image (n_patches, c, y, x, (z)) from patient data loader. - forwards patches through model in chunks of batch_size. (method: batch_tiling_forward) - unmolds predictions (boxes and segmentations) to original patient coordinates. (method: spatial_tiling_forward) Ensembling (mode == 'test'): - for inference, forwards 4 mirrored versions of image to through model and unmolds predictions afterwards accordingly (method: data_aug_forward) - for inference, loads multiple parameter-sets of the trained model corresponding to different epochs. for each parameter-set loops over entire test set, runs prediction pipeline for each patient. (method: predict_test_set) Consolidation of predictions: - consolidates a patient's predictions (boxes, segmentations) collected over patches, data_aug- and temporal ensembling, performs clustering and weighted averaging (external function: apply_wbc_to_patient) to obtain consistent outptus. - for 2D networks, consolidates box predictions to 3D cubes via clustering (adaption of non-maximum surpression). 
(external function: merge_2D_to_3D_preds_per_patient) Ground truth handling: - dissmisses any ground truth boxes returned by the model (happens in validation mode, patch-based groundtruth) - if provided by data loader, adds 3D ground truth to the final predictions to be passed to the evaluator. """ def __init__(self, cf, net, logger, mode): self.cf = cf self.logger = logger # mode is 'val' for patient-based validation/monitoring and 'test' for inference. self.mode = mode # model instance. In validation mode, contains parameters of current epoch. self.net = net # rank of current epoch loaded (for temporal averaging). this info is added to each prediction, # for correct weighting during consolidation. self.rank_ix = '0' # number of ensembled models. used to calculate the number of expected predictions per position # during consolidation of predictions. Default is 1 (no ensembling, e.g. in validation). self.n_ens = 1 if self.mode == 'test': try: self.epoch_ranking = np.load(os.path.join(self.cf.fold_dir, 'epoch_ranking.npy'))[:cf.test_n_epochs] except: raise RuntimeError('no epoch ranking file in fold directory. ' 'seems like you are trying to run testing without prior training...') self.n_ens = cf.test_n_epochs if self.cf.test_aug: self.n_ens *= 4 self.example_plot_dir = os.path.join(cf.test_dir, "example_plots") os.makedirs(self.example_plot_dir, exist_ok=True) def predict_patient(self, batch): """ predicts one patient. called either directly via loop over validation set in exec.py (mode=='val') or from self.predict_test_set (mode=='test). in val mode: adds 3D ground truth info to predictions and runs consolidation and 2Dto3D merging of predictions. in test mode: returns raw predictions (ground truth addition, consolidation, 2D to 3D merging are done in self.predict_test_set, because patient predictions across several epochs might be needed to be collected first, in case of temporal ensembling). :return. results_dict: stores the results for one patient. 
dictionary with keys: - 'boxes': list over batch elements. each element is a list over boxes, where each box is one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions (if not merged to 3D), and a dummy batch dimension of 1 for 3D predictions. - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z)) - losses (only in validation mode) """ #self.logger.info('\revaluating patient {} for fold {} '.format(batch['pid'], self.cf.fold)) print('\revaluating patient {} for fold {} '.format(batch['pid'], self.cf.fold), end="", flush=True) # True if patient is provided in patches and predictions need to be tiled. self.patched_patient = True if 'patch_crop_coords' in list(batch.keys()) else False # forward batch through prediction pipeline. results_dict = self.data_aug_forward(batch) if self.mode == 'val': for b in range(batch['patient_bb_target'].shape[0]): for t in range(len(batch['patient_bb_target'][b])): results_dict['boxes'][b].append({'box_coords': batch['patient_bb_target'][b][t], 'box_label': batch['patient_roi_labels'][b][t], 'box_type': 'gt'}) if self.patched_patient: wcs_input = [results_dict['boxes'], 'dummy_pid', self.cf.class_dict, self.cf.wcs_iou, self.n_ens] results_dict['boxes'] = apply_wbc_to_patient(wcs_input)[0] if self.cf.merge_2D_to_3D_preds: merge_dims_inputs = [results_dict['boxes'], 'dummy_pid', self.cf.class_dict, self.cf.merge_3D_iou] results_dict['boxes'] = merge_2D_to_3D_preds_per_patient(merge_dims_inputs)[0] return results_dict def predict_test_set(self, batch_gen, return_results=True): """ wrapper around test method, which loads multiple (or one) epoch parameters (temporal ensembling), loops through the test set and collects predictions per patient. Also flattens the results per patient and epoch and adds optional ground truth boxes for evaluation. Saves out the raw result list for later analysis and optionally consolidates and returns predictions immediately. 
    def predict_test_set(self, batch_gen, return_results=True):
        """
        wrapper around test method, which loads multiple (or one) epoch parameters (temporal ensembling), loops through
        the test set and collects predictions per patient. Also flattens the results per patient and epoch
        and adds optional ground truth boxes for evaluation. Saves out the raw result list for later analysis and
        optionally consolidates and returns predictions immediately.

        :param batch_gen: dictionary read with the keys 'test' (iterator yielding one patient batch per next() call)
        and 'n_test' (number of batches drawn per parameter set).
        :param return_results: if True, consolidates the predictions (weighted box clustering and, if configured,
        2D to 3D merging) and returns them. if False, only the raw predictions are saved and nothing is returned.
        :return: (optionally) list_of_results_per_patient: list over patient results. each entry is a
        [results_dict, pid] pair, where results_dict has the keys:
                 - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                            one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions
                            (if not merged to 3D), and a dummy batch dimension of 1 for 3D predictions.
                 - 'seg_preds': not implemented yet. todo for evaluation of instance/semantic segmentation.
        """
        dict_of_patient_results = OrderedDict()

        # get paths of all parameter sets to be loaded for temporal ensembling. (or just one for no temp. ensembling).
        weight_paths = [os.path.join(self.cf.fold_dir, '{}_best_checkpoint'.format(epoch), 'params.pth') for epoch in self.epoch_ranking]
        # at most one example batch is plotted per parameter set.
        n_test_plots = min(batch_gen['n_test'], 1)

        for rank_ix, weight_path in enumerate(weight_paths):

            self.logger.info(('tmp ensembling over rank_ix:{} epoch:{}'.format(rank_ix, weight_path)))
            self.net.load_state_dict(torch.load(weight_path))
            self.net.eval()
            self.rank_ix = str(rank_ix)  # get string of current rank for unique patch ids.
            # randomly pick the index of the batch to be plotted for this parameter set.
            plot_batches = np.random.choice(np.arange(batch_gen['n_test']), size=n_test_plots, replace=False)

            with torch.no_grad():
                for i in range(batch_gen['n_test']):

                    batch = next(batch_gen['test'])

                    # store batch info in patient entry of results dict.
                    # entries are only created for the first parameter set. later parameter sets append to them,
                    # which presumably requires the generator to yield the same patients again - TODO confirm.
                    if rank_ix == 0:
                        dict_of_patient_results[batch['pid']] = {}
                        dict_of_patient_results[batch['pid']]['results_dicts'] = []
                        dict_of_patient_results[batch['pid']]['patient_bb_target'] = batch['patient_bb_target']
                        dict_of_patient_results[batch['pid']]['patient_roi_labels'] = batch['patient_roi_labels']

                    # call prediction pipeline and store results in dict.
                    results_dict = self.predict_patient(batch)
                    dict_of_patient_results[batch['pid']]['results_dicts'].append({"boxes": results_dict['boxes']})

                    # plotting is skipped for patched patients unless the batch carries 'patient_data'.
                    if i in plot_batches and (not self.patched_patient or 'patient_data' in batch.keys()):
                        try:
                            # view qualitative results of random test case
                            out_file = os.path.join(self.example_plot_dir, 'batch_example_test_{}_rank_{}.png'.format(self.cf.fold, rank_ix))
                            plot_batch_prediction(batch, results_dict, self.cf, outfile=out_file)
                        except Exception as e:
                            # plotting is best-effort: failures are logged and prediction continues.
                            self.logger.info("WARNING: error in plotting example test batch: {}".format(e))

        self.logger.info('finished predicting test set. starting post-processing of predictions.')
        results_per_patient = []

        # loop over patients again to flatten results across epoch predictions.
        # if provided, add ground truth boxes for evaluation.
        for pid, p_dict in dict_of_patient_results.items():

            tmp_ens_list = p_dict['results_dicts']
            results_dict = {}
            # collect all boxes/seg_preds of same batch_instance over temporal instances.
            b_size = len(tmp_ens_list[0]["boxes"])
            results_dict['boxes'] = [[item for rank_dict in tmp_ens_list for item in rank_dict["boxes"][batch_instance]]
                                     for batch_instance in range(b_size)]

            # TODO return for instance segmentation:
            # results_dict['seg_preds'] = np.mean(results_dict['seg_preds'], 1)[:, None]
            # results_dict['seg_preds'] = np.array([[item for d in tmp_ens_list for item in d['seg_preds'][batch_instance]]
            #                                       for batch_instance in range(len(tmp_ens_list[0]['boxes']))])

            # add 3D ground truth boxes for evaluation.
            for b in range(p_dict['patient_bb_target'].shape[0]):
                for t in range(len(p_dict['patient_bb_target'][b])):
                    results_dict['boxes'][b].append({'box_coords': p_dict['patient_bb_target'][b][t],
                                                     'box_label': p_dict['patient_roi_labels'][b][t],
                                                     'box_type': 'gt'})
            results_per_patient.append([results_dict, pid])

        # save out raw predictions.
        out_string = 'raw_pred_boxes_hold_out_list' if self.cf.hold_out_test_set else 'raw_pred_boxes_list'
        with open(os.path.join(self.cf.fold_dir, '{}.pickle'.format(out_string)), 'wb') as handle:
            pickle.dump(results_per_patient, handle)

        if return_results:
            final_patient_box_results = [(res_dict["boxes"], pid) for res_dict, pid in results_per_patient]
            # consolidate predictions.
            self.logger.info('applying wcs to test set predictions with iou = {} and n_ens = {}.'.format(
                self.cf.wcs_iou, self.n_ens))
            # NOTE(review): the pool is only closed on the success path. if pool.map raises,
            # close()/join() below are never reached.
            pool = Pool(processes=6)
            mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.wcs_iou, self.n_ens] for ii in final_patient_box_results]
            final_patient_box_results = pool.map(apply_wbc_to_patient, mp_inputs, chunksize=1)
            pool.close()
            pool.join()

            # merge 2D boxes to 3D cubes. (if model predicts 2D but evaluation is run in 3D)
            if self.cf.merge_2D_to_3D_preds:
                self.logger.info('applying 2Dto3D merging to test set predictions with iou = {}.'.format(self.cf.merge_3D_iou))
                pool = Pool(processes=6)
                mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.merge_3D_iou] for ii in final_patient_box_results]
                final_patient_box_results = pool.map(merge_2D_to_3D_preds_per_patient, mp_inputs, chunksize=1)
                pool.close()
                pool.join()

            # final_patient_box_results holds [avg_boxes, pid] if wbc
            # write the consolidated boxes back into the per-patient results. pool.map preserves input order,
            # the assert double-checks that the patient ids still line up.
            for ix in range(len(results_per_patient)):
                assert results_per_patient[ix][1] == final_patient_box_results[ix][1], "should be same pid"
                results_per_patient[ix][0]["boxes"] = final_patient_box_results[ix][0]

            return results_per_patient
batch elements are slices for 2D predictions (if not merged to 3D), and a dummy batch dimension of 1 for 3D predictions. - 'seg_preds': not implemented yet. todo for evaluation of instance/semantic segmentation. """ # load predictions for a single test-set fold. - if not self.cf.hold_out_test_set: - with open(os.path.join(self.cf.fold_dir, 'raw_pred_boxes_list.pickle'), 'rb') as handle: + results_file = 'raw_pred_boxes_hold_out_list.pickle' if self.cf.hold_out_test_set else 'raw_pred_boxes_list.pickle' + if not self.cf.hold_out_test_set or not self.cf.ensemble_folds: + with open(os.path.join(self.cf.fold_dir, results_file), 'rb') as handle: results_list = pickle.load(handle) box_results_list = [(res_dict["boxes"], pid) for res_dict, pid in results_list] da_factor = 4 if self.cf.test_aug else 1 n_ens = self.cf.test_n_epochs * da_factor self.logger.info('loaded raw test set predictions with n_patients = {} and n_ens = {}'.format( len(results_list), n_ens)) # if hold out test set was perdicted, aggregate predictions of all trained models # corresponding to all CV-folds and flatten them. 
else: - self.logger.info("loading saved predictions of hold-out test set") + self.logger.info("loading saved predictions of hold-out test set and ensembling over folds.") fold_dirs = sorted([os.path.join(self.cf.exp_dir, f) for f in os.listdir(self.cf.exp_dir) if os.path.isdir(os.path.join(self.cf.exp_dir, f)) and f.startswith("fold")]) results_list = [] folds_loaded = 0 for fold in range(self.cf.n_cv_splits): fold_dir = os.path.join(self.cf.exp_dir, 'fold_{}'.format(fold)) if fold_dir in fold_dirs: - with open(os.path.join(fold_dir, 'raw_pred_boxes_hold_out_list.pickle'), 'rb') as handle: + with open(os.path.join(fold_dir, results_file), 'rb') as handle: fold_list = pickle.load(handle) results_list += fold_list folds_loaded += 1 else: self.logger.info("Skipping fold {} since no saved predictions found.".format(fold)) box_results_list = [] for res_dict, pid in results_list: #without filtering gt out: box_results_list.append((res_dict['boxes'], pid)) da_factor = 4 if self.cf.test_aug else 1 n_ens = self.cf.test_n_epochs * da_factor * folds_loaded # consolidate predictions. 
if apply_wbc: self.logger.info('applying wcs to test set predictions with iou = {} and n_ens = {}.'.format( self.cf.wcs_iou, n_ens)) pool = Pool(processes=6) mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.wcs_iou, n_ens] for ii in box_results_list] box_results_list = pool.map(apply_wbc_to_patient, mp_inputs, chunksize=1) pool.close() pool.join() # merge 2D box predictions to 3D cubes (if model predicts 2D but evaluation is run in 3D) if self.cf.merge_2D_to_3D_preds: self.logger.info( 'applying 2Dto3D merging to test set predictions with iou = {}.'.format(self.cf.merge_3D_iou)) pool = Pool(processes=6) mp_inputs = [[ii[0], ii[1], self.cf.class_dict, self.cf.merge_3D_iou] for ii in box_results_list] box_results_list = pool.map(merge_2D_to_3D_preds_per_patient, mp_inputs, chunksize=1) pool.close() pool.join() for ix in range(len(results_list)): assert np.all( results_list[ix][1] == box_results_list[ix][1]), "pid mismatch between loaded and aggregated results" results_list[ix][0]["boxes"] = box_results_list[ix][0] return results_list # holds (results_dict, pid) def data_aug_forward(self, batch): """ in val_mode: passes batch through to spatial_tiling method without data_aug. in test_mode: if cf.test_aug is set in configs, createst 4 mirrored versions of the input image, passes all of them to the next processing step (spatial_tiling method) and re-transforms returned predictions to original image version. :return. results_dict: stores the results for one patient. dictionary with keys: - 'boxes': list over batch elements. each element is a list over boxes, where each box is one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions, and a dummy batch dimension of 1 for 3D predictions. - 'seg_preds': pixel-wise predictions. 
    def data_aug_forward(self, batch):
        """
        in val_mode: passes batch through to spatial_tiling method without data_aug.
        in test_mode: if cf.test_aug is set in configs, creates 3 additional mirrored versions of the input image
        (flipped along array axis 2, along axis 3, and along both), passes all of them to the next processing step
        (spatial_tiling method) and re-transforms the returned predictions to the original image orientation.
        batch['data'] is temporarily replaced by the mirrored images and restored to a copy of the original
        image afterwards.

        :param batch: dictionary holding the patient data. read here: 'data', 'original_img_shape' and,
        for patched patients, 'patch_crop_coords'.
        :return. results_dict: stores the results for one patient. dictionary with keys:
                 - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                            one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions,
                            and a dummy batch dimension of 1 for 3D predictions.
                 - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z))
                 - losses (only in validation mode)
        """
        patch_crops = batch['patch_crop_coords'] if self.patched_patient else None
        # the first entry always holds the predictions on the un-mirrored image.
        results_list = [self.spatial_tiling_forward(batch, patch_crops)]
        org_img_shape = batch['original_img_shape']

        if self.mode == 'test' and self.cf.test_aug:

            if self.patched_patient:
                # apply mirror transformations to patch-crop coordinates, for correct tiling in spatial_tiling method.
                mirrored_patch_crops = get_mirrored_patch_crops(patch_crops, batch['original_img_shape'])
            else:
                mirrored_patch_crops = [None] * 3

            # keep an untouched copy, since batch['data'] is overwritten for every mirrored forward pass.
            img = np.copy(batch['data'])

            # first mirroring: y-axis.
            batch['data'] = np.flip(img, axis=2).copy()
            chunk_dict = self.spatial_tiling_forward(batch, mirrored_patch_crops[0], n_aug='1')
            # re-transform coordinates.
            # mirroring maps a coordinate v to (shape - v), which also swaps the lower and upper box border.
            for ix in range(len(chunk_dict['boxes'])):
                for boxix in range(len(chunk_dict['boxes'][ix])):
                    coords = chunk_dict['boxes'][ix][boxix]['box_coords'].copy()
                    coords[0] = org_img_shape[2] - chunk_dict['boxes'][ix][boxix]['box_coords'][2]
                    coords[2] = org_img_shape[2] - chunk_dict['boxes'][ix][boxix]['box_coords'][0]
                    assert coords[2] >= coords[0], [coords, chunk_dict['boxes'][ix][boxix]['box_coords'].copy()]
                    assert coords[3] >= coords[1], [coords, chunk_dict['boxes'][ix][boxix]['box_coords'].copy()]
                    chunk_dict['boxes'][ix][boxix]['box_coords'] = coords
            # re-transform segmentation predictions.
            chunk_dict['seg_preds'] = np.flip(chunk_dict['seg_preds'], axis=2)
            results_list.append(chunk_dict)

            # second mirroring: x-axis.
            batch['data'] = np.flip(img, axis=3).copy()
            chunk_dict = self.spatial_tiling_forward(batch, mirrored_patch_crops[1], n_aug='2')
            # re-transform coordinates.
            for ix in range(len(chunk_dict['boxes'])):
                for boxix in range(len(chunk_dict['boxes'][ix])):
                    coords = chunk_dict['boxes'][ix][boxix]['box_coords'].copy()
                    coords[1] = org_img_shape[3] - chunk_dict['boxes'][ix][boxix]['box_coords'][3]
                    coords[3] = org_img_shape[3] - chunk_dict['boxes'][ix][boxix]['box_coords'][1]
                    assert coords[2] >= coords[0], [coords, chunk_dict['boxes'][ix][boxix]['box_coords'].copy()]
                    assert coords[3] >= coords[1], [coords, chunk_dict['boxes'][ix][boxix]['box_coords'].copy()]
                    chunk_dict['boxes'][ix][boxix]['box_coords'] = coords
            # re-transform segmentation predictions.
            chunk_dict['seg_preds'] = np.flip(chunk_dict['seg_preds'], axis=3)
            results_list.append(chunk_dict)

            # third mirroring: y-axis and x-axis.
            batch['data'] = np.flip(np.flip(img, axis=2), axis=3).copy()
            chunk_dict = self.spatial_tiling_forward(batch, mirrored_patch_crops[2], n_aug='3')
            # re-transform coordinates.
            for ix in range(len(chunk_dict['boxes'])):
                for boxix in range(len(chunk_dict['boxes'][ix])):
                    coords = chunk_dict['boxes'][ix][boxix]['box_coords'].copy()
                    coords[0] = org_img_shape[2] - chunk_dict['boxes'][ix][boxix]['box_coords'][2]
                    coords[2] = org_img_shape[2] - chunk_dict['boxes'][ix][boxix]['box_coords'][0]
                    coords[1] = org_img_shape[3] - chunk_dict['boxes'][ix][boxix]['box_coords'][3]
                    coords[3] = org_img_shape[3] - chunk_dict['boxes'][ix][boxix]['box_coords'][1]
                    assert coords[2] >= coords[0], [coords, chunk_dict['boxes'][ix][boxix]['box_coords'].copy()]
                    assert coords[3] >= coords[1], [coords, chunk_dict['boxes'][ix][boxix]['box_coords'].copy()]
                    chunk_dict['boxes'][ix][boxix]['box_coords'] = coords
            # re-transform segmentation predictions.
            chunk_dict['seg_preds'] = np.flip(np.flip(chunk_dict['seg_preds'], axis=2), axis=3).copy()
            results_list.append(chunk_dict)

            # restore the un-mirrored image for downstream users of the batch.
            batch['data'] = img

        # aggregate all boxes/seg_preds per batch element from data_aug predictions.
        results_dict = {}
        results_dict['boxes'] = [[item for d in results_list for item in d['boxes'][batch_instance]]
                                 for batch_instance in range(org_img_shape[0])]
        results_dict['seg_preds'] = np.array([[item for d in results_list for item in d['seg_preds'][batch_instance]]
                                              for batch_instance in range(org_img_shape[0])])
        if self.mode == 'val':
            # losses are taken from the un-mirrored forward pass. they are not necessarily monitored.
            try:
                results_dict['torch_loss'] = results_list[0]['torch_loss']
                results_dict['class_loss'] = results_list[0]['class_loss']
            except KeyError:
                pass
        return results_dict
    def spatial_tiling_forward(self, batch, patch_crops=None, n_aug='0'):
        """
        forwards batch to batch_tiling_forward method and receives and returns a dictionary with results.
        if patch-based prediction, the results received from batch_tiling_forward will be on a per-patch-basis.
        this method uses the provided patch_crops to re-transform all predictions to whole-image coordinates.
        Patch-origin information of all box-predictions will be needed for consolidation, hence it is stored as
        'patch_id', which is a unique string for each patch (also takes current data aug and temporal epoch instances
        into account). all box predictions get additional information about the amount of overlapping patches at the
        respective position (used for consolidation).

        :param batch: dictionary holding the patient data. 'original_img_shape' is read for patched patients.
        :param patch_crops: crop coordinates of all patches (indexed below as pc[0]..pc[5]), or None if the
        prediction is not patch-based.
        :param n_aug: string identifying the current data augmentation instance. becomes part of the 'patch_id'.
        :return. results_dict: stores the results for one patient. dictionary with keys:
                 - 'boxes': list over batch elements. each element is a list over boxes, where each box is
                            one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions,
                            and a dummy batch dimension of 1 for 3D predictions.
                 - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z))
                 - losses (only in validation mode)
        """
        if patch_crops is not None:

            patches_dict = self.batch_tiling_forward(batch)

            results_dict = {'boxes': [[] for _ in range(batch['original_img_shape'][0])]}

            # instantiate segmentation output array. Will contain averages over patch predictions.
            out_seg_preds = np.zeros(batch['original_img_shape'], dtype=np.float16)[:, 0][:, None]
            # counts patch instances per pixel-position.
            patch_overlap_map = np.zeros_like(out_seg_preds, dtype='uint8')

            # unmold segmentation outputs. loop over patches.
            for pix, pc in enumerate(patch_crops):
                if self.cf.dim == 3:
                    out_seg_preds[:, :, pc[0]:pc[1], pc[2]:pc[3], pc[4]:pc[5]] += patches_dict['seg_preds'][pix][None]
                    patch_overlap_map[:, :, pc[0]:pc[1], pc[2]:pc[3], pc[4]:pc[5]] += 1
                else:
                    # in 2D, pc[4]:pc[5] selects the batch elements (slices) the patch belongs to.
                    out_seg_preds[pc[4]:pc[5], :, pc[0]:pc[1], pc[2]:pc[3], ] += patches_dict['seg_preds'][pix]
                    patch_overlap_map[pc[4]:pc[5], :, pc[0]:pc[1], pc[2]:pc[3], ] += 1

            # take mean in overlapping areas.
            out_seg_preds[patch_overlap_map > 0] /= patch_overlap_map[patch_overlap_map > 0]
            results_dict['seg_preds'] = out_seg_preds

            # unmold box outputs. loop over patches.
            for pix, pc in enumerate(patch_crops):
                patch_boxes = patches_dict['boxes'][pix]

                for box in patch_boxes:

                    # add unique patch id for consolidation of predictions.
                    box['patch_id'] = self.rank_ix + '_' + n_aug + '_' + str(pix)

                    # boxes from the edges of a patch have a lower prediction quality, than the ones at patch-centers.
                    # hence they will be downweighted for consolidation, using the 'box_patch_center_factor', which is
                    # obtained by a normal distribution over positions in the patch and average over spatial dimensions.
                    # Also the info 'box_n_overlaps' is stored for consolidation, which depicts the amount of
                    # overlapping patches at the box's position.
                    c = box['box_coords']
                    box_centers = [(c[ii] + c[ii + 2]) / 2 for ii in range(2)]
                    if self.cf.dim == 3:
                        box_centers.append((c[4] + c[5]) / 2)
                    # the 'pc' inside the comprehension is the half patch size per dimension. it only shadows
                    # the patch-crop loop variable 'pc' within the comprehension. the pdf is rescaled by
                    # sqrt(2*pi)*scale, so a box at the exact patch center gets the factor 1.
                    box['box_patch_center_factor'] = np.mean(
                        [norm.pdf(bc, loc=pc, scale=pc * 0.8) * np.sqrt(2 * np.pi) * pc * 0.8 for bc, pc in
                         zip(box_centers, np.array(self.cf.patch_size) / 2)])
                    if self.cf.dim == 3:
                        # shift patch coordinates to whole-image coordinates. assumes box_coords is a numpy array,
                        # so that += updates box['box_coords'] in place - TODO confirm.
                        c += np.array([pc[0], pc[2], pc[0], pc[2], pc[4], pc[4]])
                        # NOTE(review): floor is applied to even positions of c and ceil to odd positions, which
                        # does not coincide with lower/upper borders if c is ordered (y1, x1, y2, x2, z1, z2)
                        # - confirm intent.
                        int_c = [int(np.floor(ii)) if ix%2 == 0 else int(np.ceil(ii)) for ix, ii in enumerate(c)]
                        # NOTE(review): c[0]/c[2] were shifted by pc[0] (the offset used on array axis 2 above) and
                        # c[1]/c[3] by pc[2] (axis 3), yet this slice applies int_c[1]:int_c[3] to axis 2 and
                        # int_c[0]:int_c[2] to axis 3 - confirm the intended axis order.
                        box['box_n_overlaps'] = np.mean(patch_overlap_map[:, :, int_c[1]:int_c[3], int_c[0]:int_c[2], int_c[4]:int_c[5]])
                        results_dict['boxes'][0].append(box)
                    else:
                        c += np.array([pc[0], pc[2], pc[0], pc[2]])
                        int_c = [int(np.floor(ii)) if ix % 2 == 0 else int(np.ceil(ii)) for ix, ii in enumerate(c)]
                        # same axis-order question as in the 3D branch above - TODO confirm.
                        box['box_n_overlaps'] = np.mean(patch_overlap_map[pc[4], :, int_c[1]:int_c[3], int_c[0]:int_c[2]])
                        # in 2D, the box is assigned to the batch element (slice) with index pc[4].
                        results_dict['boxes'][pc[4]].append(box)

            if self.mode == 'val':
                # pass losses through. they are not necessarily monitored.
                try:
                    results_dict['torch_loss'] = patches_dict['torch_loss']
                    results_dict['class_loss'] = patches_dict['class_loss']
                except KeyError:
                    pass
        # if predictions are not patch-based:
        # add patch-origin info to boxes (entire image is the same patch with overlap=1) and return results.
        else:
            results_dict = self.batch_tiling_forward(batch)
            for b in results_dict['boxes']:
                for box in b:
                    box['box_patch_center_factor'] = 1
                    box['box_n_overlaps'] = 1
                    box['patch_id'] = self.rank_ix + '_' + n_aug

        return results_dict
dictionary with keys: - 'boxes': list over batch elements. each element is a list over boxes, where each box is one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions, and a dummy batch dimension of 1 for 3D predictions. - 'seg_preds': pixel-wise predictions. (b, 1, y, x, (z)) - losses (only in validation mode) """ #self.logger.info('forwarding (patched) patient with shape: {}'.format(batch['data'].shape)) img = batch['data'] if img.shape[0] <= self.cf.batch_size: if self.mode == 'val': # call training method to monitor losses results_dict = self.net.train_forward(batch, is_validation=True) # discard returned ground-truth boxes (also training info boxes). results_dict['boxes'] = [[box for box in b if box['box_type'] == 'det'] for b in results_dict['boxes']] else: results_dict = self.net.test_forward(batch, return_masks=self.cf.return_masks_in_test) else: split_ixs = np.split(np.arange(img.shape[0]), np.arange(img.shape[0])[::self.cf.batch_size]) chunk_dicts = [] for chunk_ixs in split_ixs[1:]: # first split is elements before 0, so empty b = {k: batch[k][chunk_ixs] for k in batch.keys() if (isinstance(batch[k], np.ndarray) and batch[k].shape[0] == img.shape[0])} if self.mode == 'val': chunk_dicts += [self.net.train_forward(b, is_validation=True)] else: chunk_dicts += [self.net.test_forward(b, return_masks=self.cf.return_masks_in_test)] results_dict = {} # flatten out batch elements from chunks ([chunk, chunk] -> [b, b, b, b, ...]) results_dict['boxes'] = [item for d in chunk_dicts for item in d['boxes']] results_dict['seg_preds'] = np.array([item for d in chunk_dicts for item in d['seg_preds']]) if self.mode == 'val': try: # estimate metrics by mean over batch_chunks. Most similar to training metrics. 
results_dict['torch_loss'] = torch.mean(torch.cat([d['torch_loss'] for d in chunk_dicts])) results_dict['class_loss'] = np.mean([d['class_loss'] for d in chunk_dicts]) except KeyError: # losses are not necessarily monitored pass # discard returned ground-truth boxes (also training info boxes). results_dict['boxes'] = [[box for box in b if box['box_type'] == 'det'] for b in results_dict['boxes']] return results_dict def apply_wbc_to_patient(inputs): """ wrapper around prediction box consolidation: weighted cluster scoring (wcs). processes a single patient. loops over batch elements in patient results (1 in 3D, slices in 2D) and foreground classes, aggregates and stores results in new list. :return. patient_results_list: list over batch elements. each element is a list over boxes, where each box is one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D predictions, and a dummy batch dimension of 1 for 3D predictions. :return. pid: string. patient id. """ in_patient_results_list, pid, class_dict, wcs_iou, n_ens = inputs out_patient_results_list = [[] for _ in range(len(in_patient_results_list))] for bix, b in enumerate(in_patient_results_list): for cl in list(class_dict.keys()): boxes = [(ix, box) for ix, box in enumerate(b) if (box['box_type'] == 'det' and box['box_pred_class_id'] == cl)] box_coords = np.array([b[1]['box_coords'] for b in boxes]) box_scores = np.array([b[1]['box_score'] for b in boxes]) box_center_factor = np.array([b[1]['box_patch_center_factor'] for b in boxes]) box_n_overlaps = np.array([b[1]['box_n_overlaps'] for b in boxes]) box_patch_id = np.array([b[1]['patch_id'] for b in boxes]) if 0 not in box_scores.shape: keep_scores, keep_coords = weighted_box_clustering( np.concatenate((box_coords, box_scores[:, None], box_center_factor[:, None], box_n_overlaps[:, None]), axis=1), box_patch_id, wcs_iou, n_ens) for boxix in range(len(keep_scores)): out_patient_results_list[bix].append({'box_type': 'det', 'box_coords': 
def apply_wbc_to_patient(inputs):
    """
    wrapper around prediction box consolidation: weighted cluster scoring (wcs). processes a single patient.
    loops over batch elements in patient results (1 in 3D, slices in 2D) and foreground classes,
    clusters the detections of each class and stores the consolidated boxes in a new list.
    ground truth boxes are passed through unchanged. detections of classes not in class_dict are dropped.

    :param inputs: list [patient_results_list, pid, class_dict, wcs_iou, n_ens] (packed into one argument for
    use with multiprocessing's map).
    :return. patient_results_list: list over batch elements. each element is a list over boxes, where each box is
                                   one dictionary: [[box_0, ...], [box_n,...]]. batch elements are slices for 2D
                                   predictions, and a dummy batch dimension of 1 for 3D predictions.
    :return. pid: string. patient id.
    """
    patient_boxes, pid, class_dict, wcs_iou, n_ens = inputs
    consolidated = []

    for element_boxes in patient_boxes:
        element_out = []
        for cl in class_dict:
            # 'box_pred_class_id' is only looked up for detections (gt boxes do not carry it).
            dets = [box for box in element_boxes
                    if box['box_type'] == 'det' and box['box_pred_class_id'] == cl]
            if not dets:
                continue

            # one row per detection: coordinates, score, patch center factor, number of overlaps.
            cluster_input = np.column_stack((
                np.array([box['box_coords'] for box in dets]),
                np.array([box['box_score'] for box in dets]),
                np.array([box['box_patch_center_factor'] for box in dets]),
                np.array([box['box_n_overlaps'] for box in dets]),
            ))
            patch_ids = np.array([box['patch_id'] for box in dets])

            keep_scores, keep_coords = weighted_box_clustering(cluster_input, patch_ids, wcs_iou, n_ens)
            for score, coords in zip(keep_scores, keep_coords):
                element_out.append({'box_type': 'det', 'box_coords': coords,
                                    'box_score': score, 'box_pred_class_id': cl})

        # add gt boxes back to new output list.
        element_out.extend(box for box in element_boxes if box['box_type'] == 'gt')
        consolidated.append(element_out)

    return [consolidated, pid]
def merge_2D_to_3D_preds_per_patient(inputs):
    """
    wrapper around 2Dto3D merging operation. Processes a single patient. Takes 2D patient results (slices in batch
    dimension) and returns 3D patient results (dummy batch dimension of 1). Per foreground class, the detections of
    all slices are collected together with their slice index and handed to nms_2to3D, which selects the boxes to
    keep and their z-extent. ground truth boxes of all slices are appended unchanged.

    :param inputs: list [patient_results_list, pid, class_dict, merge_3D_iou] (packed into one argument for
    use with multiprocessing's map).
    :return. results_dict_boxes: list over batch elements (1 in 3D). each element is a list over boxes, where each
                                 box is one dictionary: [[box_0, ...], [box_n,...]].
    :return. pid: string. patient id.
    """
    patient_boxes, pid, class_dict, merge_3D_iou = inputs
    merged = []

    for cl in class_dict:
        # collect box predictions over batch dimension (slices) and remember the slice each one came from.
        dets, det_slices = [], []
        for slice_ix, slice_boxes in enumerate(patient_boxes):
            for box in slice_boxes:
                if box['box_type'] == 'det' and box['box_pred_class_id'] == cl:
                    dets.append(box)
                    det_slices.append(slice_ix)
        if not dets:
            continue

        box_coords = np.array([box['box_coords'] for box in dets])
        box_scores = np.array([box['box_score'] for box in dets])
        slice_ids = np.array(det_slices)

        nms_input = np.concatenate((box_coords, box_scores[:, None], slice_ids[:, None]), axis=1)
        keep_ix, keep_z = nms_2to3D(nms_input, merge_3D_iou)

        # store kept predictions in new results list and add corresponding z-dimension info to coordinates.
        for kix, kz in zip(keep_ix, keep_z):
            merged.append({'box_type': 'det', 'box_coords': list(box_coords[kix]) + kz,
                           'box_score': box_scores[kix], 'box_pred_class_id': cl})

    gt_boxes = [box for slice_boxes in patient_boxes for box in slice_boxes if box['box_type'] == 'gt']
    if gt_boxes:
        assert all(len(box["box_coords"]) == 6 for box in gt_boxes), "expanded preds to 3D but GT is 2D."
    merged += gt_boxes

    # add dummy batch dimension 1 for 3D.
    return [[merged], pid]
def weighted_box_clustering(dets, box_patch_id, thresh, n_ens):
    """
    Consolidate overlapping predictions (patch overlaps, test-time augmentation,
    temporal ensembling) into one weighted-average box per cluster.

    Clusters are built like in NMS: the highest scoring remaining box collects all
    boxes with iou > thresh. Each member is weighted by its overlap with the cluster
    center, its area and its patch-center factor. The cluster score is the weighted
    score sum divided by the weights of the existing members plus a mean weight for
    every expected-but-missing prediction, where the expected number is
    n_ens * mean(box_n_ovs of the cluster) and present predictions are counted as
    the number of unique patch ids in the cluster. Coordinates are averaged over the
    existing members only. Clusters scoring <= 0.01 are dropped.

    :param dets: (n_dets, (y1, x1, y2, x2, (z1), (z2), scores, box_pc_facts, box_n_ovs)).
        7 columns are treated as 2D, anything else as 3D.
    :param box_patch_id: array indexable like dets' rows, id of the patch a box came from.
    :param thresh: iou threshold for matching.
    :param n_ens: number of ensembled models.
    :return: keep_scores: list (n_keep) of new scores.
    :return: keep_coords: list (n_keep) of [y1, x1, y2, x2, (z1), (z2)] averages.
    """
    is_3d = dets.shape[1] != 7
    y1, x1, y2, x2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    scores, box_pc_facts, box_n_ovs = dets[:, -3], dets[:, -2], dets[:, -1]

    areas = (y2 - y1 + 1) * (x2 - x1 + 1)
    coord_columns = [y1, x1, y2, x2]
    if is_3d:
        z1, z2 = dets[:, 4], dets[:, 5]
        areas *= (z2 - z1 + 1)
        coord_columns += [z1, z2]

    # indices of all boxes still unassigned, best score first.
    order = scores.argsort()[::-1]
    keep_scores, keep_coords = [], []

    while order.size > 0:
        i = order[0]  # highest scoring remaining box = cluster center.

        inter = (np.maximum(0.0, np.minimum(x2[i], x2[order]) - np.maximum(x1[i], x1[order]) + 1)
                 * np.maximum(0.0, np.minimum(y2[i], y2[order]) - np.maximum(y1[i], y1[order]) + 1))
        if is_3d:
            inter *= np.maximum(0.0, np.minimum(z2[i], z2[order]) - np.maximum(z1[i], z1[order]) + 1)

        # iou between the cluster center and every remaining box.
        ovr = inter / (areas[i] + areas[order] - inter)

        # members of the current cluster (column-vector index, as returned by argwhere).
        matches = np.argwhere(ovr > thresh)
        members = order[matches]

        match_score_weights = ovr[matches] * areas[members] * box_pc_facts[members]
        match_scores = scores[members]  # fancy indexing copies, scores stays untouched.
        match_scores *= match_score_weights

        # one prediction per patch is expected; boxes of a cluster may lie in regions with
        # different patch overlap counts, hence the mean.
        n_expected_preds = n_ens * np.mean(box_n_ovs[members])
        n_present_preds = np.unique(box_patch_id[members]).shape[0]
        n_missing_preds = np.max((0, n_expected_preds - n_present_preds))

        # missing predictions enter the denominator with the mean weight of the cluster.
        denom = np.sum(match_score_weights) + n_missing_preds * np.mean(match_score_weights)
        avg_score = np.sum(match_scores) / denom

        # coordinates: weighted average over existing predictions only.
        score_sum = np.sum(match_scores)
        avg_coords = [np.sum(column[members] * match_scores) / score_sum for column in coord_columns]

        # conservative cut-off for clusters dominated by missing predictions (speeds up evaluation).
        if avg_score > 0.01:
            keep_scores.append(avg_score)
            keep_coords.append(avg_coords)

        # continue with everything that was not matched.
        order = order[np.where(ovr <= thresh)[0]]

    return keep_scores, keep_coords
def nms_2to3D(dets, thresh):
    """
    Merge 2D boxes of all slices into 3D cubes with an adapted non-maximum suppression.

    All boxes are projected into one plane and clustered like in NMS (iou > thresh with
    the highest scoring remaining box). Only boxes whose slice is 'connected' to the slice
    of the cluster center are suppressed: starting at the center slice, the cluster extends
    in both z-directions until the first slice without a matching prediction (a 'hole').

    Example: center slice 50, matching predictions in 23, 46, 48, 49, 51, 52, 53, 56, 57.
    Slices 48-53 form the cube; 46 is cut off by the hole at 47. The remaining boxes stay
    available for further clusters.

    Works better on predictions above a certain confidence, since noisy low-confidence
    boxes can bridge or break the hole-based z-boundaries.

    :param dets: (n_detections, (y1, x1, y2, x2, scores, slice_id)).
    :param thresh: iou matching threshold (like in NMS).
    :return: keep: list (n_keep) of indices into dets of the boxes to be kept.
    :return: keep_z: list (n_keep) of [z1, z2]: lowest clustered slice - 1 and highest
        clustered slice + 1, to be appended to the kept boxes to form cubes.
    """
    y1, x1, y2, x2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    scores, slice_id = dets[:, -2], dets[:, -1]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)

    keep, keep_z = [], []
    order = scores.argsort()[::-1]  # remaining box indices, best score first.

    while order.size > 0:
        i = order[0]  # highest scoring remaining box = cluster center.

        inter_w = np.maximum(0.0, np.minimum(x2[i], x2[order]) - np.maximum(x1[i], x1[order]) + 1)
        inter_h = np.maximum(0.0, np.minimum(y2[i], y2[order]) - np.maximum(y1[i], y1[order]) + 1)
        inter = inter_w * inter_h
        ovr = inter / (areas[i] + areas[order] - inter)

        # positions (within order) of all boxes overlapping the center in xy, center included.
        matches = np.flatnonzero(ovr > thresh)
        slice_ids = slice_id[order[matches]]
        core_slice = slice_id[int(i)]

        # slices between the center and the outermost match that carry no matching prediction.
        above = np.arange(core_slice, np.max(slice_ids))
        below = np.arange(np.min(slice_ids), core_slice)
        upper_holes = above[~np.isin(above, slice_ids)]
        lower_holes = below[~np.isin(below, slice_ids)]

        # the cube ends at the hole closest to the center, or at the outermost match.
        max_valid_slice_id = np.min(upper_holes) if upper_holes.size > 0 else np.max(slice_ids)
        min_valid_slice_id = np.max(lower_holes) if lower_holes.size > 0 else np.min(slice_ids)
        z_matches = matches[(slice_ids <= max_valid_slice_id) & (slice_ids >= min_valid_slice_id)]

        cube_slices = slice_id[order[z_matches]]
        keep.append(i)
        keep_z.append([np.min(cube_slices) - 1, np.max(cube_slices) + 1])

        # only the z-connected boxes are suppressed.
        order = np.delete(order, z_matches, axis=0)

    return keep, keep_z
def get_mirrored_patch_crops(patch_crops, org_img_shape):
    """
    Apply 3 mirror transformations to the given patch crop coordinates and return the
    transformed coordinates. Handles crops with 4 (2D) or 6 (3D) entries.

    Judging by the indexing, entries 0/1 are the start/stop along the axis of length
    org_img_shape[2] and entries 2/3 the start/stop along the axis of length
    org_img_shape[3]; entries 4/5 (3D only) are copied unchanged.

    :param patch_crops: list of crops, each a list of coordinates.
    :param org_img_shape: shape of the patient volume used as world coordinates.
    :return: list of length 3 (first axis mirrored, second axis mirrored, both mirrored),
        each element a list of transformed patch crops.
    """

    def _mirror(crop, flip_first, flip_second):
        # mirroring an interval swaps and reflects its borders: [a, b] -> [size - b, size - a].
        if flip_first:
            first = [org_img_shape[2] - crop[1], org_img_shape[2] - crop[0]]
        else:
            first = [crop[0], crop[1]]
        if flip_second:
            second = [org_img_shape[3] - crop[3], org_img_shape[3] - crop[2]]
        else:
            second = [crop[2], crop[3]]
        rest = [] if len(crop) == 4 else [crop[4], crop[5]]
        return first + second + rest

    flips = ((True, False), (False, True), (True, True))
    return [[_mirror(crop, flip_first, flip_second) for crop in patch_crops]
            for flip_first, flip_second in flips]
# configs.py) before the actual sbatch job is submitted, since the job might pend in queue before
# execution --> hazard of job-specific files being unintentionally changed during queue wait time.
# positional
# -arg #1 identifies the folder name of the dataset-related code (e.g. >toy_exp< or >lidc_exp<) within the code source directory
# -arg #2 is the experiment and first part of the job name,
# optional args and flags:
# -c / --create: (flag) whether to create the exp, i.e., if this is a new start of the exp with configs etc from source dir.
# -f / --folds FOLDS: (option) fold(s) to run on (FOLDS needs to be only one int or string of multiple ints separated by space), default None (-->set to all in config)
# -m / --mode MODE: (option) string, one of "train", "train_test", "test", defaults to "train_test"
# -p / --exp_parent_dir: (option) name of parent_dir rel to dataset folder on cluster. exp_dir is exp_parent_dir/exp_name, if not given defaults to "experiments"
# -q / --queue: (option) which queue (-q parameter for bsub) to send job to. default: gputest. others: gputest-short (max 5h jobs).
# -w / --which: (option) same as argument -m to bsub; host or host list (string separated by space) to send the job to.
#     use nodenameXX where XX==nr of node or nodenameXX,nodenameYY,... or nodename[XX-YY]. nodename is e.g. e132-comp.
# -R / --resource: (option) string that is appended verbatim to the bsub command line
#     (presumably a complete bsub resource option -- TODO confirm intended usage).
# --gmem: (option) how much gpu memory to request for job (in gigabytes), defaults to 11. Currently, the smaller nodes have 11.9G, the larger ones 31.7G.
# --resume: (flag) only with explicit fold argument, if set, resumes from checkpoint in exp_dir/fold_x/last_state.pth.
# --no_parallel: (flag) if set, folds won't start as parallel jobs on cluster, but run sequentially in one job.

dataset_name="${1}"
exp_name="${2}"

#arguments not passed, e.g. $7 if no seventh argument, are null.
if [ ! -z "${18}" ]; then #-z checks if is null string
  echo "Error: Received too many arguments."
  exit 1
fi

#make args optional: move up if some args are missing inbetween
while [ ${#} -gt 2 ]; do
  case "${3}" in
    -c|--create)
      create_exp="c"
      shift
      ;;
    -f|--folds)
      folds="${4}"
      shift; shift
      ;;
    -m|--mode)
      mode="${4}"
      shift; shift
      ;;
    -p|--exp_parent_dir)
      exp_parent_dir="${4}"
      shift; shift
      ;;
    -q|--queue)
      queue="${4}"
      shift; shift
      ;;
    -w|--which)
      which="${4}"
      shift; shift
      ;;
    -R|--resource)
      resource="${4}"
      shift; shift
      ;;
    --gmem)
      gmem="${4}"
      shift; shift
      ;;
    --resume)
      resume=true
      shift
      ;;
    --no_parallel)
      no_parallel=true
      shift
      ;;
    *)
      echo "Invalid argument/option passed: ${3}"
      exit 1
      ;;
  esac
done

# default values (variables are quoted in all tests so that multi-word values do not break [ ])
if [ -z "${exp_parent_dir}" ]; then
  exp_parent_dir="experiments"
fi
if [ -z "${mode}" ]; then
  mode="train_test"
fi
if [ -z "${queue}" ]; then
  queue="gputest"
fi
if [ -z "${gmem}" ]; then
  gmem="11"
fi

# asks the question passed as $1; succeeds on y / yes / Y or empty input (default is yes).
confirm() {
  echo "${1}"; read confirmation
  [ "${confirmation}" = "y" ] || [ "${confirmation}" = "yes" ] || [ "${confirmation}" = "Y" ] || [ -z "${confirmation}" ]
}

root_dir=/home/ramien #assumes /home/ramien exists
prep_node=ramien@e132-comp07 #node used for prep tasks like create_exp
#medicaldetectiontoolkit
source_dir=${root_dir}/mdt-public

dataset_abs_path=${source_dir}/experiments/${dataset_name} #set as second argument passed to this script
exp_parent_dir=/datasets/datasets_ramien/${dataset_name}/${exp_parent_dir}
#exp_parent_dir=/home/gregor/Documents/medicaldetectiontoolkit/datasets/${dataset_name}/experiments #for testing this script
# /dataset is not mounted on log-in/job submission nodes (would maybe make sense, I feel), only on queue gputest's nodes e132-compXX.
#ssh ${prep_node} "mkdir -p ${exp_parent_dir}"
exp_dir=${exp_parent_dir}/${exp_name}

#activate virtualenv that has all the packages:
source_dl="module load python/3.7.0; module load gcc/7.2.0; source ${root_dir}/.virtualenvs/mdt/bin/activate;"

# TODO as long as no fix available: this script needs to be started directly from the prep node. :/ would be nice if (most importantly
# 'module ...') would also work over ssh, but somehow some commands are not availabe over the ssh-induced shell (even when using it as interactive).
eval ${source_dl}

# ssh: (not working)
#create_cmd="ssh ${prep_node} '${source_dl} python ${source_dir}/exec.py --server_env --mode create_exp --exp_dir ${exp_dir} --exp_source ${dataset_abs_path};'"
# directly from prep node:
create_cmd="python ${source_dir}/exec.py --server_env --mode create_exp --exp_dir ${exp_dir} --exp_source ${dataset_abs_path};"

#if create_exp, check if would overwrite existing exp_dir
if [ "${create_exp}" = "c" ]; then
  if [ -d "${exp_dir}" ]; then
    if confirm "Please confirm to overwrite exp ${exp_name} settings, (Y/n): "; then
      echo "Overwriting ${exp_name}"
    else
      echo "Exiting due to overwrite denial. Adjust options."
      exit
    fi
  fi
  #echo "opts: name ${exp_name}, ${source_dir}/exec.py --server_env --mode create_exp --exp_dir ${exp_dir} --exp_source ${dataset_abs_path}"
  echo "Creating ${exp_name}"
  eval ${create_cmd}
else
  if [ ! -d "${exp_dir}" ]; then
    echo "Experiment directory ${exp_dir} does not exist."
    if confirm "Run create_exp? (Y/n): "; then
      echo "Creating ${exp_name}"
      eval ${create_cmd}
    fi
  fi
fi

#if not create_exp, check if would overwrite existing folds (possibly valuable trained params!)
if [ -z "${create_exp}" ] && ([ "${mode}" = "train" ] || [ "${mode}" = "train_test" ]) && [ -z "${resume}" ]; then
  for f in ${folds}; do #if folds is null this check won't apply and folds will be quietly overwritten.
    if [ -d "${exp_dir}/fold_${f}" ]; then #-d checks if is dir
      if confirm "please confirm to overwrite fold_${f}, (Y/n):"; then
        echo "Overwriting ${exp_name}/fold_${f}"
      else
        echo "Exiting due to overwrite denial. Adjust options."
        exit
      fi
    fi
  done
fi

bsub_opts="bsub -N -q ${queue} -gpu num=1:j_exclusive=yes:mode=exclusive_process:gmem=${gmem}G"
if [ ! -z "${resource}" ]; then
  # must be one quoted assignment: unquoted, the shell would run ${resource} as a command
  # (with bsub_opts merely exported into its environment) instead of appending it.
  bsub_opts="${bsub_opts} ${resource}"
fi
if [ ! -z "${which}" ]; then
  # quoted for the later eval so that a space-separated host list stays one -m argument.
  bsub_opts="${bsub_opts} -m '${which}'"
fi

#----- parallel/separate fold jobs (each fold in a single job) -----------
if [ ! -z "${folds}" ] && [ -z "${no_parallel}" ]; then
  for f in ${folds}; do
    out_file=${exp_dir}/logs/fold_${f}_lsf_output.out
    # per-fold copy: appending to bsub_opts itself would pile up -J/-oo options of earlier folds.
    fold_opts="${bsub_opts} -J '${dataset_name} ${exp_name} fold ${f} ${mode}' -oo '${out_file}'"
    eval "${fold_opts} sh cluster_runner_meddec.sh ${source_dir} ${exp_dir} ${dataset_abs_path} ${mode} ${f} ${resume}"
  done
#----- consecutive folds job (all folds in one single job) -----------
else
  if [ ! -z "${resume}" ]; then
    echo "You need to explicitly specify folds if you would like to resume from a checkpoint. Exiting."
    exit 1
  fi
  out_file=${exp_dir}/logs/lsf_output.out
  bsub_opts="${bsub_opts} -J '${dataset_name} ${exp_name} folds ${folds} ${mode}' -oo '${out_file}'"
  eval "${bsub_opts} sh cluster_runner_meddec.sh ${source_dir} ${exp_dir} ${dataset_abs_path} ${mode} ${folds} ${resume}"
  echo "Started in no parallel, folds:" ${folds}
fi
class CombinedLogger(object):
    """Combine a python logger (file + console) with a tensorboard SummaryWriter.

    Attributes that are not defined on this class are looked up on the python logger
    first and on the tensorboard writer second, so that e.g. ``logger.info(...)`` and
    ``logger.add_scalars(...)`` both work on one object.
    """

    def __init__(self, name, log_dir, server_env=True, fold="all"):
        """
        :param name: name of the underlying python logger.
        :param log_dir: directory receiving the tensorboard files (log_dir/tboard) and the
            text log (log_dir/fold_<fold>/exec.log).
        :param server_env: if False, console output goes through ColorHandler, else through
            a plain StreamHandler.
        :param fold: fold identifier used in the log file path.
        """
        self.pylogger = logging.getLogger(name)
        self.tboard = SummaryWriter(log_dir=os.path.join(log_dir, "tboard"))
        self.log_dir = log_dir
        self.fold = str(fold)
        self.server_env = server_env

        self.pylogger.setLevel(logging.DEBUG)
        self.log_file = os.path.join(log_dir, "fold_" + self.fold, 'exec.log')
        self._attach_handlers()
        self.pylogger.propagate = False

    def _attach_handlers(self):
        """add a file handler for self.log_file and one console handler to the python logger."""
        os.makedirs(os.path.dirname(self.log_file), exist_ok=True)
        self.pylogger.addHandler(logging.FileHandler(self.log_file))
        if not self.server_env:
            self.pylogger.addHandler(ColorHandler())
        else:
            self.pylogger.addHandler(logging.StreamHandler())

    def __getattr__(self, attr):
        """delegate all undefined attribute requests to pylogger, then tboard (first found,
        first served). E.g., combinedlogger.add_scalars(...) triggers self.tboard.add_scalars(...).

        :raises AttributeError: if neither delegate provides the attribute, so that hasattr(),
            copy and pickle see regular python semantics instead of a silent None.
        """
        # read the delegates from the instance dict: going through self.pylogger here would
        # re-enter __getattr__ and recurse endlessly whenever the attribute is not set
        # (e.g. after __del__ or on an instance created without __init__).
        for name in ("pylogger", "tboard"):
            obj = self.__dict__.get(name)
            if obj is not None and hasattr(obj, attr):
                return getattr(obj, attr)
        raise AttributeError("{!r} object has no attribute {!r}".format(type(self).__name__, attr))

    def set_logfile(self, fold=None, log_file=None):
        """redirect the text log: close and drop all current handlers, then attach fresh ones.

        :param fold: if given, becomes the current fold.
        :param log_file: explicit log file path; defaults to log_dir/fold_<fold>/exec.log.
        """
        if fold is not None:
            self.fold = str(fold)
        if log_file is None:
            self.log_file = os.path.join(self.log_dir, "fold_" + self.fold, 'exec.log')
        else:
            self.log_file = log_file

        for hdlr in self.pylogger.handlers:
            hdlr.close()
        self.pylogger.handlers = []
        self._attach_handlers()

    def metrics2tboard(self, metrics, global_step=None, suptitle=None):
        """write the latest entry of every recorded metric to tensorboard.

        :param metrics: {'train': dataframe, 'val':df}, df as produced in
            evaluator.py.evaluate_predictions, plus an 'lr' entry that is passed to
            add_scalars as is.
        :param global_step: step to log at; defaults to the index of the last entry of the
            first train metric.
        :param suptitle: tag prefix; defaults to "Fold_<current fold>".
        """
        if global_step is None:
            global_step = len(metrics['train'][list(metrics['train'].keys())[0]]) - 1
        if suptitle is not None:
            suptitle = str(suptitle)
        else:
            suptitle = "Fold_" + str(self.fold)

        for key in ['train', 'val']:
            loss_series = {}
            mon_met_series = {}
            for tag, val in metrics[key].items():
                val = val[-1]  # maybe remove list wrapping, recording in evaluator?
                if np.isnan(val):
                    continue  # nothing recorded for this metric in the latest step.
                if 'loss' in tag.lower():
                    loss_series["{}".format(tag)] = val
                else:
                    mon_met_series["{}".format(tag)] = val

            self.tboard.add_scalars(suptitle + "/Losses/{}".format(key), loss_series, global_step)
            self.tboard.add_scalars(suptitle + "/Monitor_Metrics/{}".format(key), mon_met_series, global_step)
        self.tboard.add_scalars(suptitle + "/Learning_Rate", metrics["lr"], global_step)
        return

    def __del__(self):  # otherwise might produce multiple prints e.g. in ipython console
        # tolerate partially initialised instances: __init__ may have failed half way.
        pylogger = self.__dict__.pop("pylogger", None)
        if pylogger is not None:
            for hdlr in pylogger.handlers:
                hdlr.close()
            pylogger.handlers = []
        tboard = self.__dict__.get("tboard")
        if tboard is not None:
            tboard.flush()
            # close somehow prevents main script from exiting
            # maybe revise this issue in a later pytorch version
            #tboard.close()
def get_logger(exp_dir, server_env=False):
    """
    Create the CombinedLogger of an experiment, writing to file, terminal and tensorboard.

    :param exp_dir: experiment directory; all logs are placed in its "logs" sub-directory.
    :param server_env: True if operating in server environment (e.g., gpu cluster).
    :return: custom CombinedLogger instance.
    """
    logger = CombinedLogger('medicaldetectiontoolkit', os.path.join(exp_dir, "logs"), server_env=server_env)
    print("Logging to {}".format(logger.log_file))
    return logger
def prep_exp(dataset_path, exp_path, server_env, use_stored_settings=True, is_training=True):
    """
    I/O handling, creating of experiment folder structure. Also creates a snapshot of
    configs/model scripts and copies them to the exp_dir. This way the exp_dir contains all
    info needed to conduct an experiment, independent of changes in the actual source code.

    :param dataset_path: path to source code for specific data set. (e.g. medicaldetectiontoolkit/lidc_exp)
    :param exp_path: path to experiment directory.
    :param server_env: boolean flag. passed on to the configs script.
    :param use_stored_settings: boolean flag. When starting training: If True, starts training
        from the snapshot in the existing experiment directory, else (over)writes the snapshot
        in the experiment directory using configs/model scripts from source code.
    :param is_training: boolean flag. distinguishes train vs. inference mode. Inference always
        uses the stored snapshot.
    :return: cf: configs object with exp_dir, test_dir, plot_dir, experiment_name and
        created_fold_id_pickle set.
    :raises Exception: if training from stored settings is requested but model.py and/or
        backbone.py are missing in the experiment directory.
    :raises OSError: if a snapshot file cannot be copied (previously the failing shell `cp`
        was silently ignored and the experiment continued with an incomplete snapshot).
    """
    # function-scope import keeps this block self-contained w.r.t. the file's import section.
    import shutil

    stored_model = os.path.join(exp_path, 'model.py')
    stored_backbone = os.path.join(exp_path, 'backbone.py')

    if is_training and not use_stored_settings:
        # this case overwrites settings files in exp dir, i.e., default_configs, configs, backbone, model.
        # files are copied via shutil instead of shell strings, so paths with spaces or shell
        # metacharacters are handled correctly.
        os.makedirs(exp_path, exist_ok=True)
        shutil.copy('default_configs.py', os.path.join(exp_path, 'default_configs.py'))
        shutil.copy(os.path.join(dataset_path, 'configs.py'), os.path.join(exp_path, 'configs.py'))
        cf_file = import_module('cf_file', os.path.join(dataset_path, 'configs.py'))
        cf = cf_file.configs(server_env)
        # training runs on the source scripts (cf.*_path untouched); the copies are for later testing.
        shutil.copy(cf.model_path, stored_model)
        shutil.copy(cf.backbone_path, stored_backbone)
        fold_ids_file = os.path.join(exp_path, "fold_ids.pickle")
        if os.path.isfile(fold_ids_file):
            os.remove(fold_ids_file)
    else:
        # training from snapshot or testing: use configs, model and backbone stored in exp dir.
        cf_file = import_module('cf_file', os.path.join(exp_path, 'configs.py'))
        cf = cf_file.configs(server_env)
        if is_training and not (os.path.isfile(stored_model) and os.path.isfile(stored_backbone)):
            raise Exception(
                "Selected use_stored_settings option but no model and/or backbone source files exist in exp dir.")
        cf.model_path = stored_model
        cf.backbone_path = stored_backbone

    cf.exp_dir = exp_path
    cf.test_dir = os.path.join(cf.exp_dir, 'test')
    cf.plot_dir = os.path.join(cf.exp_dir, 'plots')
    os.makedirs(cf.test_dir, exist_ok=True)
    os.makedirs(cf.plot_dir, exist_ok=True)
    # normpath first: a trailing slash must not yield an empty experiment name.
    cf.experiment_name = os.path.basename(os.path.normpath(exp_path))
    cf.created_fold_id_pickle = False

    return cf
def import_module(name, path):
    """
    Import a module dynamically from a file path (python 3 way).

    :param name: name given to module instance.
    :param path: path to module.
    :return: module: returned module instance.
    :raises ImportError: if no import spec/loader can be derived from path (e.g. unsupported
        file suffix). Errors raised while executing the module propagate unchanged.
    """
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        # spec_from_file_location returns None instead of raising; fail with a clear message
        # rather than an AttributeError on None further down.
        raise ImportError("cannot import module {!r} from {!r}".format(name, path))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
class ModelSelector:
    '''
    saves a checkpoint after each epoch in the folder 'last_checkpoint' (can be loaded to continue
    interrupted training). saves the top-k (k=cf.save_n_models) ranked epochs in folders
    '<epoch>_best_checkpoint'. In inference, predictions of multiple epochs can be ensembled to
    improve performance.
    '''

    def __init__(self, cf, logger):
        """
        :param cf: configs object. read here: save_n_models, model_selection_criteria,
            min_save_thresh, fold_dir.
        :param logger: object with an info(msg) method.
        """
        self.cf = cf
        self.saved_epochs = [-1] * cf.save_n_models
        self.logger = logger

    def run_model_selection(self, net, optimizer, monitor_metrics, epoch):
        """
        rank all epochs so far, store the current epoch if it is among the top-k, remove stored
        epochs that dropped out of the top-k and always write the resumable last checkpoint.

        :param net: model providing state_dict().
        :param optimizer: optimizer providing state_dict().
        :param monitor_metrics: dict; monitor_metrics['val'][criterion] is a list with one entry
            per epoch, entry 0 being a placeholder (epochs start at 1).
        :param epoch: current epoch (1-based).
        """
        # function-scope import keeps this block self-contained w.r.t. the file's import section.
        import shutil

        # take the mean over all selection criteria in each epoch. None/nan count as 0.
        criteria_scores = np.array([[0 if (ii is None or np.isnan(ii)) else ii
                                     for ii in monitor_metrics['val'][sc]]
                                    for sc in self.cf.model_selection_criteria])
        non_nan_scores = np.mean(criteria_scores, 0)
        epochs_scores = list(non_nan_scores[1:])

        # ranking of epochs according to model_selection_criterion
        epoch_ranking = np.argsort(epochs_scores, kind="stable")[::-1] + 1  # epochs start at 1
        # if set in configs, epochs < min_save_thresh are discarded from saving process.
        epoch_ranking = epoch_ranking[epoch_ranking >= self.cf.min_save_thresh]
        top_k = epoch_ranking[:self.cf.save_n_models]

        # check if current epoch is among the top-k epochs.
        if epoch in top_k:
            save_dir = os.path.join(self.cf.fold_dir, '{}_best_checkpoint'.format(epoch))
            os.makedirs(save_dir, exist_ok=True)
            torch.save(net.state_dict(), os.path.join(save_dir, 'params.pth'))
            with open(os.path.join(save_dir, 'monitor_metrics.pickle'), 'wb') as handle:
                pickle.dump(monitor_metrics, handle)
            # save epoch_ranking to keep info for inference.
            np.save(os.path.join(self.cf.fold_dir, 'epoch_ranking'), top_k)
            np.save(os.path.join(save_dir, 'epoch_ranking'), top_k)

            self.logger.info(
                "saving current epoch {} at rank {}".format(epoch, np.argwhere(epoch_ranking == epoch)))

            # delete params of the epoch that just fell out of the top-k epochs.
            stored_epochs = [int(ii.split('_')[0]) for ii in os.listdir(self.cf.fold_dir)
                             if 'best_checkpoint' in ii]
            for se in stored_epochs:
                if se in epoch_ranking[self.cf.save_n_models:]:
                    # rmtree instead of a shell 'rm -rf' string: no shell parsing of the path.
                    # ignore_errors mirrors the forgiving behavior of 'rm -rf'.
                    shutil.rmtree(os.path.join(self.cf.fold_dir, '{}_best_checkpoint'.format(se)),
                                  ignore_errors=True)
                    self.logger.info('deleting epoch {} at rank {}'.format(se, np.argwhere(epoch_ranking == se)))

        state = {
            'epoch': epoch,
            'state_dict': net.state_dict(),
            'optimizer': optimizer.state_dict(),
        }

        # save checkpoint of current epoch (always overwritten, used for resuming).
        save_dir = os.path.join(self.cf.fold_dir, 'last_checkpoint')
        os.makedirs(save_dir, exist_ok=True)
        torch.save(state, os.path.join(save_dir, 'params.pth'))
        np.save(os.path.join(save_dir, 'epoch_ranking'), top_k)
        with open(os.path.join(save_dir, 'monitor_metrics.pickle'), 'wb') as handle:
            pickle.dump(monitor_metrics, handle)
save_dir = os.path.join(self.cf.fold_dir, 'last_checkpoint'.format(epoch)) if not os.path.exists(save_dir): os.mkdir(save_dir) torch.save(state, os.path.join(save_dir, 'params.pth')) np.save(os.path.join(save_dir, 'epoch_ranking'), epoch_ranking[:self.cf.save_n_models]) with open(os.path.join(save_dir, 'monitor_metrics.pickle'), 'wb') as handle: pickle.dump(monitor_metrics, handle) def load_checkpoint(checkpoint_path, net, optimizer): checkpoint = torch.load(os.path.join(checkpoint_path, 'params.pth')) net.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) with open(os.path.join(checkpoint_path, 'monitor_metrics.pickle'), 'rb') as handle: monitor_metrics = pickle.load(handle) starting_epoch = checkpoint['epoch'] + 1 return starting_epoch, net, optimizer, monitor_metrics def prepare_monitoring(cf): """ creates dictionaries, where train/val metrics are stored. """ metrics = {} # first entry for loss dict accounts for epoch starting at 1. metrics['train'] = OrderedDict() metrics['val'] = OrderedDict() metric_classes = [] if 'rois' in cf.report_score_level: metric_classes.extend([v for k, v in cf.class_dict.items()]) if 'patient' in cf.report_score_level: metric_classes.extend(['patient']) for cl in metric_classes: metrics['train'][cl + '_ap'] = [np.nan] metrics['val'][cl + '_ap'] = [np.nan] if cl == 'patient': metrics['train'][cl + '_auc'] = [np.nan] metrics['val'][cl + '_auc'] = [np.nan] return metrics def create_csv_output(results_list, cf, logger): """ Write out test set predictions to .csv file. output format is one line per prediction: PatientID | PredictionID | [y1 x1 y2 x2 (z1) (z2)] | score | pred_classID Note, that prediction coordinates correspond to images as loaded for training/testing and need to be adapted when plotted over raw data (before preprocessing/resampling). :param results_list: [[patient_results, patient_id], [patient_results, patient_id], ...] 
""" - logger.info('creating csv output file at {}'.format(os.path.join(cf.exp_dir, 'results.csv'))) + logger.info('creating csv output file at {}'.format(os.path.join(cf.test_dir, 'results.csv'))) predictions_df = pd.DataFrame(columns = ['patientID', 'predictionID', 'coords', 'score', 'pred_classID']) for r in results_list: pid = r[1] #optionally load resampling info from preprocessing to match output predictions with raw data. #with open(os.path.join(cf.exp_dir, 'test_resampling_info', pid), 'rb') as handle: # resampling_info = pickle.load(handle) for bix, box in enumerate(r[0][0]): if box["box_type"] == "gt": continue assert box['box_type'] == 'det', box['box_type'] coords = box['box_coords'] score = box['box_score'] pred_class_id = box['box_pred_class_id'] out_coords = [] if score >= cf.min_det_thresh: out_coords.append(coords[0]) #* resampling_info['scale'][0]) out_coords.append(coords[1]) #* resampling_info['scale'][1]) out_coords.append(coords[2]) #* resampling_info['scale'][0]) out_coords.append(coords[3]) #* resampling_info['scale'][1]) if len(coords) > 4: out_coords.append(coords[4]) #* resampling_info['scale'][2] + resampling_info['z_crop']) out_coords.append(coords[5]) #* resampling_info['scale'][2] + resampling_info['z_crop']) predictions_df.loc[len(predictions_df)] = [pid, bix, out_coords, score, pred_class_id] try: fold = cf.fold except: fold = 'hold_out' predictions_df.to_csv(os.path.join(cf.exp_dir, 'results_{}.csv'.format(fold)), index=False) class _AnsiColorizer(object): """ A colorizer is an object that loosely wraps around a stream, allowing callers to write text to the stream in a particular color. Colorizer classes must implement C{supported()} and C{write(text, color)}. 
""" _colors = dict(black=30, red=31, green=32, yellow=33, blue=34, magenta=35, cyan=36, white=37, default=39) def __init__(self, stream): self.stream = stream @classmethod def supported(cls, stream=sys.stdout): """ A class method that returns True if the current platform supports coloring terminal output using this method. Returns False otherwise. """ if not stream.isatty(): return False # auto color only on TTYs try: import curses except ImportError: return False else: try: try: return curses.tigetnum("colors") > 2 except curses.error: curses.setupterm() return curses.tigetnum("colors") > 2 except: raise # guess false in case of error return False def write(self, text, color): """ Write the given text to the stream in the given color. @param text: Text to be written to the stream. @param color: A string label for a color. e.g. 'red', 'white'. """ color = self._colors[color] self.stream.write('\x1b[%sm%s\x1b[0m' % (color, text)) class ColorHandler(logging.StreamHandler): def __init__(self, stream=sys.stdout): super(ColorHandler, self).__init__(_AnsiColorizer(stream)) def emit(self, record): msg_colors = { logging.DEBUG: "green", logging.INFO: "default", logging.WARNING: "red", logging.ERROR: "red" } color = msg_colors.get(record.levelno, "blue") self.stream.write(record.msg + "\n", color)