diff --git a/experiments/lidc_exp/configs.py b/experiments/lidc_exp/configs.py index 10b9123..f4a12e7 100644 --- a/experiments/lidc_exp/configs.py +++ b/experiments/lidc_exp/configs.py @@ -1,334 +1,334 @@ #!/usr/bin/env python # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ). # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== import sys import os sys.path.append(os.path.dirname(os.path.realpath(__file__))) import numpy as np from default_configs import DefaultConfigs class configs(DefaultConfigs): def __init__(self, server_env=None): ######################### # Preprocessing # ######################### self.root_dir = '/home/gregor/networkdrives/E130-Personal/Goetz/Datenkollektive/Lungendaten/Nodules_LIDC_IDRI' - self.raw_data_dir = '{}/data_nrrd'.format(self.root_dir) - self.pp_dir = '{}/pp_norm'.format(self.root_dir) + self.raw_data_dir = '{}/new_nrrd'.format(self.root_dir) + self.pp_dir = '/media/gregor/HDD2TB/data/lidc/lidc_mdt' self.target_spacing = (0.7, 0.7, 1.25) ######################### # I/O # ######################### # one out of [2, 3]. dimension the model operates in. self.dim = 3 # one out of ['mrcnn', 'retina_net', 'retina_unet', 'detection_unet', 'ufrcnn']. self.model = 'mrcnn' DefaultConfigs.__init__(self, self.model, server_env, self.dim) # int [0 < dataset_size]. select n patients from dataset for prototyping. If None, all data is used. self.select_prototype_subset = None # path to preprocessed data. self.pp_name = 'lidc_mdt' self.input_df_name = 'info_df.pickle' self.pp_data_path = '/media/gregor/HDD2TB/data/lidc/{}'.format(self.pp_name) self.pp_test_data_path = self.pp_data_path #change if test_data in separate folder. # settings for deployment in cloud. if server_env: # path to preprocessed data. self.pp_name = 'pp_fg_slices' self.crop_name = 'pp_fg_slices_packed' self.pp_data_path = '/datasets/datasets_ramien/lidc/data/{}'.format(self.pp_name) self.pp_test_data_path = self.pp_data_path self.select_prototype_subset = None ######################### # Data Loader # ######################### # select modalities from preprocessed data self.channels = [0] self.n_channels = len(self.channels) # patch_size to be used for training. pre_crop_size is the patch_size before data augmentation. self.pre_crop_size_2D = [300, 300] self.patch_size_2D = [288, 288] self.pre_crop_size_3D = [156, 156, 96] self.patch_size_3D = [128, 128, 64] self.patch_size = self.patch_size_2D if self.dim == 2 else self.patch_size_3D self.pre_crop_size = self.pre_crop_size_2D if self.dim == 2 else self.pre_crop_size_3D # ratio of free sampled batch elements before class balancing is triggered # (>0 to include "empty"/background patches.) self.batch_sample_slack = 0.2 # set 2D network to operate in 3D images. self.merge_2D_to_3D_preds = True # feed +/- n neighbouring slices into channel dimension. set to None for no context. self.n_3D_context = None if self.n_3D_context is not None and self.dim == 2: self.n_channels *= (self.n_3D_context * 2 + 1) ######################### # Architecture # ######################### self.start_filts = 48 if self.dim == 2 else 18 self.end_filts = self.start_filts * 4 if self.dim == 2 else self.start_filts * 2 self.res_architecture = 'resnet50' # 'resnet101' , 'resnet50' self.norm = None # one of None, 'instance_norm', 'batch_norm' self.weight_decay = 0 # one of 'xavier_uniform', 'xavier_normal', or 'kaiming_normal', None (=default = 'kaiming_uniform') self.weight_init = None ######################### # Schedule / Selection # ######################### self.num_epochs = 100 self.num_train_batches = 200 if self.dim == 2 else 200 self.batch_size = 20 if self.dim == 2 else 8 self.do_validation = True # decide whether to validate on entire patient volumes (like testing) or sampled patches (like training) # the former is morge accurate, while the latter is faster (depending on volume size) self.val_mode = 'val_sampling' # one of 'val_sampling' , 'val_patient' if self.val_mode == 'val_patient': self.max_val_patients = 50 # if 'None' iterates over entire val_set once. if self.val_mode == 'val_sampling': self.num_val_batches = 50 ######################### # Testing / Plotting # ######################### # set the top-n-epochs to be saved for temporal averaging in testing. self.save_n_models = 5 self.test_n_epochs = 5 # set a minimum epoch number for saving in case of instabilities in the first phase of training. self.min_save_thresh = 0 if self.dim == 2 else 0 self.report_score_level = ['patient', 'rois'] # choose list from 'patient', 'rois' self.class_dict = {1: 'benign', 2: 'malignant'} # 0 is background. self.patient_class_of_interest = 2 # patient metrics are only plotted for one class. self.ap_match_ious = [0.1] # list of ious to be evaluated for ap-scoring. self.model_selection_criteria = ['malignant_ap', 'benign_ap'] # criteria to average over for saving epochs. self.min_det_thresh = 0.1 # minimum confidence value to select predictions for evaluation. # threshold for clustering predictions together (wcs = weighted cluster scoring). # needs to be >= the expected overlap of predictions coming from one model (typically NMS threshold). # if too high, preds of the same object are separate clusters. self.wcs_iou = 1e-5 self.plot_prediction_histograms = True self.plot_stat_curves = False ######################### # Data Augmentation # ######################### self.da_kwargs={ 'do_elastic_deform': True, 'alpha':(0., 1500.), 'sigma':(30., 50.), 'do_rotation':True, 'angle_x': (0., 2 * np.pi), 'angle_y': (0., 0), 'angle_z': (0., 0), 'do_scale': True, 'scale':(0.8, 1.1), 'random_crop':False, 'rand_crop_dist': (self.patch_size[0] / 2. - 3, self.patch_size[1] / 2. - 3), 'border_mode_data': 'constant', 'border_cval_data': 0, 'order_data': 1 } if self.dim == 3: self.da_kwargs['do_elastic_deform'] = False self.da_kwargs['angle_x'] = (0, 0.0) self.da_kwargs['angle_y'] = (0, 0.0) #must be 0!! self.da_kwargs['angle_z'] = (0., 2 * np.pi) ######################### # Add model specifics # ######################### {'detection_unet': self.add_det_unet_configs, 'mrcnn': self.add_mrcnn_configs, 'ufrcnn': self.add_mrcnn_configs, 'retina_net': self.add_mrcnn_configs, 'retina_unet': self.add_mrcnn_configs, }[self.model]() def add_det_unet_configs(self): self.learning_rate = [1e-4] * self.num_epochs # aggregation from pixel perdiction to object scores (connected component). One of ['max', 'median'] self.aggregation_operation = 'max' # max number of roi candidates to identify per batch element and class. self.n_roi_candidates = 10 if self.dim == 2 else 30 # loss mode: either weighted cross entropy ('wce'), batch-wise dice loss ('dice), or the sum of both ('dice_wce') self.seg_loss_mode = 'dice_wce' # if <1, false positive predictions in foreground are penalized less. self.fp_dice_weight = 1 if self.dim == 2 else 1 self.wce_weights = [1, 1, 1] self.detection_min_confidence = self.min_det_thresh # if 'True', loss distinguishes all classes, else only foreground vs. background (class agnostic). self.class_specific_seg_flag = True self.num_seg_classes = 3 if self.class_specific_seg_flag else 2 self.head_classes = self.num_seg_classes def add_mrcnn_configs(self): # learning rate is a list with one entry per epoch. self.learning_rate = [1e-4] * self.num_epochs # disable the re-sampling of mask proposals to original size for speed-up. # since evaluation is detection-driven (box-matching) and not instance segmentation-driven (iou-matching), # mask-outputs are optional. self.return_masks_in_val = True self.return_masks_in_test = False # set number of proposal boxes to plot after each epoch. self.n_plot_rpn_props = 5 if self.dim == 2 else 30 # number of classes for head networks: n_foreground_classes + 1 (background) self.head_classes = 3 # seg_classes hier refers to the first stage classifier (RPN) self.num_seg_classes = 2 # foreground vs. background # feature map strides per pyramid level are inferred from architecture. self.backbone_strides = {'xy': [4, 8, 16, 32], 'z': [1, 2, 4, 8]} # anchor scales are chosen according to expected object sizes in data set. Default uses only one anchor scale # per pyramid level. (outer list are pyramid levels (corresponding to BACKBONE_STRIDES), inner list are scales per level.) self.rpn_anchor_scales = {'xy': [[8], [16], [32], [64]], 'z': [[2], [4], [8], [16]]} # choose which pyramid levels to extract features from: P2: 0, P3: 1, P4: 2, P5: 3. self.pyramid_levels = [0, 1, 2, 3] # number of feature maps in rpn. typically lowered in 3D to save gpu-memory. self.n_rpn_features = 512 if self.dim == 2 else 128 # anchor ratios and strides per position in feature maps. self.rpn_anchor_ratios = [0.5, 1, 2] self.rpn_anchor_stride = 1 # Threshold for first stage (RPN) non-maximum suppression (NMS): LOWER == HARDER SELECTION self.rpn_nms_threshold = 0.7 if self.dim == 2 else 0.7 # loss sampling settings. self.rpn_train_anchors_per_image = 6 #per batch element self.train_rois_per_image = 6 #per batch element self.roi_positive_ratio = 0.5 self.anchor_matching_iou = 0.7 # factor of top-k candidates to draw from per negative sample (stochastic-hard-example-mining). # poolsize to draw top-k candidates from will be shem_poolsize * n_negative_samples. self.shem_poolsize = 10 self.pool_size = (7, 7) if self.dim == 2 else (7, 7, 3) self.mask_pool_size = (14, 14) if self.dim == 2 else (14, 14, 5) self.mask_shape = (28, 28) if self.dim == 2 else (28, 28, 10) self.rpn_bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2]) self.bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2]) self.window = np.array([0, 0, self.patch_size[0], self.patch_size[1], 0, self.patch_size_3D[2]]) self.scale = np.array([self.patch_size[0], self.patch_size[1], self.patch_size[0], self.patch_size[1], self.patch_size_3D[2], self.patch_size_3D[2]]) if self.dim == 2: self.rpn_bbox_std_dev = self.rpn_bbox_std_dev[:4] self.bbox_std_dev = self.bbox_std_dev[:4] self.window = self.window[:4] self.scale = self.scale[:4] # pre-selection in proposal-layer (stage 1) for NMS-speedup. applied per batch element. self.pre_nms_limit = 3000 if self.dim == 2 else 6000 # n_proposals to be selected after NMS per batch element. too high numbers blow up memory if "detect_while_training" is True, # since proposals of the entire batch are forwarded through second stage in as one "batch". self.roi_chunk_size = 2500 if self.dim == 2 else 600 self.post_nms_rois_training = 500 if self.dim == 2 else 75 self.post_nms_rois_inference = 500 # Final selection of detections (refine_detections) self.model_max_instances_per_batch_element = 10 if self.dim == 2 else 30 # per batch element and class. self.detection_nms_threshold = 1e-5 # needs to be > 0, otherwise all predictions are one cluster. self.model_min_confidence = 0.1 if self.dim == 2: self.backbone_shapes = np.array( [[int(np.ceil(self.patch_size[0] / stride)), int(np.ceil(self.patch_size[1] / stride))] for stride in self.backbone_strides['xy']]) else: self.backbone_shapes = np.array( [[int(np.ceil(self.patch_size[0] / stride)), int(np.ceil(self.patch_size[1] / stride)), int(np.ceil(self.patch_size[2] / stride_z))] for stride, stride_z in zip(self.backbone_strides['xy'], self.backbone_strides['z'] )]) if self.model == 'ufrcnn': self.operate_stride1 = True self.class_specific_seg_flag = True self.num_seg_classes = 3 if self.class_specific_seg_flag else 2 self.frcnn_mode = True if self.model == 'retina_net' or self.model == 'retina_unet' or self.model == 'prob_detector': # implement extra anchor-scales according to retina-net publication. self.rpn_anchor_scales['xy'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in self.rpn_anchor_scales['xy']] self.rpn_anchor_scales['z'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in self.rpn_anchor_scales['z']] self.n_anchors_per_pos = len(self.rpn_anchor_ratios) * 3 self.n_rpn_features = 256 if self.dim == 2 else 64 # pre-selection of detections for NMS-speedup. per entire batch. self.pre_nms_limit = 10000 if self.dim == 2 else 50000 # anchor matching iou is lower than in Mask R-CNN according to https://arxiv.org/abs/1708.02002 self.anchor_matching_iou = 0.5 # if 'True', seg loss distinguishes all classes, else only foreground vs. background (class agnostic). self.num_seg_classes = 3 if self.class_specific_seg_flag else 2 if self.model == 'retina_unet': self.operate_stride1 = True diff --git a/experiments/lidc_exp/preprocessing.py b/experiments/lidc_exp/preprocessing.py index c12b9aa..975d474 100644 --- a/experiments/lidc_exp/preprocessing.py +++ b/experiments/lidc_exp/preprocessing.py @@ -1,146 +1,149 @@ #!/usr/bin/env python # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ). # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== ''' This preprocessing script loads nrrd files obtained by the data conversion tool: https://github.com/MIC-DKFZ/LIDC-IDRI-processing/tree/v1.0.1 After applying preprocessing, images are saved as numpy arrays and the meta information for the corresponding patient is stored as a line in the dataframe saved as info_df.pickle. ''' -import os +import os, sys +from pathlib import Path import SimpleITK as sitk import numpy as np from multiprocessing import Pool import pandas as pd import numpy.testing as npt from skimage.transform import resize import subprocess import pickle +PROJECT_ROOT = Path(__file__).absolute().parent.parent.parent +sys.path.append(str(PROJECT_ROOT)) import utils.exp_utils as utils def resample_array(src_imgs, src_spacing, target_spacing): src_spacing = np.round(src_spacing, 3) target_shape = [int(src_imgs.shape[ix] * src_spacing[::-1][ix] / target_spacing[::-1][ix]) for ix in range(len(src_imgs.shape))] for i in range(len(target_shape)): try: assert target_shape[i] > 0 except: raise AssertionError("AssertionError:", src_imgs.shape, src_spacing, target_spacing) img = src_imgs.astype(float) resampled_img = resize(img, target_shape, order=1, clip=True, mode='edge').astype('float32') return resampled_img def pp_patient(inputs): ix, path = inputs pid = path.split('/')[-1] img = sitk.ReadImage(os.path.join(path, '{}_ct_scan.nrrd'.format(pid))) img_arr = sitk.GetArrayFromImage(img) print('processing {}'.format(pid), img.GetSpacing(), img_arr.shape) img_arr = resample_array(img_arr, img.GetSpacing(), cf.target_spacing) img_arr = np.clip(img_arr, -1200, 600) #img_arr = (1200 + img_arr) / (600 + 1200) * 255 # a+x / (b-a) * (c-d) (c, d = new) img_arr = img_arr.astype(np.float32) img_arr = (img_arr - np.mean(img_arr)) / np.std(img_arr).astype(np.float16) df = pd.read_csv(os.path.join(cf.root_dir, 'characteristics.csv'), sep=';') df = df[df.PatientID == pid] final_rois = np.zeros_like(img_arr, dtype=np.uint8) mal_labels = [] roi_ids = set([ii.split('.')[0].split('_')[-1] for ii in os.listdir(path) if '.nii.gz' in ii]) rix = 1 for rid in roi_ids: roi_id_paths = [ii for ii in os.listdir(path) if '{}.nii'.format(rid) in ii] nodule_ids = [ii.split('_')[2].lstrip("0") for ii in roi_id_paths] rater_labels = [df[df.NoduleID == int(ii)].Malignancy.values[0] for ii in nodule_ids] rater_labels.extend([0] * (4-len(rater_labels))) mal_label = np.mean([ii for ii in rater_labels if ii > -1]) roi_rater_list = [] for rp in roi_id_paths: roi = sitk.ReadImage(os.path.join(cf.raw_data_dir, pid, rp)) roi_arr = sitk.GetArrayFromImage(roi).astype(np.uint8) roi_arr = resample_array(roi_arr, roi.GetSpacing(), cf.target_spacing) assert roi_arr.shape == img_arr.shape, [roi_arr.shape, img_arr.shape, pid, roi.GetSpacing()] for ix in range(len(img_arr.shape)): npt.assert_almost_equal(roi.GetSpacing()[ix], img.GetSpacing()[ix]) roi_rater_list.append(roi_arr) roi_rater_list.extend([np.zeros_like(roi_rater_list[-1])]*(4-len(roi_id_paths))) roi_raters = np.array(roi_rater_list) roi_raters = np.mean(roi_raters, axis=0) roi_raters[roi_raters < 0.5] = 0 if np.sum(roi_raters) > 0: mal_labels.append(mal_label) final_rois[roi_raters >= 0.5] = rix rix += 1 else: # indicate rois suppressed by majority voting of raters print('suppressed roi!', roi_id_paths) with open(os.path.join(cf.pp_dir, 'suppressed_rois.txt'), 'a') as handle: handle.write(" ".join(roi_id_paths)) fg_slices = [ii for ii in np.unique(np.argwhere(final_rois != 0)[:, 0])] mal_labels = np.array(mal_labels) assert len(mal_labels) + 1 == len(np.unique(final_rois)), [len(mal_labels), np.unique(final_rois), pid] np.save(os.path.join(cf.pp_dir, '{}_rois.npy'.format(pid)), final_rois) np.save(os.path.join(cf.pp_dir, '{}_img.npy'.format(pid)), img_arr) with open(os.path.join(cf.pp_dir, 'meta_info_{}.pickle'.format(pid)), 'wb') as handle: meta_info_dict = {'pid': pid, 'class_target': mal_labels, 'spacing': img.GetSpacing(), 'fg_slices': fg_slices} pickle.dump(meta_info_dict, handle) def aggregate_meta_info(exp_dir): files = [os.path.join(exp_dir, f) for f in os.listdir(exp_dir) if 'meta_info' in f] df = pd.DataFrame(columns=['pid', 'class_target', 'spacing', 'fg_slices']) for f in files: with open(f, 'rb') as handle: df.loc[len(df)] = pickle.load(handle) df.to_pickle(os.path.join(exp_dir, 'info_df.pickle')) print ("aggregated meta info to df with length", len(df)) if __name__ == "__main__": cf_file = utils.import_module("cf", "configs.py") cf = cf_file.configs() paths = [os.path.join(cf.raw_data_dir, ii) for ii in os.listdir(cf.raw_data_dir)] if not os.path.exists(cf.pp_dir): os.mkdir(cf.pp_dir) pool = Pool(processes=os.cpu_count()) p1 = pool.map(pp_patient, enumerate(paths)) pool.close() pool.join() # for i in enumerate(paths): # pp_patient(i) aggregate_meta_info(cf.pp_dir) - subprocess.call('cp {} {}'.format(os.path.join(cf.pp_dir, 'info_df.pickle'), os.path.join(cf.pp_dir, 'info_df_bk.pickle')), shell=True) \ No newline at end of file + subprocess.call('cp {} {}'.format(os.path.join(cf.pp_dir, 'info_df.pickle'), os.path.join(cf.pp_dir, 'info_df_bk.pickle')), shell=True)