diff --git a/custom_extensions/nms/nms.py b/custom_extensions/nms/nms.py
new file mode 100644
index 0000000..cdfb078
--- /dev/null
+++ b/custom_extensions/nms/nms.py
@@ -0,0 +1,75 @@
+"""
+adopted from pytorch framework, torchvision.ops.boxes
+
+"""
+
+import torch
+import nms_extension
+
+def nms(boxes, scores, iou_threshold):
+    """
+    Performs non-maximum suppression (NMS) on the boxes according
+    to their intersection-over-union (IoU).
+
+    NMS iteratively removes lower scoring boxes which have an
+    IoU greater than iou_threshold with another (higher scoring)
+    box.
+
+    Parameters
+    ----------
+    boxes : Tensor[N, 4] for 2D or Tensor[N,6] for 3D.
+        boxes to perform NMS on. They
+        are expected to be in (y1, x1, y2, x2(, z1, z2)) format
+    scores : Tensor[N]
+        scores for each one of the boxes
+    iou_threshold : float
+        discards all overlapping
+        boxes with IoU < iou_threshold
+
+    Returns
+    -------
+    keep : Tensor
+        int64 tensor with the indices
+        of the elements that have been kept
+        by NMS, sorted in decreasing order of scores
+    """
+    return nms_extension.nms(boxes, scores, iou_threshold)
+
+
+def batched_nms(boxes, scores, idxs, iou_threshold):
+    """
+    Performs non-maximum suppression in a batched fashion.
+
+    Each index value correspond to a category, and NMS
+    will not be applied between elements of different categories.
+
+    Parameters
+    ----------
+    boxes : Tensor[N, 4] for 2D or Tensor[N,6] for 3D.
+        boxes to perform NMS on. They
+        are expected to be in (y1, x1, y2, x2(, z1, z2)) format
+    scores : Tensor[N]
+        scores for each one of the boxes
+    idxs : Tensor[N]
+        indices of the categories for each one of the boxes.
+    iou_threshold : float
+        discards all overlapping boxes
+        with IoU < iou_threshold
+
+    Returns
+    -------
+    keep : Tensor
+        int64 tensor with the indices of
+        the elements that have been kept by NMS, sorted
+        in decreasing order of scores
+    """
+    if boxes.numel() == 0:
+        return torch.empty((0,), dtype=torch.int64, device=boxes.device)
+    # strategy: in order to perform NMS independently per class.
+    # we add an offset to all the boxes. The offset is dependent
+    # only on the class idx, and is large enough so that boxes
+    # from different classes do not overlap
+    max_coordinate = boxes.max()
+    offsets = idxs.to(boxes) * (max_coordinate + 1)
+    boxes_for_nms = boxes + offsets[:, None]
+    return  nms(boxes_for_nms, scores, iou_threshold)
diff --git a/custom_extensions/nms/setup.py b/custom_extensions/nms/setup.py
new file mode 100644
index 0000000..90a5d13
--- /dev/null
+++ b/custom_extensions/nms/setup.py
@@ -0,0 +1,22 @@
+"""
+Created at 07.11.19 19:12
+@author: gregor
+
+"""
+
+import os, sys, site
+from pathlib import Path
+
+# recognise newly installed packages in path
+site.main()
+
+from setuptools import setup
+from torch.utils import cpp_extension
+
+dir_ = Path(os.path.dirname(sys.argv[0]))
+
+setup(name='nms_extension',
+      ext_modules=[cpp_extension.CUDAExtension('nms_extension', [str(dir_/'src/nms_interface.cpp'), str(dir_/'src/nms.cu')])],
+      cmdclass={'build_ext': cpp_extension.BuildExtension}
+      )
+
diff --git a/custom_extensions/nms/src/cuda_helpers.h b/custom_extensions/nms/src/cuda_helpers.h
new file mode 100644
index 0000000..af32f60
--- /dev/null
+++ b/custom_extensions/nms/src/cuda_helpers.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                                \
+  for (int i = (blockIdx.x * blockDim.x) + threadIdx.x; i < (n); \
+       i += (blockDim.x * gridDim.x))
diff --git a/custom_extensions/nms/src/nms.cu b/custom_extensions/nms/src/nms.cu
new file mode 100644
index 0000000..39c2873
--- /dev/null
+++ b/custom_extensions/nms/src/nms.cu
@@ -0,0 +1,220 @@
+/*
+NMS implementation in CUDA from pytorch framework
+(https://github.com/pytorch/vision/tree/master/torchvision/csrc/cuda on Nov 13 2019)
+
+Adapted for additional 3D capability by G. Ramien, DKFZ Heidelberg
+*/
+
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+
+#include "cuda_helpers.h"
+
+#include <iostream>
+#include <vector>
+
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+
+template <typename T>
+__device__ inline float devIoU(T const* const a, T const* const b) {
+  // a, b hold box coords as (y1, x1, y2, x2) with y1 < y2 etc.
+  T bottom = max(a[0], b[0]), top = min(a[2], b[2]);
+  T left = max(a[1], b[1]), right = min(a[3], b[3]);
+  T width = max(right - left, (T)0), height = max(top - bottom, (T)0);
+  T interS = width * height;
+
+  T Sa = (a[2] - a[0]) * (a[3] - a[1]);
+  T Sb = (b[2] - b[0]) * (b[3] - b[1]);
+
+  return interS / (Sa + Sb - interS);
+}
+
+template <typename T>
+__device__ inline float devIoU_3d(T const* const a, T const* const b) {
+  // a, b hold box coords as (y1, x1, y2, x2, z1, z2) with y1 < y2 etc.
+  // get coordinates of intersection, calc intersection
+  T bottom = max(a[0], b[0]), top = min(a[2], b[2]);
+  T left = max(a[1], b[1]), right = min(a[3], b[3]);
+  T front = max(a[4], b[4]), back = min(a[5], b[5]);
+  T width = max(right - left, (T)0), height = max(top - bottom, (T)0);
+  T depth = max(back - front, (T)0);
+  T interS = width * height * depth;
+  // calc separate boxes volumes
+  T Sa = (a[2] - a[0]) * (a[3] - a[1]) * (a[5] - a[4]);
+  T Sb = (b[2] - b[0]) * (b[3] - b[1]) * (b[5] - b[4]);
+
+  return interS / (Sa + Sb - interS);
+}
+
+
+template <typename T>
+__global__ void nms_kernel(const int n_boxes, const float iou_threshold, const T* dev_boxes,
+    unsigned long long* dev_mask) {
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  // if (row_start > col_start) return;
+  const int row_size =
+      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+  const int col_size =
+      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+  __shared__ T block_boxes[threadsPerBlock * 4];
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * 4 + 0] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 0];
+    block_boxes[threadIdx.x * 4 + 1] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 1];
+    block_boxes[threadIdx.x * 4 + 2] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 2];
+    block_boxes[threadIdx.x * 4 + 3] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 3];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+    const T* cur_box = dev_boxes + cur_box_idx * 4;
+    int i = 0;
+    unsigned long long t = 0;
+    int start = 0;
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      if (devIoU<T>(cur_box, block_boxes + i * 4) > iou_threshold) {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+
+template <typename T>
+__global__ void nms_kernel_3d(const int n_boxes, const float iou_threshold, const T* dev_boxes,
+    unsigned long long* dev_mask) {
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  // if (row_start > col_start) return;
+  const int row_size =
+      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+  const int col_size =
+      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+  __shared__ T block_boxes[threadsPerBlock * 6];
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * 6 + 0] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0];
+    block_boxes[threadIdx.x * 6 + 1] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1];
+    block_boxes[threadIdx.x * 6 + 2] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2];
+    block_boxes[threadIdx.x * 6 + 3] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3];
+    block_boxes[threadIdx.x * 6 + 4] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4];
+    block_boxes[threadIdx.x * 6 + 5] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+    const T* cur_box = dev_boxes + cur_box_idx * 6;
+    int i = 0;
+    unsigned long long t = 0;
+    int start = 0;
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      if (devIoU_3d<T>(cur_box, block_boxes + i * 6) > iou_threshold) {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+
+at::Tensor nms_cuda(const at::Tensor& dets, const at::Tensor& scores, float iou_threshold) {
+  /* dets expected as (n_dets, dim) where dim=4 in 2D, dim=6 in 3D */
+  AT_ASSERTM(dets.type().is_cuda(), "dets must be a CUDA tensor");
+  AT_ASSERTM(scores.type().is_cuda(), "scores must be a CUDA tensor");
+  at::cuda::CUDAGuard device_guard(dets.device());
+
+  bool is_3d = dets.size(1) == 6;
+  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+  auto dets_sorted = dets.index_select(0, order_t);
+
+  int dets_num = dets.size(0);
+
+  const int col_blocks = at::cuda::ATenCeilDiv(dets_num, threadsPerBlock);
+
+  at::Tensor mask =
+      at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));
+
+  dim3 blocks(col_blocks, col_blocks);
+  dim3 threads(threadsPerBlock);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+
+  if (is_3d) {
+  //std::cout << "performing NMS on 3D boxes in CUDA" << std::endl;
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      dets_sorted.type(), "nms_kernel_cuda", [&] {
+        nms_kernel_3d<scalar_t><<<blocks, threads, 0, stream>>>(
+            dets_num,
+            iou_threshold,
+            dets_sorted.data_ptr<scalar_t>(),
+            (unsigned long long*)mask.data_ptr<int64_t>());
+      });
+   }
+   else {
+   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      dets_sorted.type(), "nms_kernel_cuda", [&] {
+        nms_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+            dets_num,
+            iou_threshold,
+            dets_sorted.data_ptr<scalar_t>(),
+            (unsigned long long*)mask.data_ptr<int64_t>());
+      });
+
+   }
+
+  at::Tensor mask_cpu = mask.to(at::kCPU);
+  unsigned long long* mask_host = (unsigned long long*)mask_cpu.data_ptr<int64_t>();
+
+  std::vector<unsigned long long> remv(col_blocks);
+  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
+
+  at::Tensor keep =
+      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
+  int64_t* keep_out = keep.data_ptr<int64_t>();
+
+  int num_to_keep = 0;
+  for (int i = 0; i < dets_num; i++) {
+    int nblock = i / threadsPerBlock;
+    int inblock = i % threadsPerBlock;
+
+    if (!(remv[nblock] & (1ULL << inblock))) {
+      keep_out[num_to_keep++] = i;
+      unsigned long long* p = mask_host + i * col_blocks;
+      for (int j = nblock; j < col_blocks; j++) {
+        remv[j] |= p[j];
+      }
+    }
+  }
+
+  AT_CUDA_CHECK(cudaGetLastError());
+  return order_t.index(
+      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
+           .to(order_t.device(), keep.scalar_type())});
+}
\ No newline at end of file
diff --git a/custom_extensions/nms/src/nms_interface.cpp b/custom_extensions/nms/src/nms_interface.cpp
new file mode 100644
index 0000000..e04f65d
--- /dev/null
+++ b/custom_extensions/nms/src/nms_interface.cpp
@@ -0,0 +1,32 @@
+/*  adopted from
+    https://github.com/pytorch/vision/blob/master/torchvision/csrc/nms.h on Nov 15 2019
+    no cpu support, but could be added with this interface.
+*/
+#include <torch/extension.h>
+
+
+//#include "cpu/vision_cpu.h"
+
+at::Tensor nms_cuda(const at::Tensor& dets, const at::Tensor& scores, float iou_threshold);
+
+at::Tensor nms(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const double iou_threshold) {
+  if (dets.device().is_cuda()) {
+
+    if (dets.numel() == 0) {
+      //at::cuda::CUDAGuard device_guard(dets.device());
+      return at::empty({0}, dets.options().dtype(at::kLong));
+    }
+    return nms_cuda(dets, scores, iou_threshold);
+
+  }
+  AT_ERROR("Not compiled with CPU support");
+  //at::Tensor result = nms_cpu(dets, scores, iou_threshold);
+  //return result;
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("nms", &nms, "NMS C++ and/or CUDA");
+}
\ No newline at end of file
diff --git a/custom_extensions/roi_align/roi_align.py b/custom_extensions/roi_align/roi_align.py
new file mode 100644
index 0000000..817e871
--- /dev/null
+++ b/custom_extensions/roi_align/roi_align.py
@@ -0,0 +1,128 @@
+"""
+ROIAlign implementation from pytorch framework
+(https://github.com/pytorch/vision/blob/master/torchvision/ops/roi_align.py on Nov 14 2019)
+
+adapted for 3D support without additional python function interface (only cpp function interface).
+"""
+
+import torch
+from torch import nn
+
+from torchvision.ops._utils import convert_boxes_to_roi_format
+
+import roi_al_extension
+import roi_al_extension_3d
+
+def roi_align_2d(input: torch.Tensor, boxes, output_size,
+                 spatial_scale: float = 1.0, sampling_ratio: int =-1) -> torch.Tensor:
+    """
+    Performs Region of Interest (RoI) Align operator described in Mask R-CNN
+
+    Arguments:
+        input: (Tensor[N, C, H, W]), input tensor
+        boxes: (Tensor[K, 5] or List[Tensor[L, 4]]), the box coordinates in (x1, y1, x2, y2)
+            format where the regions will be taken from. If a single Tensor is passed,
+            then the first column should contain the batch index. If a list of Tensors
+            is passed, then each Tensor will correspond to the boxes for an element i
+            in a batch
+        output_size: (int or Tuple[int, int]) the size of the output after the cropping
+            is performed, as (height, width)
+        spatial_scale: (float) a scaling factor that maps the input coordinates to
+            the box coordinates. Default: 1.0
+        sampling_ratio: (int) number of sampling points in the interpolation grid
+            used to compute the output value of each pooled output bin. If > 0,
+            then exactly sampling_ratio x sampling_ratio grid points are used. If
+            <= 0, then an adaptive number of grid points are used (computed as
+            ceil(roi_width / pooled_w), and likewise for height). Default: -1
+
+    Returns:
+        output (Tensor[K, C, output_size[0], output_size[1]])
+    """
+    rois = boxes
+    if not isinstance(rois, torch.Tensor):
+        rois = convert_boxes_to_roi_format(rois)
+    return roi_al_extension.roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio)
+
+
+def roi_align_3d(input: torch.Tensor, boxes, output_size,
+                 spatial_scale: float = 1.0, sampling_ratio: int = -1) -> torch.Tensor:
+    """
+    Performs Region of Interest (RoI) Align operator described in Mask R-CNN for 3-dim input.
+
+    Arguments:
+        input (Tensor[N, C, H, W, D]): input tensor
+        boxes (Tensor[K, 7] or List[Tensor[L, 6]]): the box coordinates in (x1, y1, x2, y2, z1 ,z2)
+            format where the regions will be taken from. If a single Tensor is passed,
+            then the first column should contain the batch index. If a list of Tensors
+            is passed, then each Tensor will correspond to the boxes for an element i
+            in a batch
+        output_size (int or Tuple[int, int, int]): the size of the output after the cropping
+            is performed, as (height, width, depth)
+        spatial_scale (float): a scaling factor that maps the input coordinates to
+            the box coordinates. Default: 1.0
+        sampling_ratio (int): number of sampling points in the interpolation grid
+            used to compute the output value of each pooled output bin. If > 0,
+            then exactly sampling_ratio x sampling_ratio grid points are used. If
+            <= 0, then an adaptive number of grid points are used (computed as
+            ceil(roi_width / pooled_w), and likewise for height). Default: -1
+
+    Returns:
+        output (Tensor[K, C, output_size[0], output_size[1], output_size[2]])
+    """
+    rois = boxes
+    if not isinstance(rois, torch.Tensor):
+        rois = convert_boxes_to_roi_format(rois)
+    return roi_al_extension_3d.roi_align(input, rois, spatial_scale, output_size[0], output_size[1], output_size[2],
+                                         sampling_ratio)
+
+
+class RoIAlign(nn.Module):
+    """
+    Performs Region of Interest (RoI) Align operator described in Mask R-CNN for 2- or 3-dim input.
+
+    Arguments:
+        input (Tensor[N, C, H, W(, D)]): input tensor
+        boxes (Tensor[K, 5] or List[Tensor[L, 4]]) or (Tensor[K, 7] or List[Tensor[L, 6]]):
+            the box coordinates in (x1, y1, x2, y2(, z1 ,z2))
+            format where the regions will be taken from. If a single Tensor is passed,
+            then the first column should contain the batch index. If a list of Tensors
+            is passed, then each Tensor will correspond to the boxes for an element i
+            in a batch
+        output_size (int or Tuple[int, int(, int)]): the size of the output after the cropping
+            is performed, as (height, width(, depth))
+        spatial_scale (float): a scaling factor that maps the input coordinates to
+            the box coordinates. Default: 1.0
+        sampling_ratio (int): number of sampling points in the interpolation grid
+            used to compute the output value of each pooled output bin. If > 0,
+            then exactly sampling_ratio x sampling_ratio grid points are used. If
+            <= 0, then an adaptive number of grid points are used (computed as
+            ceil(roi_width / pooled_w), and likewise for height (and depth)). Default: -1
+
+    Returns:
+        output (Tensor[K, C, output_size[0], output_size[1](, output_size[2])])
+    """
+    def __init__(self, output_size, spatial_scale, sampling_ratio):
+        super(RoIAlign, self).__init__()
+        self.output_size = output_size
+        self.spatial_scale = spatial_scale
+        self.sampling_ratio = sampling_ratio
+        self.dim = len(self.output_size)
+
+        if self.dim == 2:
+            self.roi_align = roi_align_2d
+        elif self.dim == 3:
+            self.roi_align = roi_align_3d
+        else:
+            raise Exception("Tried to init RoIAlign module with incorrect output size: {}".format(self.output_size))
+
+    def forward(self, input: torch.Tensor, rois) -> torch.Tensor:
+        return self.roi_align(input, rois, self.output_size, self.spatial_scale, self.sampling_ratio)
+
+    def __repr__(self):
+        tmpstr = self.__class__.__name__ + '('
+        tmpstr += 'output_size=' + str(self.output_size)
+        tmpstr += ', spatial_scale=' + str(self.spatial_scale)
+        tmpstr += ', sampling_ratio=' + str(self.sampling_ratio)
+        tmpstr += ', dimension=' + str(self.dim)
+        tmpstr += ')'
+        return tmpstr
diff --git a/custom_extensions/roi_align/setup.py b/custom_extensions/roi_align/setup.py
new file mode 100644
index 0000000..ebe50f4
--- /dev/null
+++ b/custom_extensions/roi_align/setup.py
@@ -0,0 +1,28 @@
+"""
+Created at 07.11.19 19:12
+@author: gregor
+
+"""
+
+import os, sys, site
+from pathlib import Path
+
+# recognise newly installed packages in path
+site.main()
+
+from setuptools import setup
+from torch.utils import cpp_extension
+
+dir_ = Path(os.path.dirname(sys.argv[0]))
+
+setup(name='RoIAlign extension 2D',
+      ext_modules=[cpp_extension.CUDAExtension('roi_al_extension', [str(dir_/'src/RoIAlign_interface.cpp'),
+                                                                    str(dir_/'src/RoIAlign_cuda.cu')])],
+      cmdclass={'build_ext': cpp_extension.BuildExtension}
+      )
+
+setup(name='RoIAlign extension 3D',
+      ext_modules=[cpp_extension.CUDAExtension('roi_al_extension_3d', [str(dir_/'src/RoIAlign_interface_3d.cpp'),
+                                                                       str(dir_/'src/RoIAlign_cuda_3d.cu')])],
+      cmdclass={'build_ext': cpp_extension.BuildExtension}
+      )
\ No newline at end of file
diff --git a/custom_extensions/roi_align/src/RoIAlign_cuda.cu b/custom_extensions/roi_align/src/RoIAlign_cuda.cu
new file mode 100644
index 0000000..b31ba44
--- /dev/null
+++ b/custom_extensions/roi_align/src/RoIAlign_cuda.cu
@@ -0,0 +1,429 @@
+/*
+ROIAlign implementation in CUDA from pytorch framework
+(https://github.com/pytorch/vision/tree/master/torchvision/csrc/cuda on Nov 14 2019)
+
+*/
+
+#include <ATen/ATen.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+
+#include "cuda_helpers.h"
+
+template <typename T>
+__device__ T bilinear_interpolate(
+    const T* input,
+    const int height,
+    const int width,
+    T y,
+    T x,
+    const int index /* index for debug only*/) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    return 0;
+  }
+
+  if (y <= 0)
+    y = 0;
+  if (x <= 0)
+    x = 0;
+
+  int y_low = (int)y;
+  int x_low = (int)x;
+  int y_high;
+  int x_high;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  // do bilinear interpolation
+  T v1 = input[y_low * width + x_low];
+  T v2 = input[y_low * width + x_high];
+  T v3 = input[y_high * width + x_low];
+  T v4 = input[y_high * width + x_high];
+  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  return val;
+}
+
+template <typename T>
+__global__ void RoIAlignForward(
+    const int nthreads,
+    const T* input,
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    const T* rois,
+    T* output) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_rois = rois + n * 5;
+    int roi_batch_ind = offset_rois[0];
+
+    // Do not using rounding; this implementation detail is critical
+    T roi_start_w = offset_rois[1] * spatial_scale;
+    T roi_start_h = offset_rois[2] * spatial_scale;
+    T roi_end_w = offset_rois[3] * spatial_scale;
+    T roi_end_h = offset_rois[4] * spatial_scale;
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+    //printf("roi height %f, width %f\n", (float) roi_height, (float) roi_width);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    const T* offset_input =
+        input + (roi_batch_ind * channels + c) * height * width;
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    T output_val = 0.;
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        T val = bilinear_interpolate(offset_input, height, width, y, x, index);
+        output_val += val;
+      }
+    }
+    output_val /= count;
+
+    output[index] = output_val;
+  }
+}
+
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+    const int height,
+    const int width,
+    T y,
+    T x,
+    T& w1,
+    T& w2,
+    T& w3,
+    T& w4,
+    int& x_low,
+    int& x_high,
+    int& y_low,
+    int& y_high,
+    const int index /* index for debug only*/) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  if (y <= 0)
+    y = 0;
+  if (x <= 0)
+    x = 0;
+
+  y_low = (int)y;
+  x_low = (int)x;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  // reference in forward
+  // T v1 = input[y_low * width + x_low];
+  // T v2 = input[y_low * width + x_high];
+  // T v3 = input[y_high * width + x_low];
+  // T v4 = input[y_high * width + x_high];
+  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  return;
+}
+
+template <typename T>
+__global__ void RoIAlignBackward(
+    const int nthreads,
+    const T* grad_output,
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    T* grad_input,
+    const T* rois,
+    const int n_stride,
+    const int c_stride,
+    const int h_stride,
+    const int w_stride)
+{
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_rois = rois + n * 5;
+    int roi_batch_ind = offset_rois[0];
+
+    // Do not using rounding; this implementation detail is critical
+    T roi_start_w = offset_rois[1] * spatial_scale;
+    T roi_start_h = offset_rois[2] * spatial_scale;
+    T roi_end_w = offset_rois[3] * spatial_scale;
+    T roi_end_h = offset_rois[4] * spatial_scale;
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    T* offset_grad_input =
+        grad_input + ((roi_batch_ind * channels + c) * height * width);
+
+    // We need to index the gradient using the tensor strides to access the
+    // correct values.
+    int output_offset = n * n_stride + c * c_stride;
+    const T* offset_grad_output = grad_output + output_offset;
+    const T grad_output_this_bin =
+        offset_grad_output[ph * h_stride + pw * w_stride];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(
+            height,
+            width,
+            y,
+            x,
+            w1,
+            w2,
+            w3,
+            w4,
+            x_low,
+            x_high,
+            y_low,
+            y_high,
+            index);
+
+        T g1 = grad_output_this_bin * w1 / count;
+        T g2 = grad_output_this_bin * w2 / count;
+        T g3 = grad_output_this_bin * w3 / count;
+        T g4 = grad_output_this_bin * w4 / count;
+
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          atomicAdd(
+              offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
+          atomicAdd(
+              offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
+          atomicAdd(
+              offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
+          atomicAdd(
+              offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
+        } // if
+      } // ix
+    } // iy
+  } // CUDA_1D_KERNEL_LOOP
+} // RoIAlignBackward
+
+at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale,
+                                const int pooled_height, const int pooled_width, const int sampling_ratio) {
+  /*
+   input: feature-map tensor, shape (batch, n_channels, y, x(, z))
+   */
+  AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+  at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlign_forward_cuda";
+  at::checkAllSameGPU(c, {input_t, rois_t});
+  at::checkAllSameType(c, {input_t, rois_t});
+
+  at::cuda::CUDAGuard device_guard(input.device());
+
+  auto num_rois = rois.size(0);
+  auto channels = input.size(1);
+  auto height = input.size(2);
+  auto width = input.size(3);
+
+  at::Tensor output = at::zeros(
+      {num_rois, channels, pooled_height, pooled_width}, input.options());
+
+  auto output_size = num_rois * pooled_height * pooled_width * channels;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(
+      at::cuda::ATenCeilDiv(
+          static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
+      static_cast<int64_t>(4096)));
+  dim3 block(512);
+
+  if (output.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return output;
+  }
+
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "ROIAlign_forward", [&] {
+    RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
+        output_size,
+        input.contiguous().data_ptr<scalar_t>(),
+        spatial_scale,
+        channels,
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        sampling_ratio,
+        rois.contiguous().data_ptr<scalar_t>(),
+        output.data_ptr<scalar_t>());
+  });
+  AT_CUDA_CHECK(cudaGetLastError());
+  return output;
+}
+
+at::Tensor ROIAlign_backward_cuda(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio) {
+  AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+  at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlign_backward_cuda";
+  at::checkAllSameGPU(c, {grad_t, rois_t});
+  at::checkAllSameType(c, {grad_t, rois_t});
+
+  at::cuda::CUDAGuard device_guard(grad.device());
+
+  at::Tensor grad_input =
+      at::zeros({batch_size, channels, height, width}, grad.options());
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(
+      at::cuda::ATenCeilDiv(
+          static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
+      static_cast<int64_t>(4096)));
+  dim3 block(512);
+
+  // handle possibly empty gradients
+  if (grad.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return grad_input;
+  }
+
+  int n_stride = grad.stride(0);
+  int c_stride = grad.stride(1);
+  int h_stride = grad.stride(2);
+  int w_stride = grad.stride(3);
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.type(), "ROIAlign_backward", [&] {
+    RoIAlignBackward<scalar_t><<<grid, block, 0, stream>>>(
+        grad.numel(),
+        grad.data_ptr<scalar_t>(),
+        spatial_scale,
+        channels,
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        sampling_ratio,
+        grad_input.data_ptr<scalar_t>(),
+        rois.contiguous().data_ptr<scalar_t>(),
+        n_stride,
+        c_stride,
+        h_stride,
+        w_stride);
+  });
+  AT_CUDA_CHECK(cudaGetLastError());
+  return grad_input;
+}
\ No newline at end of file
diff --git a/custom_extensions/roi_align/src/RoIAlign_cuda_3d.cu b/custom_extensions/roi_align/src/RoIAlign_cuda_3d.cu
new file mode 100644
index 0000000..a8022bb
--- /dev/null
+++ b/custom_extensions/roi_align/src/RoIAlign_cuda_3d.cu
@@ -0,0 +1,488 @@
+/*
+ROIAlign implementation in CUDA from pytorch framework
+(https://github.com/pytorch/vision/tree/master/torchvision/csrc/cuda on Nov 14 2019)
+
+Adapted for additional 3D capability by G. Ramien, DKFZ Heidelberg
+*/
+
+#include <ATen/ATen.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+#include <cstdio>
+#include "cuda_helpers.h"
+
+/*-------------- gpu kernels -----------------*/
+
+template <typename T>
+__device__ T linear_interpolate(const T xl, const T val_low, const T val_high){
+
+  T val = (val_high - val_low) * xl + val_low;
+  return val;
+}
+
+template <typename T>
+__device__ T trilinear_interpolate(const T* input, const int height, const int width, const int depth,
+                T y, T x, T z, const int index /* index for debug only*/) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width || z < -1.0 || z > depth) {
+    // empty
+    return 0;
+  }
+  if (y <= 0)
+    y = 0;
+  if (x <= 0)
+    x = 0;
+  if (z <= 0)
+    z = 0;
+
+  int y0 = (int)y;
+  int x0 = (int)x;
+  int z0 = (int)z;
+  int y1;
+  int x1;
+  int z1;
+
+  if (y0 >= height - 1) {
+  /*if nearest gridpoint to y on the lower end is on border or border-1, set low, high, mid(=actual point) to border-1*/
+    y1 = y0 = height - 1;
+    y = (T)y0;
+  } else {
+    /* y1 is one pixel from y0, y is the actual point somewhere in between  */
+    y1 = y0 + 1;
+  }
+  if (x0 >= width - 1) {
+    x1 = x0 = width - 1;
+    x = (T)x0;
+  } else {
+    x1 = x0 + 1;
+  }
+  if (z0 >= depth - 1) {
+    z1 = z0 = depth - 1;
+    z = (T)z0;
+  } else {
+    z1 = z0 + 1;
+  }
+
+
+  // do linear interpolation of x values
+  // distance of actual point to lower boundary point, already normalized since x_high - x0 = 1
+  T dis = x - x0;
+  /*  accessing element b,c,y,x,z in 1D-rolled-out array of a tensor with dimensions (B, C, Y, X, Z):
+      tensor[b,c,y,x,z] = arr[ (((b*C+c)*Y+y)*X + x)*Z + z ] = arr[ alpha + (y*X + x)*Z + z ]
+      with alpha = batch&channel locator = (b*C+c)*YXZ.
+      hence, as current input pointer is already offset by alpha: y,x,z at input[( y*X + x)*Z + z], where
+      X = width, Z = depth.
+  */
+  T x00 = linear_interpolate(dis, input[(y0*width+ x0)*depth+z0], input[(y0*width+ x1)*depth+z0]);
+  T x10 = linear_interpolate(dis, input[(y1*width+ x0)*depth+z0], input[(y1*width+ x1)*depth+z0]);
+  T x01 = linear_interpolate(dis, input[(y0*width+ x0)*depth+z1], input[(y0*width+ x1)*depth+z1]);
+  T x11 = linear_interpolate(dis, input[(y1*width+ x0)*depth+z1], input[(y1*width+ x1)*depth+z1]);
+
+  // linear interpol of y values = bilinear interpol of f(x,y)
+  dis = y - y0;
+  T xy0 = linear_interpolate(dis, x00, x10);
+  T xy1 = linear_interpolate(dis, x01, x11);
+
+  // linear interpol of z value = trilinear interpol of f(x,y,z)
+  dis = z - z0;
+  T xyz = linear_interpolate(dis, xy0, xy1);
+
+  return xyz;
+}
+
+template <typename T>
+__device__ void trilinear_interpolate_gradient(const int height, const int width, const int depth, T y, T x, T z,
+    T& g000, T& g001, T& g010, T& g100, T& g011, T& g101, T& g110, T& g111,
+    int& x0, int& x1, int& y0, int& y1, int& z0, int&z1, const int index /* index for debug only*/)
+{
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width || z < -1.0 || z > depth) {
+    // empty
+    g000 = g001 = g010 = g100 = g011 = g101 = g110 = g111 = 0.;
+    x0 = x1 = y0 = y1 = z0 = z1 = -1;
+    return;
+  }
+
+  if (y <= 0)
+    y = 0;
+  if (x <= 0)
+    x = 0;
+  if (z <= 0)
+    z = 0;
+
+  y0 = (int)y;
+  x0 = (int)x;
+  z0 = (int)z;
+
+  if (y0 >= height - 1) {
+    y1 = y0 = height - 1;
+    y = (T)y0;
+  } else {
+    y1 = y0 + 1;
+  }
+
+  if (x0 >= width - 1) {
+    x1 = x0 = width - 1;
+    x = (T)x0;
+  } else {
+    x1 = x0 + 1;
+  }
+
+  if (z0 >= depth - 1) {
+    z1 = z0 = depth - 1;
+    z = (T)z0;
+  } else {
+    z1 = z0 + 1;
+  }
+
+  // forward calculations are added as hints
+  T dis_x = x - x0;
+  //T x00 = linear_interpolate(dis, input[(y0*width+ x0)*depth+z0], input[(y0*width+ x1)*depth+z0]); // v000, v100
+  //T x10 = linear_interpolate(dis, input[(y1*width+ x0)*depth+z0], input[(y1*width+ x1)*depth+z0]); // v010, v110
+  //T x01 = linear_interpolate(dis, input[(y0*width+ x0)*depth+z1], input[(y0*width+ x1)*depth+z1]); // v001, v101
+  //T x11 = linear_interpolate(dis, input[(y1*width+ x0)*depth+z1], input[(y1*width+ x1)*depth+z1]); // v011, v111
+
+  // linear interpol of y values = bilinear interpol of f(x,y)
+  T dis_y = y - y0;
+  //T xy0 = linear_interpolate(dis, x00, x10);
+  //T xy1 = linear_interpolate(dis, x01, x11);
+
+  // linear interpol of z value = trilinear interpol of f(x,y,z)
+  T dis_z = z - z0;
+  //T xyz = linear_interpolate(dis, xy0, xy1);
+
+  /* need: grad_i := d(xyz)/d(v_i) with v_i = input_value_i  for all i = 0,..,7 (eight input values --> eight-entry gradient)
+     d(lin_interp(dis,x,y))/dx = (-dis +1) and d(lin_interp(dis,x,y))/dy = dis --> derivatives are indep of x,y.
+     notation: gxyz = gradient for d(trilin_interp)/d(input_value_at_xyz)
+     below grads were calculated by hand
+     save time by reusing (1-dis_x) = 1-x+x0 = x1-x =: dis_x1 */
+  T dis_x1 = (1-dis_x), dis_y1 = (1-dis_y), dis_z1 = (1-dis_z);
+
+  g000 = dis_z1 * dis_y1  * dis_x1;
+  g001 = dis_z  * dis_y1  * dis_x1;
+  g010 = dis_z1 * dis_y   * dis_x1;
+  g100 = dis_z1 * dis_y1  * dis_x;
+  g011 = dis_z  * dis_y   * dis_x1;
+  g101 = dis_z  * dis_y1  * dis_x;
+  g110 = dis_z1 * dis_y   * dis_x;
+  g111 = dis_z  * dis_y   * dis_x;
+
+  return;
+}
+
+template <typename T>
+__global__ void RoIAlignForward(const int nthreads, const T* input, const T spatial_scale, const int channels,
+    const int height, const int width, const int depth, const int pooled_height, const int pooled_width,
+    const int pooled_depth, const int sampling_ratio, const T* rois, T* output)
+{
+
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw, pd) is an element in the pooled output
+    int pd =  index % pooled_depth;
+    int pw = (index / pooled_depth) % pooled_width;
+    int ph = (index / pooled_depth / pooled_width) % pooled_height;
+    int c  = (index / pooled_depth / pooled_width / pooled_height) % channels;
+    int n  =  index / pooled_depth / pooled_width / pooled_height / channels;
+
+
+    // rois are (y1,x1,y2,x2,z1,z2) --> tensor of shape (n_rois, 6)
+    const T* offset_rois = rois + n * 7;
+    int roi_batch_ind = offset_rois[0];
+    // Do not use rounding; this implementation detail is critical
+    T roi_start_h = offset_rois[1] * spatial_scale;
+    T roi_start_w = offset_rois[2] * spatial_scale;
+    T roi_end_h = offset_rois[3] * spatial_scale;
+    T roi_end_w = offset_rois[4] * spatial_scale;
+    T roi_start_d = offset_rois[5] * spatial_scale;
+    T roi_end_d = offset_rois[6] * spatial_scale;
+
+    // Force malformed ROIs to be 1x1
+    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+    T roi_depth = max(roi_end_d - roi_start_d, (T)1.);
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+    T bin_size_d = static_cast<T>(roi_depth) / static_cast<T>(pooled_depth);
+
+    const T* offset_input =
+        input + (roi_batch_ind * channels + c) * height * width * depth;
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    // roi_bin_grid == nr of sampling points per bin >= 1
+    int roi_bin_grid_h =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+    int roi_bin_grid_d =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_depth / pooled_depth);
+
+    // We do average (integral) pooling inside a bin
+    const T n_voxels = roi_bin_grid_h * roi_bin_grid_w * roi_bin_grid_d; // e.g. = 4
+
+    T output_val = 0.;
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5, always in the middle of two grid pointsk
+
+      for (int ix = 0; ix < roi_bin_grid_w; ix++)
+      {
+        const T x = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
+
+        for (int iz = 0; iz < roi_bin_grid_d; iz++)
+        {
+          const T z = roi_start_d + pd * bin_size_d +
+              static_cast<T>(iz + .5f) * bin_size_d / static_cast<T>(roi_bin_grid_d);
+          // TODO verify trilinear interpolation
+          T val = trilinear_interpolate(offset_input, height, width, depth, y, x, z, index);
+          output_val += val;
+        } // z iterator and calc+add value
+      } // x iterator
+    } // y iterator
+    output_val /= n_voxels;
+
+    output[index] = output_val;
+  }
+}
+
+template <typename T>
+__global__ void RoIAlignBackward(const int nthreads, const T* grad_output, const T spatial_scale, const int channels,
+    const int height, const int width, const int depth, const int pooled_height, const int pooled_width,
+    const int pooled_depth, const int sampling_ratio, T* grad_input, const T* rois,
+    const int n_stride, const int c_stride, const int h_stride, const int w_stride, const int d_stride)
+{
+
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw, pd) is an element in the pooled output
+    int pd =  index % pooled_depth;
+    int pw = (index / pooled_depth) % pooled_width;
+    int ph = (index / pooled_depth / pooled_width) % pooled_height;
+    int c  = (index / pooled_depth / pooled_width / pooled_height) % channels;
+    int n  =  index / pooled_depth / pooled_width / pooled_height / channels;
+
+
+    const T* offset_rois = rois + n * 7;
+    int roi_batch_ind = offset_rois[0];
+
+    // Do not using rounding; this implementation detail is critical
+    T roi_start_w = offset_rois[1] * spatial_scale;
+    T roi_start_h = offset_rois[2] * spatial_scale;
+    T roi_end_w = offset_rois[3] * spatial_scale;
+    T roi_end_h = offset_rois[4] * spatial_scale;
+    T roi_start_d = offset_rois[5] * spatial_scale;
+    T roi_end_d = offset_rois[6] * spatial_scale;
+
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+    T roi_depth = max(roi_end_d - roi_start_d, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+    T bin_size_d = static_cast<T>(roi_depth) / static_cast<T>(pooled_depth);
+
+    // offset: index b,c,y,x,z of tensor of shape (B,C,Y,X,Z) is
+    // b*C*Y*X*Z + c * Y*X*Z + y * X*Z + x *Z + z = (b*C+c)Y*X*Z + ...
+    T* offset_grad_input =
+        grad_input + ((roi_batch_ind * channels + c) * height * width * depth);
+
+    // We need to index the gradient using the tensor strides to access the correct values.
+    int output_offset = n * n_stride + c * c_stride;
+    const T* offset_grad_output = grad_output + output_offset;
+    const T grad_output_this_bin = offset_grad_output[ph * h_stride + pw * w_stride + pd * d_stride];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+    int roi_bin_grid_d = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_depth / pooled_depth);
+
+    // We do average (integral) pooling inside a bin
+    const T n_voxels = roi_bin_grid_h * roi_bin_grid_w * roi_bin_grid_d; // e.g. = 6
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+
+      for (int ix = 0; ix < roi_bin_grid_w; ix++)
+      {
+        const T x = roi_start_w + pw * bin_size_w +
+          static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
+
+        for (int iz = 0; iz < roi_bin_grid_d; iz++)
+        {
+          const T z = roi_start_d + pd * bin_size_d +
+              static_cast<T>(iz + .5f) * bin_size_d / static_cast<T>(roi_bin_grid_d);
+
+          T g000, g001, g010, g100, g011, g101, g110, g111; // will hold the current partial derivatives
+          int x0, x1, y0, y1, z0, z1;
+          /* notation: gxyz = gradient at xyz, where x,y,z need to lie on feature-map grid (i.e., =x0,x1 etc.) */
+          trilinear_interpolate_gradient(height, width, depth, y, x, z,
+                                         g000, g001, g010, g100, g011, g101, g110, g111,
+                                         x0, x1, y0, y1, z0, z1, index);
+          /* chain rule: derivatives (i.e., the gradient) of trilin_interpolate(v1,v2,v3,v4,...) (div by n_voxels
+             as we actually need gradient of whole roi_align) are multiplied with gradient so far*/
+          g000 *= grad_output_this_bin / n_voxels;
+          g001 *= grad_output_this_bin / n_voxels;
+          g010 *= grad_output_this_bin / n_voxels;
+          g100 *= grad_output_this_bin / n_voxels;
+          g011 *= grad_output_this_bin / n_voxels;
+          g101 *= grad_output_this_bin / n_voxels;
+          g110 *= grad_output_this_bin / n_voxels;
+          g111 *= grad_output_this_bin / n_voxels;
+
+          if (x0 >= 0 && x1 >= 0 && y0 >= 0 && y1 >= 0 && z0 >= 0 && z1 >= 0)
+          { // atomicAdd(address, content) reads content under address, adds content to it, while: no other thread
+            // can interfere with the memory at address during this operation (thread lock, therefore "atomic").
+            atomicAdd(offset_grad_input + (y0 * width + x0) * depth + z0, static_cast<T>(g000));
+            atomicAdd(offset_grad_input + (y0 * width + x0) * depth + z1, static_cast<T>(g001));
+            atomicAdd(offset_grad_input + (y1 * width + x0) * depth + z0, static_cast<T>(g010));
+            atomicAdd(offset_grad_input + (y0 * width + x1) * depth + z0, static_cast<T>(g100));
+            atomicAdd(offset_grad_input + (y1 * width + x0) * depth + z1, static_cast<T>(g011));
+            atomicAdd(offset_grad_input + (y0 * width + x1) * depth + z1, static_cast<T>(g101));
+            atomicAdd(offset_grad_input + (y1 * width + x1) * depth + z0, static_cast<T>(g110));
+            atomicAdd(offset_grad_input + (y1 * width + x1) * depth + z1, static_cast<T>(g111));
+          } // if
+        } // iz
+      } // ix
+    } // iy
+  } // CUDA_1D_KERNEL_LOOP
+} // RoIAlignBackward
+
+
+/*----------- wrapper functions ----------------*/
+
+at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale,
+                                const int pooled_height, const int pooled_width, const int pooled_depth, const int sampling_ratio) {
+  /*
+   input: feature-map tensor, shape (batch, n_channels, y, x(, z))
+   */
+  AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+  at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlign_forward_cuda";
+  at::checkAllSameGPU(c, {input_t, rois_t});
+  at::checkAllSameType(c, {input_t, rois_t});
+
+  at::cuda::CUDAGuard device_guard(input.device());
+
+  auto num_rois = rois.size(0);
+  auto channels = input.size(1);
+  auto height = input.size(2);
+  auto width = input.size(3);
+  auto depth = input.size(4);
+  //std::cout << "input.options" << input.options() << std::endl;
+  at::Tensor output = at::zeros(
+      {num_rois, channels, pooled_height, pooled_width, pooled_depth}, input.options());
+
+  auto output_size = num_rois * channels * pooled_height * pooled_width * pooled_depth;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(
+      at::cuda::ATenCeilDiv(static_cast<int64_t>(output_size), static_cast<int64_t>(512)), static_cast<int64_t>(4096)));
+  dim3 block(512);
+
+  if (output.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return output;
+  }
+
+  //std::printf("launching kernel\n");
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "ROIAlign forward in 3d", [&] {
+    RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
+        output_size,
+        input.contiguous().data_ptr<scalar_t>(),
+        spatial_scale,
+        channels,
+        height,
+        width,
+        depth,
+        pooled_height,
+        pooled_width,
+        pooled_depth,
+        sampling_ratio,
+        rois.contiguous().data_ptr<scalar_t>(),
+        output.data_ptr<scalar_t>());
+  });
+  AT_CUDA_CHECK(cudaGetLastError());
+  return output;
+}
+
+at::Tensor ROIAlign_backward_cuda(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int pooled_depth,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int depth,
+    const int sampling_ratio)
+{
+  AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+  at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlign_backward_cuda";
+  at::checkAllSameGPU(c, {grad_t, rois_t});
+  at::checkAllSameType(c, {grad_t, rois_t});
+
+  at::cuda::CUDAGuard device_guard(grad.device());
+
+  at::Tensor grad_input =
+      at::zeros({batch_size, channels, height, width, depth}, grad.options());
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(
+      at::cuda::ATenCeilDiv(
+          static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
+      static_cast<int64_t>(4096)));
+  dim3 block(512);
+
+  // handle possibly empty gradients
+  if (grad.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return grad_input;
+  }
+
+  int n_stride = grad.stride(0);
+  int c_stride = grad.stride(1);
+  int h_stride = grad.stride(2);
+  int w_stride = grad.stride(3);
+  int d_stride = grad.stride(4);
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.type(), "ROIAlign backward 3D", [&] {
+    RoIAlignBackward<scalar_t><<<grid, block, 0, stream>>>(
+        grad.numel(),
+        grad.data_ptr<scalar_t>(),
+        spatial_scale,
+        channels,
+        height,
+        width,
+        depth,
+        pooled_height,
+        pooled_width,
+        pooled_depth,
+        sampling_ratio,
+        grad_input.data_ptr<scalar_t>(),
+        rois.contiguous().data_ptr<scalar_t>(),
+        n_stride,
+        c_stride,
+        h_stride,
+        w_stride,
+        d_stride);
+  });
+  AT_CUDA_CHECK(cudaGetLastError());
+  return grad_input;
+}
\ No newline at end of file
diff --git a/custom_extensions/roi_align/src/RoIAlign_interface.cpp b/custom_extensions/roi_align/src/RoIAlign_interface.cpp
new file mode 100644
index 0000000..41d5fdf
--- /dev/null
+++ b/custom_extensions/roi_align/src/RoIAlign_interface.cpp
@@ -0,0 +1,104 @@
+/* adopted from pytorch framework
+    https://github.com/pytorch/vision/blob/master/torchvision/csrc/ROIAlign.h on Nov 15 2019.
+
+    does not include CPU support but could be added with this interface.
+*/
+
+#include <torch/extension.h>
+
+// Declarations that are initialized in cuda file
+at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale,
+                                const int pooled_height, const int pooled_width, const int sampling_ratio);
+at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale,
+    const int pooled_height, const int pooled_width, const int batch_size, const int channels,
+    const int height, const int width, const int sampling_ratio);
+
+// Interface for Python
+at::Tensor ROIAlign_forward(
+    const at::Tensor& input, // Input feature map.
+    const at::Tensor& rois, // List of ROIs to pool over.
+    const double spatial_scale, // The scale of the image features. ROIs will be  scaled to this.
+    const int64_t pooled_height, // The height of the pooled feature map.
+    const int64_t pooled_width, // The width of the pooled feature
+    const int64_t sampling_ratio) // The number of points to sample in each bin along each axis.
+{
+  if (input.type().is_cuda()) {
+    return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
+  }
+  AT_ERROR("Not compiled with CPU support");
+  //return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
+}
+
+at::Tensor ROIAlign_backward(const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale,
+    const int pooled_height, const int pooled_width, const int batch_size, const int channels,
+    const int height, const int width, const int sampling_ratio) {
+  if (grad.type().is_cuda()) {
+    return ROIAlign_backward_cuda( grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels,
+        height, width, sampling_ratio);
+  }
+  AT_ERROR("Not compiled with CPU support");
+  //return ROIAlign_backward_cpu(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels,
+  //   height, width, sampling_ratio);
+}
+
+using namespace at;
+using torch::Tensor;
+using torch::autograd::AutogradContext;
+using torch::autograd::Variable;
+using torch::autograd::variable_list;
+
+class ROIAlignFunction : public torch::autograd::Function<ROIAlignFunction> {
+ public:
+  static variable_list forward(
+      AutogradContext* ctx,
+      Variable input,
+      Variable rois,
+      const double spatial_scale,
+      const int64_t pooled_height,
+      const int64_t pooled_width,
+      const int64_t sampling_ratio) {
+    ctx->saved_data["spatial_scale"] = spatial_scale;
+    ctx->saved_data["pooled_height"] = pooled_height;
+    ctx->saved_data["pooled_width"] = pooled_width;
+    ctx->saved_data["sampling_ratio"] = sampling_ratio;
+    ctx->saved_data["input_shape"] = input.sizes();
+    ctx->save_for_backward({rois});
+    auto result = ROIAlign_forward(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
+    return {result};
+  }
+
+  static variable_list backward(
+      AutogradContext* ctx,
+      variable_list grad_output) {
+    // Use data saved in forward
+    auto saved = ctx->get_saved_variables();
+    auto rois = saved[0];
+    auto input_shape = ctx->saved_data["input_shape"].toIntList();
+    auto grad_in = ROIAlign_backward(
+        grad_output[0],
+        rois,
+        ctx->saved_data["spatial_scale"].toDouble(),
+        ctx->saved_data["pooled_height"].toInt(),
+        ctx->saved_data["pooled_width"].toInt(),
+        input_shape[0], //b
+        input_shape[1], //c
+        input_shape[2], //h
+        input_shape[3], //w
+        ctx->saved_data["sampling_ratio"].toInt());
+    return {
+        grad_in, Variable(), Variable(), Variable(), Variable(), Variable()};
+  }
+};
+
+Tensor roi_align(const Tensor& input, const Tensor& rois, const double spatial_scale, const int64_t pooled_height,
+    const int64_t pooled_width, const int64_t sampling_ratio) {
+
+  return ROIAlignFunction::apply(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio)[0];
+
+}
+
+
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("roi_align", &roi_align, "ROIAlign 2D in c++ and/or cuda");
+}
\ No newline at end of file
diff --git a/custom_extensions/roi_align/src/RoIAlign_interface_3d.cpp b/custom_extensions/roi_align/src/RoIAlign_interface_3d.cpp
new file mode 100644
index 0000000..bb680fa
--- /dev/null
+++ b/custom_extensions/roi_align/src/RoIAlign_interface_3d.cpp
@@ -0,0 +1,112 @@
+/* adopted from pytorch framework
+    https://github.com/pytorch/vision/blob/master/torchvision/csrc/ROIAlign.h on Nov 15 2019.
+
+    does not include CPU support but could be added with this interface.
+*/
+
+#include <torch/extension.h>
+
+/*---------------- 3D implementation ---------------------------*/
+
+// Declarations that are initialized in cuda file
+at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale,
+                                const int pooled_height, const int pooled_width, const int pooled_depth,
+                                const int sampling_ratio);
+at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale,
+    const int pooled_height, const int pooled_width, const int pooled_depth, const int batch_size, const int channels,
+    const int height, const int width, const int depth, const int sampling_ratio);
+
+// Interface for Python
+at::Tensor ROIAlign_forward(
+    const at::Tensor& input, // Input feature map.
+    const at::Tensor& rois, // List of ROIs to pool over.
+    const double spatial_scale, // The scale of the image features. ROIs will be  scaled to this.
+    const int64_t pooled_height, // The height of the pooled feature map.
+    const int64_t pooled_width, // The width of the pooled feature
+    const int64_t pooled_depth,
+    const int64_t sampling_ratio) // The number of points to sample in each bin along each axis.
+{
+  if (input.type().is_cuda()) {
+    return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, pooled_depth, sampling_ratio);
+  }
+  AT_ERROR("Not compiled with CPU support");
+  //return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
+}
+
+at::Tensor ROIAlign_backward(const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale,
+    const int pooled_height, const int pooled_width, const int pooled_depth, const int batch_size, const int channels,
+    const int height, const int width, const int depth, const int sampling_ratio) {
+  if (grad.type().is_cuda()) {
+    return ROIAlign_backward_cuda( grad, rois, spatial_scale, pooled_height, pooled_width, pooled_depth, batch_size,
+     channels, height, width, depth, sampling_ratio);
+  }
+  AT_ERROR("Not compiled with CPU support");
+  //return ROIAlign_backward_cpu(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels,
+  //   height, width, sampling_ratio);
+}
+
+using namespace at;
+using torch::Tensor;
+using torch::autograd::AutogradContext;
+using torch::autograd::Variable;
+using torch::autograd::variable_list;
+
+class ROIAlignFunction : public torch::autograd::Function<ROIAlignFunction> {
+ public:
+  static variable_list forward(
+      AutogradContext* ctx,
+      Variable input,
+      Variable rois,
+      const double spatial_scale,
+      const int64_t pooled_height,
+      const int64_t pooled_width,
+      const int64_t pooled_depth,
+      const int64_t sampling_ratio) {
+    ctx->saved_data["spatial_scale"] = spatial_scale;
+    ctx->saved_data["pooled_height"] = pooled_height;
+    ctx->saved_data["pooled_width"] = pooled_width;
+    ctx->saved_data["pooled_depth"] = pooled_depth;
+    ctx->saved_data["sampling_ratio"] = sampling_ratio;
+    ctx->saved_data["input_shape"] = input.sizes();
+    ctx->save_for_backward({rois});
+    auto result = ROIAlign_forward(input, rois, spatial_scale, pooled_height, pooled_width, pooled_depth,
+     sampling_ratio);
+    return {result};
+  }
+
+  static variable_list backward(
+      AutogradContext* ctx,
+      variable_list grad_output) {
+    // Use data saved in forward
+    auto saved = ctx->get_saved_variables();
+    auto rois = saved[0];
+    auto input_shape = ctx->saved_data["input_shape"].toIntList();
+    auto grad_in = ROIAlign_backward(
+        grad_output[0],
+        rois,
+        ctx->saved_data["spatial_scale"].toDouble(),
+        ctx->saved_data["pooled_height"].toInt(),
+        ctx->saved_data["pooled_width"].toInt(),
+        ctx->saved_data["pooled_depth"].toInt(),
+        input_shape[0],
+        input_shape[1],
+        input_shape[2],
+        input_shape[3],
+        input_shape[4],
+        ctx->saved_data["sampling_ratio"].toInt());
+    return {
+        grad_in, Variable(), Variable(), Variable(), Variable(), Variable(), Variable()};
+  }
+};
+
+Tensor roi_align(const Tensor& input, const Tensor& rois, const double spatial_scale, const int64_t pooled_height,
+    const int64_t pooled_width, const int64_t pooled_depth, const int64_t sampling_ratio) {
+
+  return ROIAlignFunction::apply(input, rois, spatial_scale, pooled_height, pooled_width, pooled_depth,
+                                    sampling_ratio)[0];
+
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("roi_align", &roi_align, "ROIAlign 3D in c++ and/or cuda");
+}
\ No newline at end of file
diff --git a/custom_extensions/roi_align/src/cuda_helpers.h b/custom_extensions/roi_align/src/cuda_helpers.h
new file mode 100644
index 0000000..af32f60
--- /dev/null
+++ b/custom_extensions/roi_align/src/cuda_helpers.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                                \
+  for (int i = (blockIdx.x * blockDim.x) + threadIdx.x; i < (n); \
+       i += (blockDim.x * gridDim.x))
diff --git a/experiments/lidc_exp/configs.py b/experiments/lidc_exp/configs.py
index cef1403..f992f90 100644
--- a/experiments/lidc_exp/configs.py
+++ b/experiments/lidc_exp/configs.py
@@ -1,334 +1,334 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 import sys
 import os
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
 import numpy as np
 from default_configs import DefaultConfigs
 
 class configs(DefaultConfigs):
 
     def __init__(self, server_env=None):
 
         #########################
         #    Preprocessing      #
         #########################
 
         self.root_dir = '/path/to/raw/data'
         self.raw_data_dir = '{}/data_nrrd'.format(self.root_dir)
         self.pp_dir = '{}/pp_norm'.format(self.root_dir)
         self.target_spacing = (0.7, 0.7, 1.25)
 
         #########################
         #         I/O           #
         #########################
 
 
         # one out of [2, 3]. dimension the model operates in.
-        self.dim = 2
+        self.dim = 3
 
-        # one out of ['mrcnn', 'retina_net', 'retina_unet', 'detection_unet', 'ufrcnn', 'detection_unet'].
+        # one out of ['mrcnn', 'retina_net', 'retina_unet', 'detection_unet', 'ufrcnn'].
         self.model = 'retina_unet'
 
         DefaultConfigs.__init__(self, self.model, server_env, self.dim)
 
         # int [0 < dataset_size]. select n patients from dataset for prototyping. If None, all data is used.
         self.select_prototype_subset = None
 
         # path to preprocessed data.
         self.pp_name = 'lidc_preprocessed_for_G2'
         self.input_df_name = 'info_df.pickle'
         self.pp_data_path = '/mnt/HDD2TB/Documents/data/lidc/{}'.format(self.pp_name)
         self.pp_test_data_path = self.pp_data_path #change if test_data in separate folder.
 
         # settings for deployment in cloud.
         if server_env:
             # path to preprocessed data.
             self.pp_name = 'pp_fg_slices'
             self.crop_name = 'pp_fg_slices_packed'
             self.pp_data_path = '/path/to/preprocessed/data/{}/{}'.format(self.pp_name, self.crop_name)
             self.pp_test_data_path = self.pp_data_path
             self.select_prototype_subset = None
 
         #########################
         #      Data Loader      #
         #########################
 
         # select modalities from preprocessed data
         self.channels = [0]
         self.n_channels = len(self.channels)
 
         # patch_size to be used for training. pre_crop_size is the patch_size before data augmentation.
         self.pre_crop_size_2D = [300, 300]
         self.patch_size_2D = [288, 288]
         self.pre_crop_size_3D = [156, 156, 96]
         self.patch_size_3D = [128, 128, 64]
         self.patch_size = self.patch_size_2D if self.dim == 2 else self.patch_size_3D
         self.pre_crop_size = self.pre_crop_size_2D if self.dim == 2 else self.pre_crop_size_3D
 
         # ratio of free sampled batch elements before class balancing is triggered
         # (>0 to include "empty"/background patches.)
         self.batch_sample_slack = 0.2
 
         # set 2D network to operate in 3D images.
         self.merge_2D_to_3D_preds = True
 
         # feed +/- n neighbouring slices into channel dimension. set to None for no context.
         self.n_3D_context = None
         if self.n_3D_context is not None and self.dim == 2:
             self.n_channels *= (self.n_3D_context * 2 + 1)
 
 
         #########################
         #      Architecture      #
         #########################
 
         self.start_filts = 48 if self.dim == 2 else 18
         self.end_filts = self.start_filts * 4 if self.dim == 2 else self.start_filts * 2
         self.res_architecture = 'resnet50' # 'resnet101' , 'resnet50'
         self.norm = None # one of None, 'instance_norm', 'batch_norm'
         self.weight_decay = 0
 
         # one of 'xavier_uniform', 'xavier_normal', or 'kaiming_normal', None (=default = 'kaiming_uniform')
         self.weight_init = None
 
         #########################
         #  Schedule / Selection #
         #########################
 
         self.num_epochs = 100
         self.num_train_batches = 200 if self.dim == 2 else 200
         self.batch_size = 20 if self.dim == 2 else 8
 
         self.do_validation = True
         # decide whether to validate on entire patient volumes (like testing) or sampled patches (like training)
         # the former is morge accurate, while the latter is faster (depending on volume size)
         self.val_mode = 'val_sampling' # one of 'val_sampling' , 'val_patient'
         if self.val_mode == 'val_patient':
             self.max_val_patients = 50  # if 'None' iterates over entire val_set once.
         if self.val_mode == 'val_sampling':
             self.num_val_batches = 50
 
         #########################
         #   Testing / Plotting  #
         #########################
 
         # set the top-n-epochs to be saved for temporal averaging in testing.
         self.save_n_models = 5
         self.test_n_epochs = 5
         # set a minimum epoch number for saving in case of instabilities in the first phase of training.
         self.min_save_thresh = 0 if self.dim == 2 else 0
 
         self.report_score_level = ['patient', 'rois']  # choose list from 'patient', 'rois'
         self.class_dict = {1: 'benign', 2: 'malignant'}  # 0 is background.
         self.patient_class_of_interest = 2  # patient metrics are only plotted for one class.
         self.ap_match_ious = [0.1]  # list of ious to be evaluated for ap-scoring.
 
         self.model_selection_criteria = ['malignant_ap', 'benign_ap'] # criteria to average over for saving epochs.
         self.min_det_thresh = 0.1  # minimum confidence value to select predictions for evaluation.
 
         # threshold for clustering predictions together (wcs = weighted cluster scoring).
         # needs to be >= the expected overlap of predictions coming from one model (typically NMS threshold).
         # if too high, preds of the same object are separate clusters.
         self.wcs_iou = 1e-5
 
         self.plot_prediction_histograms = True
         self.plot_stat_curves = False
 
         #########################
         #   Data Augmentation   #
         #########################
 
         self.da_kwargs={
         'do_elastic_deform': True,
         'alpha':(0., 1500.),
         'sigma':(30., 50.),
         'do_rotation':True,
         'angle_x': (0., 2 * np.pi),
         'angle_y': (0., 0),
         'angle_z': (0., 0),
         'do_scale': True,
         'scale':(0.8, 1.1),
         'random_crop':False,
         'rand_crop_dist':  (self.patch_size[0] / 2. - 3, self.patch_size[1] / 2. - 3),
         'border_mode_data': 'constant',
         'border_cval_data': 0,
         'order_data': 1
         }
 
         if self.dim == 3:
             self.da_kwargs['do_elastic_deform'] = False
             self.da_kwargs['angle_x'] = (0, 0.0)
             self.da_kwargs['angle_y'] = (0, 0.0) #must be 0!!
             self.da_kwargs['angle_z'] = (0., 2 * np.pi)
 
 
         #########################
         #   Add model specifics #
         #########################
 
         {'detection_unet': self.add_det_unet_configs,
          'mrcnn': self.add_mrcnn_configs,
          'ufrcnn': self.add_mrcnn_configs,
          'retina_net': self.add_mrcnn_configs,
          'retina_unet': self.add_mrcnn_configs,
         }[self.model]()
 
 
     def add_det_unet_configs(self):
 
         self.learning_rate = [1e-4] * self.num_epochs
 
         # aggregation from pixel perdiction to object scores (connected component). One of ['max', 'median']
         self.aggregation_operation = 'max'
 
         # max number of roi candidates to identify per batch element and class.
         self.n_roi_candidates = 10 if self.dim == 2 else 30
 
         # loss mode: either weighted cross entropy ('wce'), batch-wise dice loss ('dice), or the sum of both ('dice_wce')
         self.seg_loss_mode = 'dice_wce'
 
         # if <1, false positive predictions in foreground are penalized less.
         self.fp_dice_weight = 1 if self.dim == 2 else 1
 
         self.wce_weights = [1, 1, 1]
         self.detection_min_confidence = self.min_det_thresh
 
         # if 'True', loss distinguishes all classes, else only foreground vs. background (class agnostic).
         self.class_specific_seg_flag = True
         self.num_seg_classes = 3 if self.class_specific_seg_flag else 2
         self.head_classes = self.num_seg_classes
 
     def add_mrcnn_configs(self):
 
         # learning rate is a list with one entry per epoch.
         self.learning_rate = [1e-4] * self.num_epochs
 
         # disable the re-sampling of mask proposals to original size for speed-up.
         # since evaluation is detection-driven (box-matching) and not instance segmentation-driven (iou-matching),
         # mask-outputs are optional.
         self.return_masks_in_val = True
         self.return_masks_in_test = False
 
         # set number of proposal boxes to plot after each epoch.
         self.n_plot_rpn_props = 5 if self.dim == 2 else 30
 
         # number of classes for head networks: n_foreground_classes + 1 (background)
         self.head_classes = 3
 
         # seg_classes hier refers to the first stage classifier (RPN)
         self.num_seg_classes = 2  # foreground vs. background
 
         # feature map strides per pyramid level are inferred from architecture.
         self.backbone_strides = {'xy': [4, 8, 16, 32], 'z': [1, 2, 4, 8]}
 
         # anchor scales are chosen according to expected object sizes in data set. Default uses only one anchor scale
         # per pyramid level. (outer list are pyramid levels (corresponding to BACKBONE_STRIDES), inner list are scales per level.)
         self.rpn_anchor_scales = {'xy': [[8], [16], [32], [64]], 'z': [[2], [4], [8], [16]]}
 
         # choose which pyramid levels to extract features from: P2: 0, P3: 1, P4: 2, P5: 3.
         self.pyramid_levels = [0, 1, 2, 3]
 
         # number of feature maps in rpn. typically lowered in 3D to save gpu-memory.
         self.n_rpn_features = 512 if self.dim == 2 else 128
 
         # anchor ratios and strides per position in feature maps.
         self.rpn_anchor_ratios = [0.5, 1, 2]
         self.rpn_anchor_stride = 1
 
         # Threshold for first stage (RPN) non-maximum suppression (NMS):  LOWER == HARDER SELECTION
         self.rpn_nms_threshold = 0.7 if self.dim == 2 else 0.7
 
         # loss sampling settings.
         self.rpn_train_anchors_per_image = 6  #per batch element
         self.train_rois_per_image = 6 #per batch element
         self.roi_positive_ratio = 0.5
         self.anchor_matching_iou = 0.7
 
         # factor of top-k candidates to draw from  per negative sample (stochastic-hard-example-mining).
         # poolsize to draw top-k candidates from will be shem_poolsize * n_negative_samples.
         self.shem_poolsize = 10
 
         self.pool_size = (7, 7) if self.dim == 2 else (7, 7, 3)
         self.mask_pool_size = (14, 14) if self.dim == 2 else (14, 14, 5)
         self.mask_shape = (28, 28) if self.dim == 2 else (28, 28, 10)
 
         self.rpn_bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2])
         self.bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2])
         self.window = np.array([0, 0, self.patch_size[0], self.patch_size[1], 0, self.patch_size_3D[2]])
         self.scale = np.array([self.patch_size[0], self.patch_size[1], self.patch_size[0], self.patch_size[1],
                                self.patch_size_3D[2], self.patch_size_3D[2]])
         if self.dim == 2:
             self.rpn_bbox_std_dev = self.rpn_bbox_std_dev[:4]
             self.bbox_std_dev = self.bbox_std_dev[:4]
             self.window = self.window[:4]
             self.scale = self.scale[:4]
 
         # pre-selection in proposal-layer (stage 1) for NMS-speedup. applied per batch element.
         self.pre_nms_limit = 3000 if self.dim == 2 else 6000
 
         # n_proposals to be selected after NMS per batch element. too high numbers blow up memory if "detect_while_training" is True,
         # since proposals of the entire batch are forwarded through second stage in as one "batch".
         self.roi_chunk_size = 2500 if self.dim == 2 else 600
         self.post_nms_rois_training = 500 if self.dim == 2 else 75
         self.post_nms_rois_inference = 500
 
         # Final selection of detections (refine_detections)
         self.model_max_instances_per_batch_element = 10 if self.dim == 2 else 30  # per batch element and class.
         self.detection_nms_threshold = 1e-5  # needs to be > 0, otherwise all predictions are one cluster.
         self.model_min_confidence = 0.1
 
         if self.dim == 2:
             self.backbone_shapes = np.array(
                 [[int(np.ceil(self.patch_size[0] / stride)),
                   int(np.ceil(self.patch_size[1] / stride))]
                  for stride in self.backbone_strides['xy']])
         else:
             self.backbone_shapes = np.array(
                 [[int(np.ceil(self.patch_size[0] / stride)),
                   int(np.ceil(self.patch_size[1] / stride)),
                   int(np.ceil(self.patch_size[2] / stride_z))]
                  for stride, stride_z in zip(self.backbone_strides['xy'], self.backbone_strides['z']
                                              )])
 
         if self.model == 'ufrcnn':
             self.operate_stride1 = True
             self.class_specific_seg_flag = True
             self.num_seg_classes = 3 if self.class_specific_seg_flag else 2
             self.frcnn_mode = True
 
         if self.model == 'retina_net' or self.model == 'retina_unet' or self.model == 'prob_detector':
             # implement extra anchor-scales according to retina-net publication.
             self.rpn_anchor_scales['xy'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in
                                             self.rpn_anchor_scales['xy']]
             self.rpn_anchor_scales['z'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in
                                            self.rpn_anchor_scales['z']]
             self.n_anchors_per_pos = len(self.rpn_anchor_ratios) * 3
 
             self.n_rpn_features = 256 if self.dim == 2 else 64
 
             # pre-selection of detections for NMS-speedup. per entire batch.
             self.pre_nms_limit = 10000 if self.dim == 2 else 50000
 
             # anchor matching iou is lower than in Mask R-CNN according to https://arxiv.org/abs/1708.02002
             self.anchor_matching_iou = 0.5
 
             # if 'True', seg loss distinguishes all classes, else only foreground vs. background (class agnostic).
             self.num_seg_classes = 3 if self.class_specific_seg_flag else 2
 
             if self.model == 'retina_unet':
                 self.operate_stride1 = True
\ No newline at end of file
diff --git a/experiments/toy_exp/configs.py b/experiments/toy_exp/configs.py
index 2e8c5bf..f2c0426 100644
--- a/experiments/toy_exp/configs.py
+++ b/experiments/toy_exp/configs.py
@@ -1,344 +1,344 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 import sys
 import os
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
 import numpy as np
 from default_configs import DefaultConfigs
 
 class configs(DefaultConfigs):
 
     def __init__(self, server_env=None):
 
         #########################
         #    Preprocessing      #
         #########################
 
         self.root_dir = '/mnt/HDD2TB/Documents/data/mdt_toy'
 
         #########################
         #         I/O           #
         #########################
 
 
         # one out of [2, 3]. dimension the model operates in.
         self.dim = 2
 
-        # one out of ['mrcnn', 'retina_net', 'retina_unet', 'detection_unet', 'ufrcnn', 'detection_unet'].
-        self.model = 'detection_unet'
+        # one out of ['mrcnn', 'retina_net', 'retina_unet', 'detection_unet', 'ufrcnn'].
+        self.model = 'mrcnn'
 
         DefaultConfigs.__init__(self, self.model, server_env, self.dim)
 
         # int [0 < dataset_size]. select n patients from dataset for prototyping.
         self.select_prototype_subset = None
         self.hold_out_test_set = True
         self.n_train_data = 1000
 
         # choose one of the 3 toy experiments described in https://arxiv.org/pdf/1811.08661.pdf
         # one of ['donuts_shape', 'donuts_pattern', 'circles_scale'].
         toy_mode = 'donuts_shape'
 
 
         # path to preprocessed data.
         self.input_df_name = 'info_df.pickle'
         self.pp_name = os.path.join(toy_mode, 'train')
         self.pp_data_path = os.path.join(self.root_dir, self.pp_name)
         self.pp_test_name = os.path.join(toy_mode, 'test')
         self.pp_test_data_path = os.path.join(self.root_dir, self.pp_test_name)
 
         # settings for deployment in cloud.
         if server_env:
             # path to preprocessed data.
             pp_root_dir = '/path/to/data'
             self.pp_name = os.path.join(toy_mode, 'train')
             self.pp_data_path = os.path.join(pp_root_dir, self.pp_name)
             self.pp_test_name = os.path.join(toy_mode, 'test')
             self.pp_test_data_path = os.path.join(pp_root_dir, self.pp_test_name)
             self.select_prototype_subset = None
 
         #########################
         #      Data Loader      #
         #########################
 
         # select modalities from preprocessed data
         self.channels = [0]
         self.n_channels = len(self.channels)
 
         # patch_size to be used for training. pre_crop_size is the patch_size before data augmentation.
         self.pre_crop_size_2D = [320, 320]
         self.patch_size_2D = [320, 320]
 
         self.patch_size = self.patch_size_2D if self.dim == 2 else self.patch_size_3D
         self.pre_crop_size = self.pre_crop_size_2D if self.dim == 2 else self.pre_crop_size_3D
 
         # ratio of free sampled batch elements before class balancing is triggered
         # (>0 to include "empty"/background patches.)
         self.batch_sample_slack = 0.2
 
         # set 2D network to operate in 3D images.
         self.merge_2D_to_3D_preds = False
 
         # feed +/- n neighbouring slices into channel dimension. set to None for no context.
         self.n_3D_context = None
         if self.n_3D_context is not None and self.dim == 2:
             self.n_channels *= (self.n_3D_context * 2 + 1)
 
 
         #########################
         #      Architecture      #
         #########################
 
         self.start_filts = 48 if self.dim == 2 else 18
         self.end_filts = self.start_filts * 4 if self.dim == 2 else self.start_filts * 2
         self.res_architecture = 'resnet50' # 'resnet101' , 'resnet50'
         self.norm = None # one of None, 'instance_norm', 'batch_norm'
         self.weight_decay = 0
 
         # one of 'xavier_uniform', 'xavier_normal', or 'kaiming_normal', None (=default = 'kaiming_uniform')
         self.weight_init = None
 
         #########################
         #  Schedule / Selection #
         #########################
 
         self.num_epochs = 100
         self.num_train_batches = 200 if self.dim == 2 else 200
         self.batch_size = 20 if self.dim == 2 else 8
 
         self.do_validation = True
         # decide whether to validate on entire patient volumes (like testing) or sampled patches (like training)
         # the former is morge accurate, while the latter is faster (depending on volume size)
         self.val_mode = 'val_patient' # one of 'val_sampling' , 'val_patient'
         if self.val_mode == 'val_patient':
             self.max_val_patients = None  # if 'None' iterates over entire val_set once.
         if self.val_mode == 'val_sampling':
             self.num_val_batches = 50
 
         #########################
         #   Testing / Plotting  #
         #########################
 
         # set the top-n-epochs to be saved for temporal averaging in testing.
         self.save_n_models = 5
         self.test_n_epochs = 5
 
         # set a minimum epoch number for saving in case of instabilities in the first phase of training.
         self.min_save_thresh = 0 if self.dim == 2 else 0
 
         self.report_score_level = ['patient', 'rois']  # choose list from 'patient', 'rois'
         self.class_dict = {1: 'benign', 2: 'malignant'}  # 0 is background.
         self.patient_class_of_interest = 2  # patient metrics are only plotted for one class.
         self.ap_match_ious = [0.1]  # list of ious to be evaluated for ap-scoring.
 
         self.model_selection_criteria = ['benign_ap', 'malignant_ap'] # criteria to average over for saving epochs.
         self.min_det_thresh = 0.1  # minimum confidence value to select predictions for evaluation.
 
         # threshold for clustering predictions together (wcs = weighted cluster scoring).
         # needs to be >= the expected overlap of predictions coming from one model (typically NMS threshold).
         # if too high, preds of the same object are separate clusters.
         self.wcs_iou = 1e-5
 
         self.plot_prediction_histograms = True
         self.plot_stat_curves = False
 
         #########################
         #   Data Augmentation   #
         #########################
 
         self.da_kwargs={
         'do_elastic_deform': True,
         'alpha':(0., 1500.),
         'sigma':(30., 50.),
         'do_rotation':True,
         'angle_x': (0., 2 * np.pi),
         'angle_y': (0., 0),
         'angle_z': (0., 0),
         'do_scale': True,
         'scale':(0.8, 1.1),
         'random_crop':False,
         'rand_crop_dist':  (self.patch_size[0] / 2. - 3, self.patch_size[1] / 2. - 3),
         'border_mode_data': 'constant',
         'border_cval_data': 0,
         'order_data': 1
         }
 
         if self.dim == 3:
             self.da_kwargs['do_elastic_deform'] = False
             self.da_kwargs['angle_x'] = (0, 0.0)
             self.da_kwargs['angle_y'] = (0, 0.0) #must be 0!!
             self.da_kwargs['angle_z'] = (0., 2 * np.pi)
 
 
         #########################
         #   Add model specifics #
         #########################
 
         {'detection_unet': self.add_det_unet_configs,
          'mrcnn': self.add_mrcnn_configs,
          'ufrcnn': self.add_mrcnn_configs,
          'ufrcnn_surrounding': self.add_mrcnn_configs,
          'retina_net': self.add_mrcnn_configs,
          'retina_unet': self.add_mrcnn_configs,
          'prob_detector': self.add_mrcnn_configs,
         }[self.model]()
 
 
     def add_det_unet_configs(self):
 
         self.learning_rate = [1e-4] * self.num_epochs
 
         # aggregation from pixel perdiction to object scores (connected component). One of ['max', 'median']
         self.aggregation_operation = 'max'
 
         # max number of roi candidates to identify per image (slice in 2D, volume in 3D)
         self.n_roi_candidates = 3 if self.dim == 2 else 8
 
         # loss mode: either weighted cross entropy ('wce'), batch-wise dice loss ('dice), or the sum of both ('dice_wce')
         self.seg_loss_mode = 'dice_wce'
 
         # if <1, false positive predictions in foreground are penalized less.
         self.fp_dice_weight = 1 if self.dim == 2 else 1
 
         self.wce_weights = [1, 1, 1]
         self.detection_min_confidence = self.min_det_thresh
 
         # if 'True', loss distinguishes all classes, else only foreground vs. background (class agnostic).
         self.class_specific_seg_flag = True
         self.num_seg_classes = 3 if self.class_specific_seg_flag else 2
         self.head_classes = self.num_seg_classes
 
     def add_mrcnn_configs(self):
 
         # learning rate is a list with one entry per epoch.
         self.learning_rate = [1e-4] * self.num_epochs
 
         # disable mask head loss. (e.g. if no pixelwise annotations available)
         self.frcnn_mode = False
 
         # disable the re-sampling of mask proposals to original size for speed-up.
         # since evaluation is detection-driven (box-matching) and not instance segmentation-driven (iou-matching),
         # mask-outputs are optional.
         self.return_masks_in_val = True
         self.return_masks_in_test = False
 
         # set number of proposal boxes to plot after each epoch.
         self.n_plot_rpn_props = 5 if self.dim == 2 else 30
 
         # number of classes for head networks: n_foreground_classes + 1 (background)
         self.head_classes = 3
 
         # seg_classes hier refers to the first stage classifier (RPN)
         self.num_seg_classes = 2  # foreground vs. background
 
         # feature map strides per pyramid level are inferred from architecture.
         self.backbone_strides = {'xy': [4, 8, 16, 32], 'z': [1, 2, 4, 8]}
 
         # anchor scales are chosen according to expected object sizes in data set. Default uses only one anchor scale
         # per pyramid level. (outer list are pyramid levels (corresponding to BACKBONE_STRIDES), inner list are scales per level.)
         self.rpn_anchor_scales = {'xy': [[8], [16], [32], [64]], 'z': [[2], [4], [8], [16]]}
 
         # choose which pyramid levels to extract features from: P2: 0, P3: 1, P4: 2, P5: 3.
         self.pyramid_levels = [0, 1, 2, 3]
 
         # number of feature maps in rpn. typically lowered in 3D to save gpu-memory.
         self.n_rpn_features = 512 if self.dim == 2 else 128
 
         # anchor ratios and strides per position in feature maps.
         self.rpn_anchor_ratios = [0.5, 1, 2]
         self.rpn_anchor_stride = 1
 
         # Threshold for first stage (RPN) non-maximum suppression (NMS):  LOWER == HARDER SELECTION
         self.rpn_nms_threshold = 0.7 if self.dim == 2 else 0.7
 
         # loss sampling settings.
         self.rpn_train_anchors_per_image = 2  #per batch element
         self.train_rois_per_image = 2 #per batch element
         self.roi_positive_ratio = 0.5
         self.anchor_matching_iou = 0.7
 
         # factor of top-k candidates to draw from  per negative sample (stochastic-hard-example-mining).
         # poolsize to draw top-k candidates from will be shem_poolsize * n_negative_samples.
         self.shem_poolsize = 10
 
         self.pool_size = (7, 7) if self.dim == 2 else (7, 7, 3)
         self.mask_pool_size = (14, 14) if self.dim == 2 else (14, 14, 5)
         self.mask_shape = (28, 28) if self.dim == 2 else (28, 28, 10)
 
         self.rpn_bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2])
         self.bbox_std_dev = np.array([0.1, 0.1, 0.1, 0.2, 0.2, 0.2])
         self.window = np.array([0, 0, self.patch_size[0], self.patch_size[1]])
         self.scale = np.array([self.patch_size[0], self.patch_size[1], self.patch_size[0], self.patch_size[1]])
 
         if self.dim == 2:
             self.rpn_bbox_std_dev = self.rpn_bbox_std_dev[:4]
             self.bbox_std_dev = self.bbox_std_dev[:4]
             self.window = self.window[:4]
             self.scale = self.scale[:4]
 
         # pre-selection in proposal-layer (stage 1) for NMS-speedup. applied per batch element.
         self.pre_nms_limit = 3000 if self.dim == 2 else 6000
 
         # n_proposals to be selected after NMS per batch element. too high numbers blow up memory if "detect_while_training" is True,
         # since proposals of the entire batch are forwarded through second stage in as one "batch".
         self.roi_chunk_size = 800 if self.dim == 2 else 600
         self.post_nms_rois_training = 500 if self.dim == 2 else 75
         self.post_nms_rois_inference = 500
 
         # Final selection of detections (refine_detections)
         self.model_max_instances_per_batch_element = 10 if self.dim == 2 else 30  # per batch element and class.
         self.detection_nms_threshold = 1e-5  # needs to be > 0, otherwise all predictions are one cluster.
         self.model_min_confidence = 0.1
 
         if self.dim == 2:
             self.backbone_shapes = np.array(
                 [[int(np.ceil(self.patch_size[0] / stride)),
                   int(np.ceil(self.patch_size[1] / stride))]
                  for stride in self.backbone_strides['xy']])
         else:
             self.backbone_shapes = np.array(
                 [[int(np.ceil(self.patch_size[0] / stride)),
                   int(np.ceil(self.patch_size[1] / stride)),
                   int(np.ceil(self.patch_size[2] / stride_z))]
                  for stride, stride_z in zip(self.backbone_strides['xy'], self.backbone_strides['z']
                                              )])
         if self.model == 'ufrcnn':
             self.operate_stride1 = True
             self.class_specific_seg_flag = True
             self.num_seg_classes = 3 if self.class_specific_seg_flag else 2
             self.frcnn_mode = True
 
         if self.model == 'retina_net' or self.model == 'retina_unet' or self.model == 'prob_detector':
             # implement extra anchor-scales according to retina-net publication.
             self.rpn_anchor_scales['xy'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in
                                             self.rpn_anchor_scales['xy']]
             self.rpn_anchor_scales['z'] = [[ii[0], ii[0] * (2 ** (1 / 3)), ii[0] * (2 ** (2 / 3))] for ii in
                                            self.rpn_anchor_scales['z']]
             self.n_anchors_per_pos = len(self.rpn_anchor_ratios) * 3
 
             self.n_rpn_features = 256 if self.dim == 2 else 64
 
             # pre-selection of detections for NMS-speedup. per entire batch.
             self.pre_nms_limit = 10000 if self.dim == 2 else 50000
 
             # anchor matching iou is lower than in Mask R-CNN according to https://arxiv.org/abs/1708.02002
             self.anchor_matching_iou = 0.5
 
             # if 'True', seg loss distinguishes all classes, else only foreground vs. background (class agnostic).
             self.num_seg_classes = 3 if self.class_specific_seg_flag else 2
 
             if self.model == 'retina_unet':
                 self.operate_stride1 = True
diff --git a/models/mrcnn.py b/models/mrcnn.py
index ba792b5..0ba929c 100644
--- a/models/mrcnn.py
+++ b/models/mrcnn.py
@@ -1,1083 +1,1177 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 """
 Parts are based on https://github.com/multimodallearning/pytorch-mask-rcnn
 published under MIT license.
 """
-
-import utils.model_utils as mutils
-import utils.exp_utils as utils
-from cuda_functions.nms_2D.pth_nms import nms_gpu as nms_2D
-from cuda_functions.nms_3D.pth_nms import nms_gpu as nms_3D
-from cuda_functions.roi_align_2D.roi_align.crop_and_resize import CropAndResizeFunction as ra2D
-from cuda_functions.roi_align_3D.roi_align.crop_and_resize import CropAndResizeFunction as ra3D
+import sys
 
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.utils
 
+sys.path.append("..")
+import utils.model_utils as mutils
+import utils.exp_utils as utils
+from custom_extensions.nms import nms
+from custom_extensions.roi_align import roi_align
 
 ############################################################
 # Networks on top of backbone
 ############################################################
 
 class RPN(nn.Module):
     """
     Region Proposal Network.
     """
 
     def __init__(self, cf, conv):
 
         super(RPN, self).__init__()
         self.dim = conv.dim
 
         self.conv_shared = conv(cf.end_filts, cf.n_rpn_features, ks=3, stride=cf.rpn_anchor_stride, pad=1, relu=cf.relu)
         self.conv_class = conv(cf.n_rpn_features, 2 * len(cf.rpn_anchor_ratios), ks=1, stride=1, relu=None)
         self.conv_bbox = conv(cf.n_rpn_features, 2 * self.dim * len(cf.rpn_anchor_ratios), ks=1, stride=1, relu=None)
 
 
     def forward(self, x):
         """
         :param x: input feature maps (b, in_channels, y, x, (z))
         :return: rpn_class_logits (b, 2, n_anchors)
         :return: rpn_probs_logits (b, 2, n_anchors)
         :return: rpn_bbox (b, 2 * dim, n_anchors)
         """
 
         # Shared convolutional base of the RPN.
         x = self.conv_shared(x)
 
         # Anchor Score. (batch, anchors per location * 2, y, x, (z)).
         rpn_class_logits = self.conv_class(x)
         # Reshape to (batch, 2, anchors)
         axes = (0, 2, 3, 1) if self.dim == 2 else (0, 2, 3, 4, 1)
         rpn_class_logits = rpn_class_logits.permute(*axes)
         rpn_class_logits = rpn_class_logits.contiguous()
         rpn_class_logits = rpn_class_logits.view(x.size()[0], -1, 2)
 
         # Softmax on last dimension (fg vs. bg).
         rpn_probs = F.softmax(rpn_class_logits, dim=2)
 
         # Bounding box refinement. (batch, anchors_per_location * (y, x, (z), log(h), log(w), (log(d)), y, x, (z))
         rpn_bbox = self.conv_bbox(x)
 
         # Reshape to (batch, 2*dim, anchors)
         rpn_bbox = rpn_bbox.permute(*axes)
         rpn_bbox = rpn_bbox.contiguous()
         rpn_bbox = rpn_bbox.view(x.size()[0], -1, self.dim * 2)
 
         return [rpn_class_logits, rpn_probs, rpn_bbox]
 
 
 
 class Classifier(nn.Module):
     """
     Head network for classification and bounding box refinement. Performs RoiAlign, processes resulting features through a
     shared convolutional base and finally branches off the classifier- and regression head.
     """
     def __init__(self, cf, conv):
         super(Classifier, self).__init__()
 
         self.dim = conv.dim
         self.in_channels = cf.end_filts
         self.pool_size = cf.pool_size
         self.pyramid_levels = cf.pyramid_levels
         # instance_norm does not work with spatial dims (1, 1, (1))
         norm = cf.norm if cf.norm != 'instance_norm' else None
 
         self.conv1 = conv(cf.end_filts, cf.end_filts * 4, ks=self.pool_size, stride=1, norm=norm, relu=cf.relu)
         self.conv2 = conv(cf.end_filts * 4, cf.end_filts * 4, ks=1, stride=1, norm=norm, relu=cf.relu)
         self.linear_class = nn.Linear(cf.end_filts * 4, cf.head_classes)
         self.linear_bbox = nn.Linear(cf.end_filts * 4, cf.head_classes * 2 * self.dim)
 
     def forward(self, x, rois):
         """
         :param x: input feature maps (b, in_channels, y, x, (z))
         :param rois: normalized box coordinates as proposed by the RPN to be forwarded through
         the second stage (n_proposals, (y1, x1, y2, x2, (z1), (z2), batch_ix). Proposals of all batch elements
         have been merged to one vector, while the origin info has been stored for re-allocation.
         :return: mrcnn_class_logits (n_proposals, n_head_classes)
         :return: mrcnn_bbox (n_proposals, n_head_classes, 2 * dim) predicted corrections to be applied to proposals for refinement.
         """
         x = pyramid_roi_align(x, rois, self.pool_size, self.pyramid_levels, self.dim)
         x = self.conv1(x)
         x = self.conv2(x)
         x = x.view(-1, self.in_channels * 4)
         mrcnn_class_logits = self.linear_class(x)
         mrcnn_bbox = self.linear_bbox(x)
         mrcnn_bbox = mrcnn_bbox.view(mrcnn_bbox.size()[0], -1, self.dim * 2)
 
         return [mrcnn_class_logits, mrcnn_bbox]
 
 
 
 class Mask(nn.Module):
     """
     Head network for proposal-based mask segmentation. Performs RoiAlign, some convolutions and applies sigmoid on the
     output logits to allow for overlapping classes.
     """
     def __init__(self, cf, conv):
         super(Mask, self).__init__()
         self.pool_size = cf.mask_pool_size
         self.pyramid_levels = cf.pyramid_levels
         self.dim = conv.dim
         self.conv1 = conv(cf.end_filts, cf.end_filts, ks=3, stride=1, pad=1, norm=cf.norm, relu=cf.relu)
         self.conv2 = conv(cf.end_filts, cf.end_filts, ks=3, stride=1, pad=1, norm=cf.norm, relu=cf.relu)
         self.conv3 = conv(cf.end_filts, cf.end_filts, ks=3, stride=1, pad=1, norm=cf.norm, relu=cf.relu)
         self.conv4 = conv(cf.end_filts, cf.end_filts, ks=3, stride=1, pad=1, norm=cf.norm, relu=cf.relu)
         if conv.dim == 2:
             self.deconv = nn.ConvTranspose2d(cf.end_filts, cf.end_filts, kernel_size=2, stride=2)
         else:
             self.deconv = nn.ConvTranspose3d(cf.end_filts, cf.end_filts, kernel_size=2, stride=2)
 
         self.relu = nn.ReLU(inplace=True) if cf.relu == 'relu' else nn.LeakyReLU(inplace=True)
         self.conv5 = conv(cf.end_filts, cf.head_classes, ks=1, stride=1, relu=None)
         self.sigmoid = nn.Sigmoid()
 
     def forward(self, x, rois):
         """
         :param x: input feature maps (b, in_channels, y, x, (z))
         :param rois: normalized box coordinates as proposed by the RPN to be forwarded through
         the second stage (n_proposals, (y1, x1, y2, x2, (z1), (z2), batch_ix). Proposals of all batch elements
         have been merged to one vector, while the origin info has been stored for re-allocation.
         :return: x: masks (n_sampled_proposals (n_detections in inference), n_classes, y, x, (z))
         """
         x = pyramid_roi_align(x, rois, self.pool_size, self.pyramid_levels, self.dim)
         x = self.conv1(x)
         x = self.conv2(x)
         x = self.conv3(x)
         x = self.conv4(x)
         x = self.relu(self.deconv(x))
         x = self.conv5(x)
         x = self.sigmoid(x)
         return x
 
 
 ############################################################
 #  Loss Functions
 ############################################################
 
 def compute_rpn_class_loss(rpn_match, rpn_class_logits, shem_poolsize):
     """
     :param rpn_match: (n_anchors). [-1, 0, 1] for negative, neutral, and positive matched anchors.
     :param rpn_class_logits: (n_anchors, 2). logits from RPN classifier.
     :param shem_poolsize: int. factor of top-k candidates to draw from per negative sample
     (stochastic-hard-example-mining).
     :return: loss: torch tensor
     :return: np_neg_ix: 1D array containing indices of the neg_roi_logits, which have been sampled for training.
     """
 
     # filter out neutral anchors.
     pos_indices = torch.nonzero(rpn_match == 1)
     neg_indices = torch.nonzero(rpn_match == -1)
 
     # loss for positive samples
     if 0 not in pos_indices.size():
         pos_indices = pos_indices.squeeze(1)
         roi_logits_pos = rpn_class_logits[pos_indices]
         pos_loss = F.cross_entropy(roi_logits_pos, torch.LongTensor([1] * pos_indices.shape[0]).cuda())
     else:
         pos_loss = torch.FloatTensor([0]).cuda()
 
     # loss for negative samples: draw hard negative examples (SHEM)
     # that match the number of positive samples, but at least 1.
     if 0 not in neg_indices.size():
         neg_indices = neg_indices.squeeze(1)
         roi_logits_neg = rpn_class_logits[neg_indices]
         negative_count = np.max((1, pos_indices.cpu().data.numpy().size))
         roi_probs_neg = F.softmax(roi_logits_neg, dim=1)
         neg_ix = mutils.shem(roi_probs_neg, negative_count, shem_poolsize)
         neg_loss = F.cross_entropy(roi_logits_neg[neg_ix], torch.LongTensor([0] * neg_ix.shape[0]).cuda())
         np_neg_ix = neg_ix.cpu().data.numpy()
     else:
         neg_loss = torch.FloatTensor([0]).cuda()
         np_neg_ix = np.array([]).astype('int32')
 
     loss = (pos_loss + neg_loss) / 2
     return loss, np_neg_ix
 
 
 def compute_rpn_bbox_loss(rpn_target_deltas, rpn_pred_deltas, rpn_match):
     """
     :param rpn_target_deltas:   (b, n_positive_anchors, (dy, dx, (dz), log(dh), log(dw), (log(dd)))).
     Uses 0 padding to fill in unsed bbox deltas.
     :param rpn_pred_deltas: predicted deltas from RPN. (b, n_anchors, (dy, dx, (dz), log(dh), log(dw), (log(dd))))
     :param rpn_match: (n_anchors). [-1, 0, 1] for negative, neutral, and positive matched anchors.
     :return: loss: torch 1D tensor.
     """
     if 0 not in torch.nonzero(rpn_match == 1).size():
 
         indices = torch.nonzero(rpn_match == 1).squeeze(1)
         # Pick bbox deltas that contribute to the loss
         rpn_pred_deltas = rpn_pred_deltas[indices]
         # Trim target bounding box deltas to the same length as rpn_bbox.
         target_deltas = rpn_target_deltas[:rpn_pred_deltas.size()[0], :]
         # Smooth L1 loss
         loss = F.smooth_l1_loss(rpn_pred_deltas, target_deltas)
     else:
         loss = torch.FloatTensor([0]).cuda()
 
     return loss
 
 
 def compute_mrcnn_class_loss(target_class_ids, pred_class_logits):
     """
     :param target_class_ids: (n_sampled_rois) batch dimension was merged into roi dimension.
     :param pred_class_logits: (n_sampled_rois, n_classes)
     :return: loss: torch 1D tensor.
     """
     if 0 not in target_class_ids.size():
         loss = F.cross_entropy(pred_class_logits, target_class_ids.long())
     else:
         loss = torch.FloatTensor([0.]).cuda()
 
     return loss
 
 
 def compute_mrcnn_bbox_loss(mrcnn_target_deltas, mrcnn_pred_deltas, target_class_ids):
     """
     :param mrcnn_target_deltas: (n_sampled_rois, (dy, dx, (dz), log(dh), log(dw), (log(dh)))
     :param mrcnn_pred_deltas: (n_sampled_rois, n_classes, (dy, dx, (dz), log(dh), log(dw), (log(dh)))
     :param target_class_ids: (n_sampled_rois)
     :return: loss: torch 1D tensor.
     """
     if 0 not in torch.nonzero(target_class_ids > 0).size():
         positive_roi_ix = torch.nonzero(target_class_ids > 0)[:, 0]
         positive_roi_class_ids = target_class_ids[positive_roi_ix].long()
         target_bbox = mrcnn_target_deltas[positive_roi_ix, :].detach()
         pred_bbox = mrcnn_pred_deltas[positive_roi_ix, positive_roi_class_ids, :]
         loss = F.smooth_l1_loss(pred_bbox, target_bbox)
     else:
         loss = torch.FloatTensor([0]).cuda()
 
     return loss
 
 
 def compute_mrcnn_mask_loss(target_masks, pred_masks, target_class_ids):
     """
     :param target_masks: (n_sampled_rois, y, x, (z)) A float32 tensor of values 0 or 1. Uses zero padding to fill array.
     :param pred_masks: (n_sampled_rois, n_classes, y, x, (z)) float32 tensor with values between [0, 1].
     :param target_class_ids: (n_sampled_rois)
     :return: loss: torch 1D tensor.
     """
     if 0 not in torch.nonzero(target_class_ids > 0).size():
         # Only positive ROIs contribute to the loss. And only
         # the class specific mask of each ROI.
         positive_ix = torch.nonzero(target_class_ids > 0)[:, 0]
         positive_class_ids = target_class_ids[positive_ix].long()
         y_true = target_masks[positive_ix, :, :].detach()
         y_pred = pred_masks[positive_ix, positive_class_ids, :, :]
         loss = F.binary_cross_entropy(y_pred, y_true)
     else:
         loss = torch.FloatTensor([0]).cuda()
 
     return loss
 
 
 ############################################################
 #  Helper Layers
 ############################################################
 
-def proposal_layer(rpn_pred_probs, rpn_pred_deltas, proposal_count, anchors, cf):
+def refine_proposals(rpn_pred_probs, rpn_pred_deltas, proposal_count, batch_anchors, cf):
     """
     Receives anchor scores and selects a subset to pass as proposals
     to the second stage. Filtering is done based on anchor scores and
     non-max suppression to remove overlaps. It also applies bounding
-    box refinment detals to anchors.
+    box refinment details to anchors.
     :param rpn_pred_probs: (b, n_anchors, 2)
     :param rpn_pred_deltas: (b, n_anchors, (y, x, (z), log(h), log(w), (log(d))))
-    :return: batch_normalized_boxes: Proposals in normalized coordinates
-    (b, proposal_count, (y1, x1, y2, x2, (z1), (z2)))
+    :return: batch_normalized_props: Proposals in normalized coordinates (b, proposal_count, (y1, x1, y2, x2, (z1), (z2), score))
     :return: batch_out_proposals: Box coords + RPN foreground scores
     for monitoring/plotting (b, proposal_count, (y1, x1, y2, x2, (z1), (z2), score))
     """
+    std_dev = torch.from_numpy(cf.rpn_bbox_std_dev[None]).float().cuda()
+    norm = torch.from_numpy(cf.scale).float().cuda()
+    anchors = batch_anchors.clone()
+
+
+
     batch_scores = rpn_pred_probs[:, :, 1]
-    batch_deltas = rpn_pred_deltas
-    batch_anchors = anchors
-    batch_normalized_boxes = []
+    # norm deltas
+    batch_deltas = rpn_pred_deltas * std_dev
+    batch_normalized_props = []
     batch_out_proposals = []
 
     # loop over batch dimension.
     for ix in range(batch_scores.shape[0]):
 
         scores = batch_scores[ix]
         deltas = batch_deltas[ix]
-        anchors = batch_anchors.clone()
-        # norm deltas
-        std_dev = torch.from_numpy(cf.rpn_bbox_std_dev[None]).float().cuda()
-        deltas = deltas * std_dev
 
         # improve performance by trimming to top anchors by score
         # and doing the rest on the smaller subset.
         pre_nms_limit = min(cf.pre_nms_limit, anchors.size()[0])
         scores, order = scores.sort(descending=True)
         order = order[:pre_nms_limit]
         scores = scores[:pre_nms_limit]
         deltas = deltas[order, :]
-        anchors = anchors[order, :]
 
-        # apply deltas to anchors to get refined anchors and filter with non-maximum surpression.
+        # apply deltas to anchors to get refined anchors and filter with non-maximum suppression.
         if batch_deltas.shape[-1] == 4:
-            boxes = mutils.apply_box_deltas_2D(anchors, deltas)
+            boxes = mutils.apply_box_deltas_2D(anchors[order, :], deltas)
             boxes = mutils.clip_boxes_2D(boxes, cf.window)
-            keep = nms_2D(torch.cat((boxes, scores.unsqueeze(1)), 1), cf.rpn_nms_threshold)
-            norm = torch.from_numpy(cf.scale).float().cuda()
-
         else:
-            boxes = mutils.apply_box_deltas_3D(anchors, deltas)
+            boxes = mutils.apply_box_deltas_3D(anchors[order, :], deltas)
             boxes = mutils.clip_boxes_3D(boxes, cf.window)
-            keep = nms_3D(torch.cat((boxes, scores.unsqueeze(1)), 1), cf.rpn_nms_threshold)
-            norm = torch.from_numpy(cf.scale).float().cuda()
+        # boxes are y1,x1,y2,x2, torchvision-nms requires x1,y1,x2,y2, but consistent swap x<->y is irrelevant.
+        keep = nms.nms(boxes, scores, cf.rpn_nms_threshold)
+
 
         keep = keep[:proposal_count]
         boxes = boxes[keep, :]
         rpn_scores = scores[keep][:, None]
 
         # pad missing boxes with 0.
         if boxes.shape[0] < proposal_count:
             n_pad_boxes = proposal_count - boxes.shape[0]
             zeros = torch.zeros([n_pad_boxes, boxes.shape[1]]).cuda()
             boxes = torch.cat([boxes, zeros], dim=0)
             zeros = torch.zeros([n_pad_boxes, rpn_scores.shape[1]]).cuda()
             rpn_scores = torch.cat([rpn_scores, zeros], dim=0)
 
         # concat box and score info for monitoring/plotting.
         batch_out_proposals.append(torch.cat((boxes, rpn_scores), 1).cpu().data.numpy())
         # normalize dimensions to range of 0 to 1.
         normalized_boxes = boxes / norm
-        # add back batch dimension
-        batch_normalized_boxes.append(normalized_boxes.unsqueeze(0))
+        assert torch.all(normalized_boxes <= 1), "normalized box coords >1 found"
+
+        # add again batch dimension
+        batch_normalized_props.append(normalized_boxes.unsqueeze(0))
 
-    batch_normalized_boxes = torch.cat(batch_normalized_boxes)
+    batch_normalized_props = torch.cat(batch_normalized_props)
     batch_out_proposals = np.array(batch_out_proposals)
-    return batch_normalized_boxes, batch_out_proposals
 
+    return batch_normalized_props, batch_out_proposals
 
 
 def pyramid_roi_align(feature_maps, rois, pool_size, pyramid_levels, dim):
     """
     Implements ROI Pooling on multiple levels of the feature pyramid.
     :param feature_maps: list of feature maps, each of shape (b, c, y, x , (z))
     :param rois: proposals (normalized coords.) as returned by RPN. contain info about original batch element allocation.
     (n_proposals, (y1, x1, y2, x2, (z1), (z2), batch_ixs)
     :param pool_size: list of poolsizes in dims: [x, y, (z)]
     :param pyramid_levels: list. [0, 1, 2, ...]
     :return: pooled: pooled feature map rois (n_proposals, c, poolsize_y, poolsize_x, (poolsize_z))
 
     Output:
     Pooled regions in the shape: [num_boxes, height, width, channels].
     The width and height are those specific in the pool_shape in the layer
     constructor.
     """
     boxes = rois[:, :dim*2]
     batch_ixs = rois[:, dim*2]
 
     # Assign each ROI to a level in the pyramid based on the ROI area.
     if dim == 2:
         y1, x1, y2, x2 = boxes.chunk(4, dim=1)
     else:
         y1, x1, y2, x2, z1, z2 = boxes.chunk(6, dim=1)
 
     h = y2 - y1
     w = x2 - x1
 
     # Equation 1 in https://arxiv.org/abs/1612.03144. Account for
     # the fact that our coordinates are normalized here.
     # divide sqrt(h*w) by 1 instead image_area.
-    roi_level = (4 + mutils.log2(torch.sqrt(h*w))).round().int().clamp(pyramid_levels[0], pyramid_levels[-1])
-    # if Pyramid contains additional level P6, adapt the roi_level assignemnt accordingly.
+    roi_level = (4 + torch.log2(torch.sqrt(h*w))).round().int().clamp(pyramid_levels[0], pyramid_levels[-1])
+    # if Pyramid contains additional level P6, adapt the roi_level assignment accordingly.
     if len(pyramid_levels) == 5:
         roi_level[h*w > 0.65] = 5
 
     # Loop through levels and apply ROI pooling to each.
     pooled = []
     box_to_level = []
+    fmap_shapes = [f.shape for f in feature_maps]
     for level_ix, level in enumerate(pyramid_levels):
         ix = roi_level == level
         if not ix.any():
             continue
         ix = torch.nonzero(ix)[:, 0]
         level_boxes = boxes[ix, :]
         # re-assign rois to feature map of original batch element.
         ind = batch_ixs[ix].int()
 
         # Keep track of which box is mapped to which level
         box_to_level.append(ix)
 
         # Stop gradient propogation to ROI proposals
         level_boxes = level_boxes.detach()
-
-        # Crop and Resize
-        # From Mask R-CNN paper: "We sample four regular locations, so
-        # that we can evaluate either max or average pooling. In fact,
-        # interpolating only a single value at each bin center (without
-        # pooling) is nearly as effective."
-        #
-        # Here we use the simplified approach of a single value per bin,
-        # which is how is done in tf.crop_and_resize()
-        #
-        # Also fixed a bug from original implementation, reported in:
-        # https://hackernoon.com/how-tensorflows-tf-image-resize-stole-60-days-of-my-life-aba5eb093f35
-
         if len(pool_size) == 2:
-            pooled_features = ra2D(pool_size[0], pool_size[1], 0)(feature_maps[level_ix], level_boxes, ind)
+            # remap to feature map coordinate system
+            y_exp, x_exp = fmap_shapes[level_ix][2:]  # exp = expansion
+            level_boxes.mul_(torch.tensor([y_exp, x_exp, y_exp, x_exp], dtype=torch.float32).cuda())
+            pooled_features = roi_align.roi_align_2d(feature_maps[level_ix],
+                                                     torch.cat((ind.unsqueeze(1).float(), level_boxes), dim=1),
+                                                     pool_size)
         else:
-            pooled_features = ra3D(pool_size[0], pool_size[1], pool_size[2], 0)(feature_maps[level_ix], level_boxes, ind)
-
+            y_exp, x_exp, z_exp = fmap_shapes[level_ix][2:]
+            level_boxes.mul_(torch.tensor([y_exp, x_exp, y_exp, x_exp, z_exp, z_exp], dtype=torch.float32).cuda())
+            pooled_features = roi_align.roi_align_3d(feature_maps[level_ix],
+                                                     torch.cat((ind.unsqueeze(1).float(), level_boxes), dim=1),
+                                                     pool_size)
         pooled.append(pooled_features)
 
 
     # Pack pooled features into one tensor
     pooled = torch.cat(pooled, dim=0)
 
     # Pack box_to_level mapping into one array and add another
     # column representing the order of pooled boxes
     box_to_level = torch.cat(box_to_level, dim=0)
 
     # Rearrange pooled features to match the order of the original boxes
     _, box_to_level = torch.sort(box_to_level)
     pooled = pooled[box_to_level, :, :]
 
     return pooled
 
 
-
 def detection_target_layer(batch_proposals, batch_mrcnn_class_scores, batch_gt_class_ids, batch_gt_boxes, batch_gt_masks, cf):
     """
     Subsamples proposals for mrcnn losses and generates targets. Sampling is done per batch element, seems to have positive
     effects on training, as opposed to sampling over entire batch. Negatives are sampled via stochastic-hard-example-mining
     (SHEM), where a number of negative proposals are drawn from larger pool of highest scoring proposals for stochasticity.
     Scoring is obtained here as the max over all foreground probabilities as returned by mrcnn_classifier (worked better than
     loss-based class balancing methods like "online-hard-example-mining" or "focal loss".)
 
     :param batch_proposals: (n_proposals, (y1, x1, y2, x2, (z1), (z2), batch_ixs).
     boxes as proposed by RPN. n_proposals here is determined by batch_size * POST_NMS_ROIS.
     :param batch_mrcnn_class_scores: (n_proposals, n_classes)
     :param batch_gt_class_ids: list over batch elements. Each element is a list over the corresponding roi target labels.
     :param batch_gt_boxes: list over batch elements. Each element is a list over the corresponding roi target coordinates.
     :param batch_gt_masks: list over batch elements. Each element is binary mask of shape (n_gt_rois, y, x, (z), c)
     :return: sample_indices: (n_sampled_rois) indices of sampled proposals to be used for loss functions.
     :return: target_class_ids: (n_sampled_rois)containing target class labels of sampled proposals.
     :return: target_deltas: (n_sampled_rois, 2 * dim) containing target deltas of sampled proposals for box refinement.
     :return: target_masks: (n_sampled_rois, y, x, (z)) containing target masks of sampled proposals.
     """
     # normalization of target coordinates
     if cf.dim == 2:
         h, w = cf.patch_size
         scale = torch.from_numpy(np.array([h, w, h, w])).float().cuda()
     else:
         h, w, z = cf.patch_size
         scale = torch.from_numpy(np.array([h, w, h, w, z, z])).float().cuda()
 
-
     positive_count = 0
     negative_count = 0
     sample_positive_indices = []
     sample_negative_indices = []
     sample_deltas = []
     sample_masks = []
     sample_class_ids = []
 
+    std_dev = torch.from_numpy(cf.bbox_std_dev).float().cuda()
+
     # loop over batch and get positive and negative sample rois.
     for b in range(len(batch_gt_class_ids)):
 
         gt_class_ids = torch.from_numpy(batch_gt_class_ids[b]).int().cuda()
         gt_masks = torch.from_numpy(batch_gt_masks[b]).float().cuda()
         if np.any(batch_gt_class_ids[b] > 0):  # skip roi selection for no gt images.
             gt_boxes = torch.from_numpy(batch_gt_boxes[b]).float().cuda() / scale
         else:
             gt_boxes = torch.FloatTensor().cuda()
 
         # get proposals and indices of current batch element.
         proposals = batch_proposals[batch_proposals[:, -1] == b][:, :-1]
         batch_element_indices = torch.nonzero(batch_proposals[:, -1] == b).squeeze(1)
 
         # Compute overlaps matrix [proposals, gt_boxes]
         if 0 not in gt_boxes.size():
             if gt_boxes.shape[1] == 4:
+                assert cf.dim == 2, "gt_boxes shape {} doesnt match cf.dim{}".format(gt_boxes.shape, cf.dim)
                 overlaps = mutils.bbox_overlaps_2D(proposals, gt_boxes)
             else:
+                assert cf.dim == 3, "gt_boxes shape {} doesnt match cf.dim{}".format(gt_boxes.shape, cf.dim)
                 overlaps = mutils.bbox_overlaps_3D(proposals, gt_boxes)
 
             # Determine postive and negative ROIs
             roi_iou_max = torch.max(overlaps, dim=1)[0]
             # 1. Positive ROIs are those with >= 0.5 IoU with a GT box
             positive_roi_bool = roi_iou_max >= (0.5 if cf.dim == 2 else 0.3)
             # 2. Negative ROIs are those with < 0.1 with every GT box.
             negative_roi_bool = roi_iou_max < (0.1 if cf.dim == 2 else 0.01)
         else:
             positive_roi_bool = torch.FloatTensor().cuda()
             negative_roi_bool = torch.from_numpy(np.array([1]*proposals.shape[0])).cuda()
 
         # Sample Positive ROIs
         if 0 not in torch.nonzero(positive_roi_bool).size():
             positive_indices = torch.nonzero(positive_roi_bool).squeeze(1)
             positive_samples = int(cf.train_rois_per_image * cf.roi_positive_ratio)
             rand_idx = torch.randperm(positive_indices.size()[0])
             rand_idx = rand_idx[:positive_samples].cuda()
             positive_indices = positive_indices[rand_idx]
             positive_samples = positive_indices.size()[0]
             positive_rois = proposals[positive_indices, :]
             # Assign positive ROIs to GT boxes.
             positive_overlaps = overlaps[positive_indices, :]
             roi_gt_box_assignment = torch.max(positive_overlaps, dim=1)[1]
             roi_gt_boxes = gt_boxes[roi_gt_box_assignment, :]
             roi_gt_class_ids = gt_class_ids[roi_gt_box_assignment]
 
             # Compute bbox refinement targets for positive ROIs
             deltas = mutils.box_refinement(positive_rois, roi_gt_boxes)
-            std_dev = torch.from_numpy(cf.bbox_std_dev).float().cuda()
             deltas /= std_dev
 
             # Assign positive ROIs to GT masks
-            roi_masks = gt_masks[roi_gt_box_assignment, :, :]
+            roi_masks = gt_masks[roi_gt_box_assignment].unsqueeze(1)
+            assert roi_masks.shape[-1] == 1
 
             # Compute mask targets
             boxes = positive_rois
-            box_ids = torch.arange(roi_masks.size()[0]).int().cuda()
+            box_ids = torch.arange(roi_masks.shape[0]).cuda().unsqueeze(1).float()
 
             if len(cf.mask_shape) == 2:
-                masks = ra2D(cf.mask_shape[0], cf.mask_shape[1], 0)(roi_masks.unsqueeze(1), boxes, box_ids)
+                # todo what are the dims of roi_masks? (n_matched_boxes_with_gts, 1 (dummy channel dim), y,x, 1 (WHY?))
+                masks = roi_align.roi_align_2d(roi_masks, torch.cat((box_ids, boxes), dim=1), cf.mask_shape)
             else:
-                masks = ra3D(cf.mask_shape[0], cf.mask_shape[1], cf.mask_shape[2], 0)(roi_masks.unsqueeze(1), boxes, box_ids)
+                masks = roi_align.roi_align_3d(roi_masks, torch.cat((box_ids, boxes), dim=1), cf.mask_shape)
 
             masks = masks.squeeze(1)
             # Threshold mask pixels at 0.5 to have GT masks be 0 or 1 to use with
             # binary cross entropy loss.
             masks = torch.round(masks)
 
             sample_positive_indices.append(batch_element_indices[positive_indices])
             sample_deltas.append(deltas)
             sample_masks.append(masks)
             sample_class_ids.append(roi_gt_class_ids)
             positive_count += positive_samples
         else:
             positive_samples = 0
 
         # Negative ROIs. Add enough to maintain positive:negative ratio, but at least 1. Sample via SHEM.
         if 0 not in torch.nonzero(negative_roi_bool).size():
             negative_indices = torch.nonzero(negative_roi_bool).squeeze(1)
             r = 1.0 / cf.roi_positive_ratio
             b_neg_count = np.max((int(r * positive_samples - positive_samples), 1))
             roi_probs_neg = batch_mrcnn_class_scores[batch_element_indices[negative_indices]]
             raw_sampled_indices = mutils.shem(roi_probs_neg, b_neg_count, cf.shem_poolsize)
             sample_negative_indices.append(batch_element_indices[negative_indices[raw_sampled_indices]])
             negative_count += raw_sampled_indices.size()[0]
 
     if len(sample_positive_indices) > 0:
         target_deltas = torch.cat(sample_deltas)
         target_masks = torch.cat(sample_masks)
         target_class_ids = torch.cat(sample_class_ids)
 
     # Pad target information with zeros for negative ROIs.
     if positive_count > 0 and negative_count > 0:
         sample_indices = torch.cat((torch.cat(sample_positive_indices), torch.cat(sample_negative_indices)), dim=0)
         zeros = torch.zeros(negative_count).int().cuda()
         target_class_ids = torch.cat([target_class_ids, zeros], dim=0)
         zeros = torch.zeros(negative_count, cf.dim * 2).cuda()
         target_deltas = torch.cat([target_deltas, zeros], dim=0)
         zeros = torch.zeros(negative_count, *cf.mask_shape).cuda()
         target_masks = torch.cat([target_masks, zeros], dim=0)
     elif positive_count > 0:
         sample_indices = torch.cat(sample_positive_indices)
     elif negative_count > 0:
         sample_indices = torch.cat(sample_negative_indices)
         zeros = torch.zeros(negative_count).int().cuda()
         target_class_ids = zeros
         zeros = torch.zeros(negative_count, cf.dim * 2).cuda()
         target_deltas = zeros
         zeros = torch.zeros(negative_count, *cf.mask_shape).cuda()
         target_masks = zeros
     else:
         sample_indices = torch.LongTensor().cuda()
         target_class_ids = torch.IntTensor().cuda()
         target_deltas = torch.FloatTensor().cuda()
         target_masks = torch.FloatTensor().cuda()
 
     return sample_indices, target_class_ids, target_deltas, target_masks
 
 
 ############################################################
 #  Output Handler
 ############################################################
 
-def refine_detections(rois, probs, deltas, batch_ixs, cf):
+# def refine_detections(rois, probs, deltas, batch_ixs, cf):
+#     """
+#     Refine classified proposals, filter overlaps and return final detections.
+#
+#     :param rois: (n_proposals, 2 * dim) normalized boxes as proposed by RPN. n_proposals = batch_size * POST_NMS_ROIS
+#     :param probs: (n_proposals, n_classes) softmax probabilities for all rois as predicted by mrcnn classifier.
+#     :param deltas: (n_proposals, n_classes, 2 * dim) box refinement deltas as predicted by mrcnn bbox regressor.
+#     :param batch_ixs: (n_proposals) batch element assignemnt info for re-allocation.
+#     :return: result: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score))
+#     """
+#     # class IDs per ROI. Since scores of all classes are of interest (not just max class), all are kept at this point.
+#     class_ids = []
+#     fg_classes = cf.head_classes - 1
+#     # repeat vectors to fill in predictions for all foreground classes.
+#     for ii in range(1, fg_classes + 1):
+#         class_ids += [ii] * rois.shape[0]
+#     class_ids = torch.from_numpy(np.array(class_ids)).cuda()
+#
+#     rois = rois.repeat(fg_classes, 1)
+#     probs = probs.repeat(fg_classes, 1)
+#     deltas = deltas.repeat(fg_classes, 1, 1)
+#     batch_ixs = batch_ixs.repeat(fg_classes)
+#
+#     # get class-specific scores and  bounding box deltas
+#     idx = torch.arange(class_ids.size()[0]).long().cuda()
+#     class_scores = probs[idx, class_ids]
+#     deltas_specific = deltas[idx, class_ids]
+#     batch_ixs = batch_ixs[idx]
+#
+#     # apply bounding box deltas. re-scale to image coordinates.
+#     std_dev = torch.from_numpy(np.reshape(cf.rpn_bbox_std_dev, [1, cf.dim * 2])).float().cuda()
+#     scale = torch.from_numpy(cf.scale).float().cuda()
+#     refined_rois = mutils.apply_box_deltas_2D(rois, deltas_specific * std_dev) * scale if cf.dim == 2 else \
+#         mutils.apply_box_deltas_3D(rois, deltas_specific * std_dev) * scale
+#
+#     # round and cast to int since we're deadling with pixels now
+#     refined_rois = mutils.clip_to_window(cf.window, refined_rois)
+#     refined_rois = torch.round(refined_rois)
+#
+#     # filter out low confidence boxes
+#     keep = idx
+#     keep_bool = (class_scores >= cf.model_min_confidence)
+#     if 0 not in torch.nonzero(keep_bool).size():
+#
+#         score_keep = torch.nonzero(keep_bool)[:, 0]
+#         pre_nms_class_ids = class_ids[score_keep]
+#         pre_nms_rois = refined_rois[score_keep]
+#         pre_nms_scores = class_scores[score_keep]
+#         pre_nms_batch_ixs = batch_ixs[score_keep]
+#
+#         for j, b in enumerate(mutils.unique1d(pre_nms_batch_ixs)):
+#
+#             bixs = torch.nonzero(pre_nms_batch_ixs == b)[:, 0]
+#             bix_class_ids = pre_nms_class_ids[bixs]
+#             bix_rois = pre_nms_rois[bixs]
+#             bix_scores = pre_nms_scores[bixs]
+#
+#             for i, class_id in enumerate(mutils.unique1d(bix_class_ids)):
+#
+#                 ixs = torch.nonzero(bix_class_ids == class_id)[:, 0]
+#                 # nms expects boxes sorted by score.
+#                 ix_rois = bix_rois[ixs]
+#                 ix_scores = bix_scores[ixs]
+#                 ix_scores, order = ix_scores.sort(descending=True)
+#                 ix_rois = ix_rois[order, :]
+#
+#                 if cf.dim == 2:
+#                     class_keep = nms_2D(torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1), cf.detection_nms_threshold)
+#                 else:
+#                     class_keep = nms_3D(torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1), cf.detection_nms_threshold)
+#
+#                 # map indices back.
+#                 class_keep = keep[score_keep[bixs[ixs[order[class_keep]]]]]
+#                 # merge indices over classes for current batch element
+#                 b_keep = class_keep if i == 0 else mutils.unique1d(torch.cat((b_keep, class_keep)))
+#
+#             # only keep top-k boxes of current batch-element
+#             top_ids = class_scores[b_keep].sort(descending=True)[1][:cf.model_max_instances_per_batch_element]
+#             b_keep = b_keep[top_ids]
+#
+#             # merge indices over batch elements.
+#             batch_keep = b_keep if j == 0 else mutils.unique1d(torch.cat((batch_keep, b_keep)))
+#
+#         keep = batch_keep
+#
+#     else:
+#         keep = torch.tensor([0]).long().cuda()
+#
+#     # arrange output
+#     result = torch.cat((refined_rois[keep],
+#                         batch_ixs[keep].unsqueeze(1),
+#                         class_ids[keep].unsqueeze(1).float(),
+#                         class_scores[keep].unsqueeze(1)), dim=1)
+#
+#     return result
+
+def refine_detections(cf, batch_ixs, rois, deltas, scores):
     """
-    Refine classified proposals, filter overlaps and return final detections.
+    Refine classified proposals (apply deltas to rpn rois), filter overlaps (nms) and return final detections.
 
     :param rois: (n_proposals, 2 * dim) normalized boxes as proposed by RPN. n_proposals = batch_size * POST_NMS_ROIS
-    :param probs: (n_proposals, n_classes) softmax probabilities for all rois as predicted by mrcnn classifier.
     :param deltas: (n_proposals, n_classes, 2 * dim) box refinement deltas as predicted by mrcnn bbox regressor.
-    :param batch_ixs: (n_proposals) batch element assignemnt info for re-allocation.
-    :return: result: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score))
+    :param batch_ixs: (n_proposals) batch element assignment info for re-allocation.
+    :param scores: (n_proposals, n_classes) probabilities for all classes per roi as predicted by mrcnn classifier.
+    :return: result: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score, *regression vector features))
     """
     # class IDs per ROI. Since scores of all classes are of interest (not just max class), all are kept at this point.
     class_ids = []
     fg_classes = cf.head_classes - 1
     # repeat vectors to fill in predictions for all foreground classes.
     for ii in range(1, fg_classes + 1):
         class_ids += [ii] * rois.shape[0]
     class_ids = torch.from_numpy(np.array(class_ids)).cuda()
 
+    batch_ixs = batch_ixs.repeat(fg_classes)
     rois = rois.repeat(fg_classes, 1)
-    probs = probs.repeat(fg_classes, 1)
     deltas = deltas.repeat(fg_classes, 1, 1)
-    batch_ixs = batch_ixs.repeat(fg_classes)
+    scores = scores.repeat(fg_classes, 1)
 
     # get class-specific scores and  bounding box deltas
     idx = torch.arange(class_ids.size()[0]).long().cuda()
-    class_scores = probs[idx, class_ids]
-    deltas_specific = deltas[idx, class_ids]
+    # using idx instead of slice [:,] squashes first dimension.
+    #len(class_ids)>scores.shape[1] --> probs is broadcasted by expansion from fg_classes-->len(class_ids)
     batch_ixs = batch_ixs[idx]
+    deltas_specific = deltas[idx, class_ids]
+    class_scores = scores[idx, class_ids]
 
     # apply bounding box deltas. re-scale to image coordinates.
     std_dev = torch.from_numpy(np.reshape(cf.rpn_bbox_std_dev, [1, cf.dim * 2])).float().cuda()
     scale = torch.from_numpy(cf.scale).float().cuda()
     refined_rois = mutils.apply_box_deltas_2D(rois, deltas_specific * std_dev) * scale if cf.dim == 2 else \
         mutils.apply_box_deltas_3D(rois, deltas_specific * std_dev) * scale
 
-    # round and cast to int since we're deadling with pixels now
+    # round and cast to int since we're dealing with pixels now
     refined_rois = mutils.clip_to_window(cf.window, refined_rois)
     refined_rois = torch.round(refined_rois)
 
     # filter out low confidence boxes
     keep = idx
     keep_bool = (class_scores >= cf.model_min_confidence)
-    if 0 not in torch.nonzero(keep_bool).size():
+    if not 0 in torch.nonzero(keep_bool).size():
 
         score_keep = torch.nonzero(keep_bool)[:, 0]
         pre_nms_class_ids = class_ids[score_keep]
         pre_nms_rois = refined_rois[score_keep]
         pre_nms_scores = class_scores[score_keep]
         pre_nms_batch_ixs = batch_ixs[score_keep]
 
         for j, b in enumerate(mutils.unique1d(pre_nms_batch_ixs)):
 
             bixs = torch.nonzero(pre_nms_batch_ixs == b)[:, 0]
             bix_class_ids = pre_nms_class_ids[bixs]
             bix_rois = pre_nms_rois[bixs]
             bix_scores = pre_nms_scores[bixs]
 
             for i, class_id in enumerate(mutils.unique1d(bix_class_ids)):
 
                 ixs = torch.nonzero(bix_class_ids == class_id)[:, 0]
                 # nms expects boxes sorted by score.
                 ix_rois = bix_rois[ixs]
                 ix_scores = bix_scores[ixs]
                 ix_scores, order = ix_scores.sort(descending=True)
                 ix_rois = ix_rois[order, :]
 
-                if cf.dim == 2:
-                    class_keep = nms_2D(torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1), cf.detection_nms_threshold)
-                else:
-                    class_keep = nms_3D(torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1), cf.detection_nms_threshold)
+                class_keep = nms.nms(ix_rois, ix_scores, cf.detection_nms_threshold)
 
                 # map indices back.
                 class_keep = keep[score_keep[bixs[ixs[order[class_keep]]]]]
                 # merge indices over classes for current batch element
                 b_keep = class_keep if i == 0 else mutils.unique1d(torch.cat((b_keep, class_keep)))
 
             # only keep top-k boxes of current batch-element
             top_ids = class_scores[b_keep].sort(descending=True)[1][:cf.model_max_instances_per_batch_element]
             b_keep = b_keep[top_ids]
 
             # merge indices over batch elements.
-            batch_keep = b_keep if j == 0 else mutils.unique1d(torch.cat((batch_keep, b_keep)))
+            batch_keep = b_keep  if j == 0 else mutils.unique1d(torch.cat((batch_keep, b_keep)))
 
         keep = batch_keep
 
     else:
         keep = torch.tensor([0]).long().cuda()
 
     # arrange output
-    result = torch.cat((refined_rois[keep],
-                        batch_ixs[keep].unsqueeze(1),
-                        class_ids[keep].unsqueeze(1).float(),
-                        class_scores[keep].unsqueeze(1)), dim=1)
+    output = [refined_rois[keep], batch_ixs[keep].unsqueeze(1)]
+    output += [class_ids[keep].unsqueeze(1).float(), class_scores[keep].unsqueeze(1)]
 
+    result = torch.cat(output, dim=1)
+    # shape: (n_keeps, catted feats), catted feats: [0:dim*2] are box_coords, [dim*2] are batch_ics,
+    # [dim*2+1] are class_ids, [dim*2+2] are scores, [dim*2+3:] are regression vector features (incl uncertainty)
     return result
 
 
 def get_results(cf, img_shape, detections, detection_masks, box_results_list=None, return_masks=True):
     """
     Restores batch dimension of merged detections, unmolds detections, creates and fills results dict.
     :param img_shape:
     :param detections: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score)
     :param detection_masks: (n_final_detections, n_classes, y, x, (z)) raw molded masks as returned by mask-head.
     :param box_results_list: None or list of output boxes for monitoring/plotting.
     each element is a list of boxes per batch element.
     :param return_masks: boolean. If True, full resolution masks are returned for all proposals (speed trade-off).
     :return: results_dict: dictionary with keys:
              'boxes': list over batch elements. each batch element is a list of boxes. each box is a dictionary:
                       [[{box_0}, ... {box_n}], [{box_0}, ... {box_n}], ...]
              'seg_preds': pixel-wise class predictions (b, 1, y, x, (z)) with values [0, 1] only fg. vs. bg for now.
              class-specific return of masks will come with implementation of instance segmentation evaluation.
     """
     detections = detections.cpu().data.numpy()
     if cf.dim == 2:
         detection_masks = detection_masks.permute(0, 2, 3, 1).cpu().data.numpy()
     else:
         detection_masks = detection_masks.permute(0, 2, 3, 4, 1).cpu().data.numpy()
 
     # restore batch dimension of merged detections using the batch_ix info.
     batch_ixs = detections[:, cf.dim*2]
     detections = [detections[batch_ixs == ix] for ix in range(img_shape[0])]
     mrcnn_mask = [detection_masks[batch_ixs == ix] for ix in range(img_shape[0])]
 
     # for test_forward, where no previous list exists.
     if box_results_list is None:
         box_results_list = [[] for _ in range(img_shape[0])]
 
     seg_preds = []
     # loop over batch and unmold detections.
     for ix in range(img_shape[0]):
 
         if 0 not in detections[ix].shape:
             boxes = detections[ix][:, :2 * cf.dim].astype(np.int32)
             class_ids = detections[ix][:, 2 * cf.dim + 1].astype(np.int32)
             scores = detections[ix][:, 2 * cf.dim + 2]
             masks = mrcnn_mask[ix][np.arange(boxes.shape[0]), ..., class_ids]
 
             # Filter out detections with zero area. Often only happens in early
             # stages of training when the network weights are still a bit random.
             if cf.dim == 2:
                 exclude_ix = np.where((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0]
             else:
                 exclude_ix = np.where(
                     (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 5] - boxes[:, 4]) <= 0)[0]
 
             if exclude_ix.shape[0] > 0:
                 boxes = np.delete(boxes, exclude_ix, axis=0)
                 class_ids = np.delete(class_ids, exclude_ix, axis=0)
                 scores = np.delete(scores, exclude_ix, axis=0)
                 masks = np.delete(masks, exclude_ix, axis=0)
 
             # Resize masks to original image size and set boundary threshold.
             full_masks = []
             permuted_image_shape = list(img_shape[2:]) + [img_shape[1]]
             if return_masks:
                 for i in range(masks.shape[0]):
                     # Convert neural network mask to full size mask.
                     full_masks.append(mutils.unmold_mask_2D(masks[i], boxes[i], permuted_image_shape)
                     if cf.dim == 2 else mutils.unmold_mask_3D(masks[i], boxes[i], permuted_image_shape))
             # if masks are returned, take max over binary full masks of all predictions in this image.
             # right now only binary masks for plotting/monitoring. for instance segmentation return all proposal maks.
             final_masks = np.max(np.array(full_masks), 0) if len(full_masks) > 0 else np.zeros(
                 (*permuted_image_shape[:-1],))
 
             # add final perdictions to results.
             if 0 not in boxes.shape:
                 for ix2, score in enumerate(scores):
                     box_results_list[ix].append({'box_coords': boxes[ix2], 'box_score': score,
                                                  'box_type': 'det', 'box_pred_class_id': class_ids[ix2]})
         else:
             # pad with zero dummy masks.
             final_masks = np.zeros(img_shape[2:])
 
         seg_preds.append(final_masks)
 
     # create and fill results dictionary.
     results_dict = {'boxes': box_results_list,
                     'seg_preds': np.round(np.array(seg_preds))[:, np.newaxis].astype('uint8')}
 
     return results_dict
 
 
 ############################################################
 #  Mask R-CNN Class
 ############################################################
 
 class net(nn.Module):
 
 
     def __init__(self, cf, logger):
 
         super(net, self).__init__()
         self.cf = cf
         self.logger = logger
         self.build()
 
         if self.cf.weight_init is not None:
             logger.info("using pytorch weight init of type {}".format(self.cf.weight_init))
             mutils.initialize_weights(self)
         else:
             logger.info("using default pytorch weight init")
 
 
     def build(self):
         """Build Mask R-CNN architecture."""
 
         # Image size must be dividable by 2 multiple times.
         h, w = self.cf.patch_size[:2]
         if h / 2**5 != int(h / 2**5) or w / 2**5 != int(w / 2**5):
             raise Exception("Image size must be dividable by 2 at least 5 times "
                             "to avoid fractions when downscaling and upscaling."
                             "For example, use 256, 320, 384, 448, 512, ... etc. ")
         if len(self.cf.patch_size) == 3:
             d = self.cf.patch_size[2]
             if d / 2**3 != int(d / 2**3):
                 raise Exception("Image z dimension must be dividable by 2 at least 3 times "
                                 "to avoid fractions when downscaling and upscaling.")
 
 
 
         # instanciate abstract multi dimensional conv class and backbone class.
         conv = mutils.NDConvGenerator(self.cf.dim)
         backbone = utils.import_module('bbone', self.cf.backbone_path)
 
         # build Anchors, FPN, RPN, Classifier / Bbox-Regressor -head, Mask-head
         self.np_anchors = mutils.generate_pyramid_anchors(self.logger, self.cf)
         self.anchors = torch.from_numpy(self.np_anchors).float().cuda()
         self.fpn = backbone.FPN(self.cf, conv)
         self.rpn = RPN(self.cf, conv)
         self.classifier = Classifier(self.cf, conv)
         self.mask = Mask(self.cf, conv)
 
 
     def train_forward(self, batch, is_validation=False):
         """
         train method (also used for validation monitoring). wrapper around forward pass of network. prepares input data
         for processing, computes losses, and stores outputs in a dictionary.
         :param batch: dictionary containing 'data', 'seg', etc.
         :return: results_dict: dictionary with keys:
                 'boxes': list over batch elements. each batch element is a list of boxes. each box is a dictionary:
                         [[{box_0}, ... {box_n}], [{box_0}, ... {box_n}], ...]
                 'seg_preds': pixel-wise class predictions (b, 1, y, x, (z)) with values [0, n_classes].
                 'monitor_values': dict of values to be monitored.
         """
         img = batch['data']
         gt_class_ids = batch['roi_labels']
         gt_boxes = batch['bb_target']
         axes = (0, 2, 3, 1) if self.cf.dim == 2 else (0, 2, 3, 4, 1)
         gt_masks = [np.transpose(batch['roi_masks'][ii], axes=axes) for ii in range(len(batch['roi_masks']))]
 
 
         img = torch.from_numpy(img).float().cuda()
         batch_rpn_class_loss = torch.FloatTensor([0]).cuda()
         batch_rpn_bbox_loss = torch.FloatTensor([0]).cuda()
 
         # list of output boxes for monitoring/plotting. each element is a list of boxes per batch element.
         box_results_list = [[] for _ in range(img.shape[0])]
 
         #forward passes. 1. general forward pass, where no activations are saved in second stage (for performance
         # monitoring and loss sampling). 2. second stage forward pass of sampled rois with stored activations for backprop.
         rpn_class_logits, rpn_pred_deltas, proposal_boxes, detections, detection_masks = self.forward(img)
         mrcnn_class_logits, mrcnn_pred_deltas, mrcnn_pred_mask, target_class_ids, mrcnn_target_deltas, target_mask,  \
         sample_proposals = self.loss_samples_forward(gt_class_ids, gt_boxes, gt_masks)
 
         # loop over batch
         for b in range(img.shape[0]):
             if len(gt_boxes[b]) > 0:
 
                 # add gt boxes to output list for monitoring.
                 for ix in range(len(gt_boxes[b])):
                     box_results_list[b].append({'box_coords': batch['bb_target'][b][ix],
                                                 'box_label': batch['roi_labels'][b][ix], 'box_type': 'gt'})
 
                 # match gt boxes with anchors to generate targets for RPN losses.
                 rpn_match, rpn_target_deltas = mutils.gt_anchor_matching(self.cf, self.np_anchors, gt_boxes[b])
 
                 # add positive anchors used for loss to output list for monitoring.
                 pos_anchors = mutils.clip_boxes_numpy(self.np_anchors[np.argwhere(rpn_match == 1)][:, 0], img.shape[2:])
                 for p in pos_anchors:
                     box_results_list[b].append({'box_coords': p, 'box_type': 'pos_anchor'})
 
             else:
                 rpn_match = np.array([-1]*self.np_anchors.shape[0])
                 rpn_target_deltas = np.array([0])
 
-            rpn_match = torch.from_numpy(rpn_match).cuda()
+            rpn_match_gpu = torch.from_numpy(rpn_match).cuda()
             rpn_target_deltas = torch.from_numpy(rpn_target_deltas).float().cuda()
 
             # compute RPN losses.
-            rpn_class_loss, neg_anchor_ix = compute_rpn_class_loss(rpn_match, rpn_class_logits[b], self.cf.shem_poolsize)
-            rpn_bbox_loss = compute_rpn_bbox_loss(rpn_target_deltas, rpn_pred_deltas[b], rpn_match)
+            rpn_class_loss, neg_anchor_ix = compute_rpn_class_loss(rpn_match_gpu, rpn_class_logits[b], self.cf.shem_poolsize)
+            rpn_bbox_loss = compute_rpn_bbox_loss(rpn_target_deltas, rpn_pred_deltas[b], rpn_match_gpu)
             batch_rpn_class_loss += rpn_class_loss / img.shape[0]
             batch_rpn_bbox_loss += rpn_bbox_loss / img.shape[0]
 
             # add negative anchors used for loss to output list for monitoring.
-            neg_anchors = mutils.clip_boxes_numpy(self.np_anchors[np.argwhere(rpn_match == -1)][0, neg_anchor_ix], img.shape[2:])
+            neg_anchors = mutils.clip_boxes_numpy(self.np_anchors[rpn_match == -1][neg_anchor_ix], img.shape[2:])
             for n in neg_anchors:
                 box_results_list[b].append({'box_coords': n, 'box_type': 'neg_anchor'})
 
             # add highest scoring proposals to output list for monitoring.
             rpn_proposals = proposal_boxes[b][proposal_boxes[b, :, -1].argsort()][::-1]
             for r in rpn_proposals[:self.cf.n_plot_rpn_props, :-1]:
                 box_results_list[b].append({'box_coords': r, 'box_type': 'prop'})
 
         # add positive and negative roi samples used for mrcnn losses to output list for monitoring.
         if 0 not in sample_proposals.shape:
             rois = mutils.clip_to_window(self.cf.window, sample_proposals).cpu().data.numpy()
             for ix, r in enumerate(rois):
                 box_results_list[int(r[-1])].append({'box_coords': r[:-1] * self.cf.scale,
                                             'box_type': 'pos_class' if target_class_ids[ix] > 0 else 'neg_class'})
 
         batch_rpn_class_loss = batch_rpn_class_loss
         batch_rpn_bbox_loss = batch_rpn_bbox_loss
 
         # compute mrcnn losses.
         mrcnn_class_loss = compute_mrcnn_class_loss(target_class_ids, mrcnn_class_logits)
         mrcnn_bbox_loss = compute_mrcnn_bbox_loss(mrcnn_target_deltas, mrcnn_pred_deltas, target_class_ids)
 
         # mrcnn can be run without pixelwise annotations available (Faster R-CNN mode).
         # In this case, the mask_loss is taken out of training.
         if not self.cf.frcnn_mode:
             mrcnn_mask_loss = compute_mrcnn_mask_loss(target_mask, mrcnn_pred_mask, target_class_ids)
         else:
             mrcnn_mask_loss = torch.FloatTensor([0]).cuda()
 
         loss = batch_rpn_class_loss + batch_rpn_bbox_loss + mrcnn_class_loss + mrcnn_bbox_loss + mrcnn_mask_loss
 
         # monitor RPN performance: detection count = the number of correctly matched proposals per fg-class.
         dcount = [list(target_class_ids.cpu().data.numpy()).count(c) for c in np.arange(self.cf.head_classes)[1:]]
 
 
 
         # run unmolding of predictions for monitoring and merge all results to one dictionary.
         return_masks = self.cf.return_masks_in_val if is_validation else False
         results_dict = get_results(self.cf, img.shape, detections, detection_masks,
                                    box_results_list, return_masks=return_masks)
 
         results_dict['torch_loss'] = loss
         results_dict['monitor_values'] = {'loss': loss.item(), 'class_loss': mrcnn_class_loss.item()}
 
         results_dict['logger_string'] =  \
             "loss: {0:.2f}, rpn_class: {1:.2f}, rpn_bbox: {2:.2f}, mrcnn_class: {3:.2f}, mrcnn_bbox: {4:.2f}, " \
             "mrcnn_mask: {5:.2f}, dcount {6}".format(loss.item(), batch_rpn_class_loss.item(),
                                                      batch_rpn_bbox_loss.item(), mrcnn_class_loss.item(),
                                                      mrcnn_bbox_loss.item(), mrcnn_mask_loss.item(), dcount)
 
         return results_dict
 
 
     def test_forward(self, batch, return_masks=True):
         """
         test method. wrapper around forward pass of network without usage of any ground truth information.
         prepares input data for processing and stores outputs in a dictionary.
         :param batch: dictionary containing 'data'
         :param return_masks: boolean. If True, full resolution masks are returned for all proposals (speed trade-off).
         :return: results_dict: dictionary with keys:
                'boxes': list over batch elements. each batch element is a list of boxes. each box is a dictionary:
                        [[{box_0}, ... {box_n}], [{box_0}, ... {box_n}], ...]
                'seg_preds': pixel-wise class predictions (b, 1, y, x, (z)) with values [0, n_classes]
         """
         img = batch['data']
         img = torch.from_numpy(img).float().cuda()
         _, _, _, detections, detection_masks = self.forward(img)
         results_dict = get_results(self.cf, img.shape, detections, detection_masks, return_masks=return_masks)
         return results_dict
 
 
     def forward(self, img, is_training=True):
         """
         :param img: input images (b, c, y, x, (z)).
         :return: rpn_pred_logits: (b, n_anchors, 2)
         :return: rpn_pred_deltas: (b, n_anchors, (y, x, (z), log(h), log(w), (log(d))))
         :return: batch_proposal_boxes: (b, n_proposals, (y1, x1, y2, x2, (z1), (z2), batch_ix)) only for monitoring/plotting.
         :return: detections: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score)
         :return: detection_masks: (n_final_detections, n_classes, y, x, (z)) raw molded masks as returned by mask-head.
         """
         # extract features.
         fpn_outs = self.fpn(img)
         rpn_feature_maps = [fpn_outs[i] for i in self.cf.pyramid_levels]
         self.mrcnn_feature_maps = rpn_feature_maps
 
         # loop through pyramid layers and apply RPN.
         layer_outputs = []  # list of lists
         for p in rpn_feature_maps:
             layer_outputs.append(self.rpn(p))
 
         # concatenate layer outputs.
         # convert from list of lists of level outputs to list of lists of outputs across levels.
         # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
         outputs = list(zip(*layer_outputs))
         outputs = [torch.cat(list(o), dim=1) for o in outputs]
         rpn_pred_logits, rpn_pred_probs, rpn_pred_deltas = outputs
 
         # generate proposals: apply predicted deltas to anchors and filter by foreground scores from RPN classifier.
         proposal_count = self.cf.post_nms_rois_training if is_training else self.cf.post_nms_rois_inference
-        batch_rpn_rois, batch_proposal_boxes = proposal_layer(rpn_pred_probs, rpn_pred_deltas, proposal_count, self.anchors, self.cf)
+        batch_rpn_rois, batch_proposal_boxes = refine_proposals(rpn_pred_probs, rpn_pred_deltas, proposal_count, self.anchors, self.cf)
 
         # merge batch dimension of proposals while storing allocation info in coordinate dimension.
         batch_ixs = torch.from_numpy(np.repeat(np.arange(batch_rpn_rois.shape[0]), batch_rpn_rois.shape[1])).float().cuda()
         rpn_rois = batch_rpn_rois.view(-1, batch_rpn_rois.shape[2])
         self.rpn_rois_batch_info = torch.cat((rpn_rois, batch_ixs.unsqueeze(1)), dim=1)
 
         # this is the first of two forward passes in the second stage, where no activations are stored for backprop.
         # here, all proposals are forwarded (with virtual_batch_size = batch_size * post_nms_rois.)
         # for inference/monitoring as well as sampling of rois for the loss functions.
         # processed in chunks of roi_chunk_size to re-adjust to gpu-memory.
         chunked_rpn_rois = self.rpn_rois_batch_info.split(self.cf.roi_chunk_size)
         class_logits_list, bboxes_list = [], []
         with torch.no_grad():
             for chunk in chunked_rpn_rois:
                 chunk_class_logits, chunk_bboxes = self.classifier(self.mrcnn_feature_maps, chunk)
                 class_logits_list.append(chunk_class_logits)
                 bboxes_list.append(chunk_bboxes)
         batch_mrcnn_class_logits = torch.cat(class_logits_list, 0)
         batch_mrcnn_bbox = torch.cat(bboxes_list, 0)
         self.batch_mrcnn_class_scores = F.softmax(batch_mrcnn_class_logits, dim=1)
 
         # refine classified proposals, filter and return final detections.
-        detections = refine_detections(rpn_rois, self.batch_mrcnn_class_scores, batch_mrcnn_bbox, batch_ixs, self.cf, )
+        detections = refine_detections(self.cf, batch_ixs, rpn_rois, batch_mrcnn_bbox, self.batch_mrcnn_class_scores)
 
         # forward remaining detections through mask-head to generate corresponding masks.
         scale = [img.shape[2]] * 4 + [img.shape[-1]] * 2
         scale = torch.from_numpy(np.array(scale[:self.cf.dim * 2] + [1])[None]).float().cuda()
 
 
         detection_boxes = detections[:, :self.cf.dim * 2 + 1] / scale
         with torch.no_grad():
             detection_masks = self.mask(self.mrcnn_feature_maps, detection_boxes)
 
         return [rpn_pred_logits, rpn_pred_deltas, batch_proposal_boxes, detections, detection_masks]
 
 
     def loss_samples_forward(self, batch_gt_class_ids, batch_gt_boxes, batch_gt_masks):
         """
         this is the second forward pass through the second stage (features from stage one are re-used).
         samples few rois in detection_target_layer and forwards only those for loss computation.
         :param batch_gt_class_ids: list over batch elements. Each element is a list over the corresponding roi target labels.
         :param batch_gt_boxes: list over batch elements. Each element is a list over the corresponding roi target coordinates.
         :param batch_gt_masks: list over batch elements. Each element is binary mask of shape (n_gt_rois, y, x, (z), c)
         :return: sample_logits: (n_sampled_rois, n_classes) predicted class scores.
         :return: sample_boxes: (n_sampled_rois, n_classes, 2 * dim) predicted corrections to be applied to proposals for refinement.
         :return: sample_mask: (n_sampled_rois, n_classes, y, x, (z)) predicted masks per class and proposal.
         :return: sample_target_class_ids: (n_sampled_rois) target class labels of sampled proposals.
         :return: sample_target_deltas: (n_sampled_rois, 2 * dim) target deltas of sampled proposals for box refinement.
         :return: sample_target_masks: (n_sampled_rois, y, x, (z)) target masks of sampled proposals.
         :return: sample_proposals: (n_sampled_rois, 2 * dim) RPN output for sampled proposals. only for monitoring/plotting.
         """
         # sample rois for loss and get corresponding targets for all Mask R-CNN head network losses.
         sample_ix, sample_target_class_ids, sample_target_deltas, sample_target_mask = \
             detection_target_layer(self.rpn_rois_batch_info, self.batch_mrcnn_class_scores,
                                    batch_gt_class_ids, batch_gt_boxes, batch_gt_masks, self.cf)
 
         # re-use feature maps and RPN output from first forward pass.
         sample_proposals = self.rpn_rois_batch_info[sample_ix]
         if 0 not in sample_proposals.size():
             sample_logits, sample_boxes = self.classifier(self.mrcnn_feature_maps, sample_proposals)
             sample_mask = self.mask(self.mrcnn_feature_maps, sample_proposals)
         else:
             sample_logits = torch.FloatTensor().cuda()
             sample_boxes = torch.FloatTensor().cuda()
             sample_mask = torch.FloatTensor().cuda()
 
         return [sample_logits, sample_boxes, sample_mask, sample_target_class_ids, sample_target_deltas,
                 sample_target_mask, sample_proposals]
\ No newline at end of file
diff --git a/models/retina_net.py b/models/retina_net.py
index c1c8b0a..debff99 100644
--- a/models/retina_net.py
+++ b/models/retina_net.py
@@ -1,508 +1,504 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 """
 Retina Net. According to https://arxiv.org/abs/1708.02002
 Retina U-Net. According to https://arxiv.org/abs/1811.08661
 """
 
 import utils.model_utils as mutils
 import utils.exp_utils as utils
 import sys
-sys.path.append('../')
-from cuda_functions.nms_2D.pth_nms import nms_gpu as nms_2D
-from cuda_functions.nms_3D.pth_nms import nms_gpu as nms_3D
 
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.utils
 
+sys.path.append('..')
+from custom_extensions.nms import nms
+
 
 ############################################################
 #  Network Heads
 ############################################################
 
 class Classifier(nn.Module):
 
 
     def __init__(self, cf, conv):
         """
         Builds the classifier sub-network.
         """
         super(Classifier, self).__init__()
         self.dim = conv.dim
         self.n_classes = cf.head_classes
         n_input_channels = cf.end_filts
         n_features = cf.n_rpn_features
         n_output_channels = cf.n_anchors_per_pos * cf.head_classes
         anchor_stride = cf.rpn_anchor_stride
 
         self.conv_1 = conv(n_input_channels, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_2 = conv(n_features, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_3 = conv(n_features, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_4 = conv(n_features, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_final = conv(n_features, n_output_channels, ks=3, stride=anchor_stride, pad=1, relu=None)
 
 
     def forward(self, x):
         """
         :param x: input feature map (b, in_c, y, x, (z))
         :return: class_logits (b, n_anchors, n_classes)
         """
         x = self.conv_1(x)
         x = self.conv_2(x)
         x = self.conv_3(x)
         x = self.conv_4(x)
         class_logits = self.conv_final(x)
 
         axes = (0, 2, 3, 1) if self.dim == 2 else (0, 2, 3, 4, 1)
         class_logits = class_logits.permute(*axes)
         class_logits = class_logits.contiguous()
         class_logits = class_logits.view(x.size()[0], -1, self.n_classes)
 
         return [class_logits]
 
 
 
 class BBRegressor(nn.Module):
 
 
     def __init__(self, cf, conv):
         """
         Builds the bb-regression sub-network.
         """
         super(BBRegressor, self).__init__()
         self.dim = conv.dim
         n_input_channels = cf.end_filts
         n_features = cf.n_rpn_features
         n_output_channels = cf.n_anchors_per_pos * self.dim * 2
         anchor_stride = cf.rpn_anchor_stride
 
         self.conv_1 = conv(n_input_channels, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_2 = conv(n_features, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_3 = conv(n_features, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_4 = conv(n_features, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_final = conv(n_features, n_output_channels, ks=3, stride=anchor_stride,
                                pad=1, relu=None)
 
     def forward(self, x):
         """
         :param x: input feature map (b, in_c, y, x, (z))
         :return: bb_logits (b, n_anchors, dim * 2)
         """
         x = self.conv_1(x)
         x = self.conv_2(x)
         x = self.conv_3(x)
         x = self.conv_4(x)
         bb_logits = self.conv_final(x)
 
         axes = (0, 2, 3, 1) if self.dim == 2 else (0, 2, 3, 4, 1)
         bb_logits = bb_logits.permute(*axes)
         bb_logits = bb_logits.contiguous()
         bb_logits = bb_logits.view(x.size()[0], -1, self.dim * 2)
 
         return [bb_logits]
 
 
 ############################################################
 #  Loss Functions
 ############################################################
 
 def compute_class_loss(anchor_matches, class_pred_logits, shem_poolsize=20):
     """
     :param anchor_matches: (n_anchors). [-1, 0, class_id] for negative, neutral, and positive matched anchors.
     :param class_pred_logits: (n_anchors, n_classes). logits from classifier sub-network.
     :param shem_poolsize: int. factor of top-k candidates to draw from per negative sample (online-hard-example-mining).
     :return: loss: torch tensor.
     :return: np_neg_ix: 1D array containing indices of the neg_roi_logits, which have been sampled for training.
     """
     # Positive and Negative anchors contribute to the loss,
     # but neutral anchors (match value = 0) don't.
     pos_indices = torch.nonzero(anchor_matches > 0)
     neg_indices = torch.nonzero(anchor_matches == -1)
 
     # get positive samples and calucalte loss.
     if 0 not in pos_indices.size():
         pos_indices = pos_indices.squeeze(1)
         roi_logits_pos = class_pred_logits[pos_indices]
         targets_pos = anchor_matches[pos_indices]
         pos_loss = F.cross_entropy(roi_logits_pos, targets_pos.long())
     else:
         pos_loss = torch.FloatTensor([0]).cuda()
 
     # get negative samples, such that the amount matches the number of positive samples, but at least 1.
     # get high scoring negatives by applying online-hard-example-mining.
     if 0 not in neg_indices.size():
         neg_indices = neg_indices.squeeze(1)
         roi_logits_neg = class_pred_logits[neg_indices]
         negative_count = np.max((1, pos_indices.size()[0]))
         roi_probs_neg = F.softmax(roi_logits_neg, dim=1)
         neg_ix = mutils.shem(roi_probs_neg, negative_count, shem_poolsize)
         neg_loss = F.cross_entropy(roi_logits_neg[neg_ix], torch.LongTensor([0] * neg_ix.shape[0]).cuda())
         # return the indices of negative samples, which contributed to the loss (for monitoring plots).
         np_neg_ix = neg_ix.cpu().data.numpy()
     else:
         neg_loss = torch.FloatTensor([0]).cuda()
         np_neg_ix = np.array([]).astype('int32')
 
     loss = (pos_loss + neg_loss) / 2
     return loss, np_neg_ix
 
 
 def compute_bbox_loss(target_deltas, pred_deltas, anchor_matches):
     """
     :param target_deltas:   (b, n_positive_anchors, (dy, dx, (dz), log(dh), log(dw), (log(dd)))).
     Uses 0 padding to fill in unsed bbox deltas.
     :param pred_deltas: predicted deltas from bbox regression head. (b, n_anchors, (dy, dx, (dz), log(dh), log(dw), (log(dd))))
     :param anchor_matches: (n_anchors). [-1, 0, class_id] for negative, neutral, and positive matched anchors.
     :return: loss: torch 1D tensor.
     """
     if 0 not in torch.nonzero(anchor_matches > 0).size():
 
         indices = torch.nonzero(anchor_matches > 0).squeeze(1)
         # Pick bbox deltas that contribute to the loss
         pred_deltas = pred_deltas[indices]
         # Trim target bounding box deltas to the same length as pred_deltas.
         target_deltas = target_deltas[:pred_deltas.size()[0], :]
         # Smooth L1 loss
         loss = F.smooth_l1_loss(pred_deltas, target_deltas)
     else:
         loss = torch.FloatTensor([0]).cuda()
 
     return loss
 
 
 ############################################################
 #  Output Handler
 ############################################################
 
 def refine_detections(anchors, probs, deltas, batch_ixs, cf):
     """
     Refine classified proposals, filter overlaps and return final
     detections. n_proposals here is typically a very large number: batch_size * n_anchors.
     This function is hence optimized on trimming down n_proposals.
     :param anchors: (n_anchors, 2 * dim)
     :param probs: (n_proposals, n_classes) softmax probabilities for all rois as predicted by classifier head.
     :param deltas: (n_proposals, n_classes, 2 * dim) box refinement deltas as predicted by bbox regressor head.
     :param batch_ixs: (n_proposals) batch element assignemnt info for re-allocation.
     :return: result: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score))
     """
-    anchors = anchors.repeat(len(np.unique(batch_ixs)), 1)
+    anchors = anchors.repeat(batch_ixs.unique().shape[0], 1)
 
     # flatten foreground probabilities, sort and trim down to highest confidences by pre_nms limit.
     fg_probs = probs[:, 1:].contiguous()
     flat_probs, flat_probs_order = fg_probs.view(-1).sort(descending=True)
     keep_ix = flat_probs_order[:cf.pre_nms_limit]
     # reshape indices to 2D index array with shape like fg_probs.
     keep_arr = torch.cat(((keep_ix / fg_probs.shape[1]).unsqueeze(1), (keep_ix % fg_probs.shape[1]).unsqueeze(1)), 1)
 
     pre_nms_scores = flat_probs[:cf.pre_nms_limit]
     pre_nms_class_ids = keep_arr[:, 1] + 1  # add background again.
     pre_nms_batch_ixs = batch_ixs[keep_arr[:, 0]]
     pre_nms_anchors = anchors[keep_arr[:, 0]]
     pre_nms_deltas = deltas[keep_arr[:, 0]]
     keep = torch.arange(pre_nms_scores.size()[0]).long().cuda()
 
     # apply bounding box deltas. re-scale to image coordinates.
     std_dev = torch.from_numpy(np.reshape(cf.rpn_bbox_std_dev, [1, cf.dim * 2])).float().cuda()
     scale = torch.from_numpy(cf.scale).float().cuda()
     refined_rois = mutils.apply_box_deltas_2D(pre_nms_anchors / scale, pre_nms_deltas * std_dev) * scale \
         if cf.dim == 2 else mutils.apply_box_deltas_3D(pre_nms_anchors / scale, pre_nms_deltas * std_dev) * scale
 
     # round and cast to int since we're deadling with pixels now
     refined_rois = mutils.clip_to_window(cf.window, refined_rois)
     pre_nms_rois = torch.round(refined_rois)
     for j, b in enumerate(mutils.unique1d(pre_nms_batch_ixs)):
 
         bixs = torch.nonzero(pre_nms_batch_ixs == b)[:, 0]
         bix_class_ids = pre_nms_class_ids[bixs]
         bix_rois = pre_nms_rois[bixs]
         bix_scores = pre_nms_scores[bixs]
 
         for i, class_id in enumerate(mutils.unique1d(bix_class_ids)):
 
             ixs = torch.nonzero(bix_class_ids == class_id)[:, 0]
             # nms expects boxes sorted by score.
             ix_rois = bix_rois[ixs]
             ix_scores = bix_scores[ixs]
             ix_scores, order = ix_scores.sort(descending=True)
             ix_rois = ix_rois[order, :]
             ix_scores = ix_scores
 
-            if cf.dim == 2:
-                class_keep = nms_2D(torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1), cf.detection_nms_threshold)
-            else:
-                class_keep = nms_3D(torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1), cf.detection_nms_threshold)
+            class_keep = nms.nms(ix_rois, ix_scores, cf.detection_nms_threshold)
 
             # map indices back.
             class_keep = keep[bixs[ixs[order[class_keep]]]]
             # merge indices over classes for current batch element
             b_keep = class_keep if i == 0 else mutils.unique1d(torch.cat((b_keep, class_keep)))
 
         # only keep top-k boxes of current batch-element.
         top_ids = pre_nms_scores[b_keep].sort(descending=True)[1][:cf.model_max_instances_per_batch_element]
         b_keep = b_keep[top_ids]
         # merge indices over batch elements.
         batch_keep = b_keep if j == 0 else mutils.unique1d(torch.cat((batch_keep, b_keep)))
 
     keep = batch_keep
 
     # arrange output.
     result = torch.cat((pre_nms_rois[keep],
                         pre_nms_batch_ixs[keep].unsqueeze(1).float(),
                         pre_nms_class_ids[keep].unsqueeze(1).float(),
                         pre_nms_scores[keep].unsqueeze(1)), dim=1)
 
     return result
 
 
 
 def get_results(cf, img_shape, detections, seg_logits, box_results_list=None):
     """
     Restores batch dimension of merged detections, unmolds detections, creates and fills results dict.
     :param img_shape:
     :param detections: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score)
     :param box_results_list: None or list of output boxes for monitoring/plotting.
     each element is a list of boxes per batch element.
     :return: results_dict: dictionary with keys:
              'boxes': list over batch elements. each batch element is a list of boxes. each box is a dictionary:
                       [[{box_0}, ... {box_n}], [{box_0}, ... {box_n}], ...]
              'seg_preds': pixel-wise class predictions (b, 1, y, x, (z)) with values [0, ..., n_classes] for
                           retina_unet and dummy array for retina_net.
     """
     detections = detections.cpu().data.numpy()
     batch_ixs = detections[:, cf.dim*2]
     detections = [detections[batch_ixs == ix] for ix in range(img_shape[0])]
 
     # for test_forward, where no previous list exists.
     if box_results_list is None:
         box_results_list = [[] for _ in range(img_shape[0])]
 
     for ix in range(img_shape[0]):
 
         if 0 not in detections[ix].shape:
 
             boxes = detections[ix][:, :2 * cf.dim].astype(np.int32)
             class_ids = detections[ix][:, 2 * cf.dim + 1].astype(np.int32)
             scores = detections[ix][:, 2 * cf.dim + 2]
 
             # Filter out detections with zero area. Often only happens in early
             # stages of training when the network weights are still a bit random.
             if cf.dim == 2:
                 exclude_ix = np.where((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0]
             else:
                 exclude_ix = np.where(
                     (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 5] - boxes[:, 4]) <= 0)[0]
 
             if exclude_ix.shape[0] > 0:
                 boxes = np.delete(boxes, exclude_ix, axis=0)
                 class_ids = np.delete(class_ids, exclude_ix, axis=0)
                 scores = np.delete(scores, exclude_ix, axis=0)
 
             if 0 not in boxes.shape:
                 for ix2, score in enumerate(scores):
                     if score >= cf.model_min_confidence:
                         box_results_list[ix].append({'box_coords': boxes[ix2],
                                                      'box_score': score,
                                                      'box_type': 'det',
                                                      'box_pred_class_id': class_ids[ix2]})
 
     results_dict = {'boxes': box_results_list}
     if seg_logits is None:
         # output dummy segmentation for retina_net.
         results_dict['seg_preds'] = np.zeros(img_shape)[:, 0][:, np.newaxis]
     else:
         # output label maps for retina_unet.
         results_dict['seg_preds'] = F.softmax(seg_logits, 1).argmax(1).cpu().data.numpy()[:, np.newaxis].astype('uint8')
 
     return results_dict
 
 
 ############################################################
 #  Retina (U-)Net Class
 ############################################################
 
 
 class net(nn.Module):
 
 
     def __init__(self, cf, logger):
 
         super(net, self).__init__()
         self.cf = cf
         self.logger = logger
         self.build()
         if self.cf.weight_init is not None:
             logger.info("using pytorch weight init of type {}".format(self.cf.weight_init))
             mutils.initialize_weights(self)
         else:
             logger.info("using default pytorch weight init")
 
     def build(self):
         """
         Build Retina Net architecture.
         """
 
         # Image size must be dividable by 2 multiple times.
         h, w = self.cf.patch_size[:2]
         if h / 2 ** 5 != int(h / 2 ** 5) or w / 2 ** 5 != int(w / 2 ** 5):
             raise Exception("Image size must be dividable by 2 at least 5 times "
                             "to avoid fractions when downscaling and upscaling."
                             "For example, use 256, 320, 384, 448, 512, ... etc. ")
 
         # instanciate abstract multi dimensional conv class and backbone model.
         conv = mutils.NDConvGenerator(self.cf.dim)
         backbone = utils.import_module('bbone', self.cf.backbone_path)
 
         # build Anchors, FPN, Classifier / Bbox-Regressor -head
         self.np_anchors = mutils.generate_pyramid_anchors(self.logger, self.cf)
         self.anchors = torch.from_numpy(self.np_anchors).float().cuda()
         self.Fpn = backbone.FPN(self.cf, conv, operate_stride1=self.cf.operate_stride1)
         self.Classifier = Classifier(self.cf, conv)
         self.BBRegressor = BBRegressor(self.cf, conv)
 
 
     def train_forward(self, batch, **kwargs):
         """
         train method (also used for validation monitoring). wrapper around forward pass of network. prepares input data
         for processing, computes losses, and stores outputs in a dictionary.
         :param batch: dictionary containing 'data', 'seg', etc.
         :return: results_dict: dictionary with keys:
                 'boxes': list over batch elements. each batch element is a list of boxes. each box is a dictionary:
                         [[{box_0}, ... {box_n}], [{box_0}, ... {box_n}], ...]
                 'seg_preds': pixelwise segmentation output (b, c, y, x, (z)) with values [0, .., n_classes].
                 'monitor_values': dict of values to be monitored.
         """
         img = batch['data']
         gt_class_ids = batch['roi_labels']
         gt_boxes = batch['bb_target']
-        var_seg_ohe = torch.FloatTensor(mutils.get_one_hot_encoding(batch['seg'], self.cf.num_seg_classes)).cuda()
-        var_seg = torch.LongTensor(batch['seg']).cuda()
 
         img = torch.from_numpy(img).float().cuda()
         batch_class_loss = torch.FloatTensor([0]).cuda()
         batch_bbox_loss = torch.FloatTensor([0]).cuda()
 
         # list of output boxes for monitoring/plotting. each element is a list of boxes per batch element.
         box_results_list = [[] for _ in range(img.shape[0])]
         detections, class_logits, pred_deltas, seg_logits = self.forward(img)
 
         # loop over batch
         for b in range(img.shape[0]):
 
             # add gt boxes to results dict for monitoring.
             if len(gt_boxes[b]) > 0:
                 for ix in range(len(gt_boxes[b])):
                     box_results_list[b].append({'box_coords': batch['bb_target'][b][ix],
                                                 'box_label': batch['roi_labels'][b][ix], 'box_type': 'gt'})
 
                 # match gt boxes with anchors to generate targets.
                 anchor_class_match, anchor_target_deltas = mutils.gt_anchor_matching(
                     self.cf, self.np_anchors, gt_boxes[b], gt_class_ids[b])
 
                 # add positive anchors used for loss to results_dict for monitoring.
                 pos_anchors = mutils.clip_boxes_numpy(
                     self.np_anchors[np.argwhere(anchor_class_match > 0)][:, 0], img.shape[2:])
                 for p in pos_anchors:
                     box_results_list[b].append({'box_coords': p, 'box_type': 'pos_anchor'})
 
             else:
                 anchor_class_match = np.array([-1]*self.np_anchors.shape[0])
                 anchor_target_deltas = np.array([0])
 
             anchor_class_match = torch.from_numpy(anchor_class_match).cuda()
             anchor_target_deltas = torch.from_numpy(anchor_target_deltas).float().cuda()
 
             # compute losses.
             class_loss, neg_anchor_ix = compute_class_loss(anchor_class_match, class_logits[b])
             bbox_loss = compute_bbox_loss(anchor_target_deltas, pred_deltas[b], anchor_class_match)
 
             # add negative anchors used for loss to results_dict for monitoring.
             neg_anchors = mutils.clip_boxes_numpy(
-                self.np_anchors[np.argwhere(anchor_class_match == -1)][0, neg_anchor_ix], img.shape[2:])
+                self.np_anchors[np.argwhere(anchor_class_match.cpu().numpy() == -1)][neg_anchor_ix, 0],
+                img.shape[2:])
             for n in neg_anchors:
                 box_results_list[b].append({'box_coords': n, 'box_type': 'neg_anchor'})
 
             batch_class_loss += class_loss / img.shape[0]
             batch_bbox_loss += bbox_loss / img.shape[0]
 
         results_dict = get_results(self.cf, img.shape, detections, seg_logits, box_results_list)
         loss = batch_class_loss + batch_bbox_loss
         results_dict['torch_loss'] = loss
         results_dict['monitor_values'] = {'loss': loss.item(), 'class_loss': batch_class_loss.item()}
         results_dict['logger_string'] = "loss: {0:.2f}, class: {1:.2f}, bbox: {2:.2f}"\
             .format(loss.item(), batch_class_loss.item(), batch_bbox_loss.item())
 
         return results_dict
 
 
     def test_forward(self, batch, **kwargs):
         """
         test method. wrapper around forward pass of network without usage of any ground truth information.
         prepares input data for processing and stores outputs in a dictionary.
         :param batch: dictionary containing 'data'
         :return: results_dict: dictionary with keys:
                'boxes': list over batch elements. each batch element is a list of boxes. each box is a dictionary:
                        [[{box_0}, ... {box_n}], [{box_0}, ... {box_n}], ...]
                'seg_preds': pixel-wise class predictions (b, 1, y, x, (z)) with values [0, ..., n_classes] for
                             retina_unet and dummy array for retina_net.
         """
         img = batch['data']
         img = torch.from_numpy(img).float().cuda()
         detections, _, _, seg_logits = self.forward(img)
         results_dict = get_results(self.cf, img.shape, detections, seg_logits)
         return results_dict
 
 
     def forward(self, img):
         """
         forward pass of the model.
         :param img: input img (b, c, y, x, (z)).
         :return: rpn_pred_logits: (b, n_anchors, 2)
         :return: rpn_pred_deltas: (b, n_anchors, (y, x, (z), log(h), log(w), (log(d))))
         :return: batch_proposal_boxes: (b, n_proposals, (y1, x1, y2, x2, (z1), (z2), batch_ix)) only for monitoring/plotting.
         :return: detections: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score)
         :return: detection_masks: (n_final_detections, n_classes, y, x, (z)) raw molded masks as returned by mask-head.
         """
         # Feature extraction
         fpn_outs = self.Fpn(img)
         seg_logits = None
         selected_fmaps = [fpn_outs[i] for i in self.cf.pyramid_levels]
 
         # Loop through pyramid layers
         class_layer_outputs, bb_reg_layer_outputs = [], []  # list of lists
         for p in selected_fmaps:
             class_layer_outputs.append(self.Classifier(p))
             bb_reg_layer_outputs.append(self.BBRegressor(p))
 
         # Concatenate layer outputs
         # Convert from list of lists of level outputs to list of lists
         # of outputs across levels.
         # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
         class_logits = list(zip(*class_layer_outputs))
         class_logits = [torch.cat(list(o), dim=1) for o in class_logits][0]
         bb_outputs = list(zip(*bb_reg_layer_outputs))
         bb_outputs = [torch.cat(list(o), dim=1) for o in bb_outputs][0]
 
         # merge batch_dimension and store info in batch_ixs for re-allocation.
         batch_ixs = torch.arange(class_logits.shape[0]).unsqueeze(1).repeat(1, class_logits.shape[1]).view(-1).cuda()
         flat_class_softmax = F.softmax(class_logits.view(-1, class_logits.shape[-1]), 1)
         flat_bb_outputs = bb_outputs.view(-1, bb_outputs.shape[-1])
         detections = refine_detections(self.anchors, flat_class_softmax, flat_bb_outputs, batch_ixs, self.cf)
 
         return detections, class_logits, bb_outputs, seg_logits
diff --git a/models/retina_unet.py b/models/retina_unet.py
index 1eec628..a0d6c17 100644
--- a/models/retina_unet.py
+++ b/models/retina_unet.py
@@ -1,513 +1,510 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 """
 Retina Net. According to https://arxiv.org/abs/1708.02002
 Retina U-Net. According to https://arxiv.org/abs/1811.08661
 """
 
 import utils.model_utils as mutils
 import utils.exp_utils as utils
 import sys
-sys.path.append('../')
-from cuda_functions.nms_2D.pth_nms import nms_gpu as nms_2D
-from cuda_functions.nms_3D.pth_nms import nms_gpu as nms_3D
 
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.utils
 
+sys.path.append('../')
+from custom_extensions.nms import nms
 
 ############################################################
 #  Network Heads
 ############################################################
 
 class Classifier(nn.Module):
 
 
     def __init__(self, cf, conv):
         """
         Builds the classifier sub-network.
         """
         super(Classifier, self).__init__()
         self.dim = conv.dim
         self.n_classes = cf.head_classes
         n_input_channels = cf.end_filts
         n_features = cf.n_rpn_features
         n_output_channels = cf.n_anchors_per_pos * cf.head_classes
         anchor_stride = cf.rpn_anchor_stride
 
         self.conv_1 = conv(n_input_channels, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_2 = conv(n_features, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_3 = conv(n_features, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_4 = conv(n_features, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_final = conv(n_features, n_output_channels, ks=3, stride=anchor_stride, pad=1, relu=None)
 
 
     def forward(self, x):
         """
         :param x: input feature map (b, in_c, y, x, (z))
         :return: class_logits (b, n_anchors, n_classes)
         """
         x = self.conv_1(x)
         x = self.conv_2(x)
         x = self.conv_3(x)
         x = self.conv_4(x)
         class_logits = self.conv_final(x)
 
         axes = (0, 2, 3, 1) if self.dim == 2 else (0, 2, 3, 4, 1)
         class_logits = class_logits.permute(*axes)
         class_logits = class_logits.contiguous()
         class_logits = class_logits.view(x.size()[0], -1, self.n_classes)
 
         return [class_logits]
 
 
 
 class BBRegressor(nn.Module):
 
 
     def __init__(self, cf, conv):
         """
         Builds the bb-regression sub-network.
         """
         super(BBRegressor, self).__init__()
         self.dim = conv.dim
         n_input_channels = cf.end_filts
         n_features = cf.n_rpn_features
         n_output_channels = cf.n_anchors_per_pos * self.dim * 2
         anchor_stride = cf.rpn_anchor_stride
 
         self.conv_1 = conv(n_input_channels, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_2 = conv(n_features, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_3 = conv(n_features, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_4 = conv(n_features, n_features, ks=3, stride=anchor_stride, pad=1, relu=cf.relu)
         self.conv_final = conv(n_features, n_output_channels, ks=3, stride=anchor_stride,
                                pad=1, relu=None)
 
     def forward(self, x):
         """
         :param x: input feature map (b, in_c, y, x, (z))
         :return: bb_logits (b, n_anchors, dim * 2)
         """
         x = self.conv_1(x)
         x = self.conv_2(x)
         x = self.conv_3(x)
         x = self.conv_4(x)
         bb_logits = self.conv_final(x)
 
         axes = (0, 2, 3, 1) if self.dim == 2 else (0, 2, 3, 4, 1)
         bb_logits = bb_logits.permute(*axes)
         bb_logits = bb_logits.contiguous()
         bb_logits = bb_logits.view(x.size()[0], -1, self.dim * 2)
 
         return [bb_logits]
 
 
 ############################################################
 #  Loss Functions
 ############################################################
 
 def compute_class_loss(anchor_matches, class_pred_logits, shem_poolsize=20):
     """
     :param anchor_matches: (n_anchors). [-1, 0, class_id] for negative, neutral, and positive matched anchors.
     :param class_pred_logits: (n_anchors, n_classes). logits from classifier sub-network.
     :param shem_poolsize: int. factor of top-k candidates to draw from per negative sample (online-hard-example-mining).
     :return: loss: torch tensor.
     :return: np_neg_ix: 1D array containing indices of the neg_roi_logits, which have been sampled for training.
     """
     # Positive and Negative anchors contribute to the loss,
     # but neutral anchors (match value = 0) don't.
     pos_indices = torch.nonzero(anchor_matches > 0)
     neg_indices = torch.nonzero(anchor_matches == -1)
 
     # get positive samples and calucalte loss.
     if 0 not in pos_indices.size():
         pos_indices = pos_indices.squeeze(1)
         roi_logits_pos = class_pred_logits[pos_indices]
         targets_pos = anchor_matches[pos_indices]
         pos_loss = F.cross_entropy(roi_logits_pos, targets_pos.long())
     else:
         pos_loss = torch.FloatTensor([0]).cuda()
 
     # get negative samples, such that the amount matches the number of positive samples, but at least 1.
     # get high scoring negatives by applying online-hard-example-mining.
     if 0 not in neg_indices.size():
         neg_indices = neg_indices.squeeze(1)
         roi_logits_neg = class_pred_logits[neg_indices]
         negative_count = np.max((1, pos_indices.size()[0]))
         roi_probs_neg = F.softmax(roi_logits_neg, dim=1)
         neg_ix = mutils.shem(roi_probs_neg, negative_count, shem_poolsize)
         neg_loss = F.cross_entropy(roi_logits_neg[neg_ix], torch.LongTensor([0] * neg_ix.shape[0]).cuda())
         # return the indices of negative samples, which contributed to the loss (for monitoring plots).
         np_neg_ix = neg_ix.cpu().data.numpy()
     else:
         neg_loss = torch.FloatTensor([0]).cuda()
         np_neg_ix = np.array([]).astype('int32')
 
     loss = (pos_loss + neg_loss) / 2
     return loss, np_neg_ix
 
 
 def compute_bbox_loss(target_deltas, pred_deltas, anchor_matches):
     """
     :param target_deltas:   (b, n_positive_anchors, (dy, dx, (dz), log(dh), log(dw), (log(dd)))).
     Uses 0 padding to fill in unsed bbox deltas.
     :param pred_deltas: predicted deltas from bbox regression head. (b, n_anchors, (dy, dx, (dz), log(dh), log(dw), (log(dd))))
     :param anchor_matches: (n_anchors). [-1, 0, class_id] for negative, neutral, and positive matched anchors.
     :return: loss: torch 1D tensor.
     """
     if 0 not in torch.nonzero(anchor_matches > 0).size():
 
         indices = torch.nonzero(anchor_matches > 0).squeeze(1)
         # Pick bbox deltas that contribute to the loss
         pred_deltas = pred_deltas[indices]
         # Trim target bounding box deltas to the same length as pred_deltas.
         target_deltas = target_deltas[:pred_deltas.size()[0], :]
         # Smooth L1 loss
         loss = F.smooth_l1_loss(pred_deltas, target_deltas)
     else:
         loss = torch.FloatTensor([0]).cuda()
 
     return loss
 
 
 ############################################################
 #  Output Handler
 ############################################################
 
 def refine_detections(anchors, probs, deltas, batch_ixs, cf):
     """
     Refine classified proposals, filter overlaps and return final
     detections. n_proposals here is typically a very large number: batch_size * n_anchors.
     This function is hence optimized on trimming down n_proposals.
     :param anchors: (n_anchors, 2 * dim)
     :param probs: (n_proposals, n_classes) softmax probabilities for all rois as predicted by classifier head.
     :param deltas: (n_proposals, n_classes, 2 * dim) box refinement deltas as predicted by bbox regressor head.
     :param batch_ixs: (n_proposals) batch element assignemnt info for re-allocation.
     :return: result: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score))
     """
-    anchors = anchors.repeat(len(np.unique(batch_ixs)), 1)
+    anchors = anchors.repeat(batch_ixs.unique().shape[0], 1)
 
     # flatten foreground probabilities, sort and trim down to highest confidences by pre_nms limit.
     fg_probs = probs[:, 1:].contiguous()
     flat_probs, flat_probs_order = fg_probs.view(-1).sort(descending=True)
     keep_ix = flat_probs_order[:cf.pre_nms_limit]
     # reshape indices to 2D index array with shape like fg_probs.
     keep_arr = torch.cat(((keep_ix / fg_probs.shape[1]).unsqueeze(1), (keep_ix % fg_probs.shape[1]).unsqueeze(1)), 1)
 
     pre_nms_scores = flat_probs[:cf.pre_nms_limit]
     pre_nms_class_ids = keep_arr[:, 1] + 1  # add background again.
     pre_nms_batch_ixs = batch_ixs[keep_arr[:, 0]]
     pre_nms_anchors = anchors[keep_arr[:, 0]]
     pre_nms_deltas = deltas[keep_arr[:, 0]]
     keep = torch.arange(pre_nms_scores.size()[0]).long().cuda()
 
     # apply bounding box deltas. re-scale to image coordinates.
     std_dev = torch.from_numpy(np.reshape(cf.rpn_bbox_std_dev, [1, cf.dim * 2])).float().cuda()
     scale = torch.from_numpy(cf.scale).float().cuda()
     refined_rois = mutils.apply_box_deltas_2D(pre_nms_anchors / scale, pre_nms_deltas * std_dev) * scale \
         if cf.dim == 2 else mutils.apply_box_deltas_3D(pre_nms_anchors / scale, pre_nms_deltas * std_dev) * scale
 
     # round and cast to int since we're deadling with pixels now
     refined_rois = mutils.clip_to_window(cf.window, refined_rois)
     pre_nms_rois = torch.round(refined_rois)
     for j, b in enumerate(mutils.unique1d(pre_nms_batch_ixs)):
 
         bixs = torch.nonzero(pre_nms_batch_ixs == b)[:, 0]
         bix_class_ids = pre_nms_class_ids[bixs]
         bix_rois = pre_nms_rois[bixs]
         bix_scores = pre_nms_scores[bixs]
 
         for i, class_id in enumerate(mutils.unique1d(bix_class_ids)):
 
             ixs = torch.nonzero(bix_class_ids == class_id)[:, 0]
             # nms expects boxes sorted by score.
             ix_rois = bix_rois[ixs]
             ix_scores = bix_scores[ixs]
             ix_scores, order = ix_scores.sort(descending=True)
             ix_rois = ix_rois[order, :]
             ix_scores = ix_scores
 
-            if cf.dim == 2:
-                class_keep = nms_2D(torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1), cf.detection_nms_threshold)
-            else:
-                class_keep = nms_3D(torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1), cf.detection_nms_threshold)
+            class_keep = nms.nms(ix_rois, ix_scores, cf.detection_nms_threshold)
 
             # map indices back.
             class_keep = keep[bixs[ixs[order[class_keep]]]]
             # merge indices over classes for current batch element
             b_keep = class_keep if i == 0 else mutils.unique1d(torch.cat((b_keep, class_keep)))
 
         # only keep top-k boxes of current batch-element.
         top_ids = pre_nms_scores[b_keep].sort(descending=True)[1][:cf.model_max_instances_per_batch_element]
         b_keep = b_keep[top_ids]
         # merge indices over batch elements.
         batch_keep = b_keep if j == 0 else mutils.unique1d(torch.cat((batch_keep, b_keep)))
 
     keep = batch_keep
 
     # arrange output.
     result = torch.cat((pre_nms_rois[keep],
                         pre_nms_batch_ixs[keep].unsqueeze(1).float(),
                         pre_nms_class_ids[keep].unsqueeze(1).float(),
                         pre_nms_scores[keep].unsqueeze(1)), dim=1)
 
     return result
 
 
 
 def get_results(cf, img_shape, detections, seg_logits, box_results_list=None):
     """
     Restores batch dimension of merged detections, unmolds detections, creates and fills results dict.
     :param img_shape:
     :param detections: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score)
     :param box_results_list: None or list of output boxes for monitoring/plotting.
     each element is a list of boxes per batch element.
     :return: results_dict: dictionary with keys:
              'boxes': list over batch elements. each batch element is a list of boxes. each box is a dictionary:
                       [[{box_0}, ... {box_n}], [{box_0}, ... {box_n}], ...]
              'seg_preds': pixel-wise class predictions (b, 1, y, x, (z)) with values [0, ..., n_classes] for
                           retina_unet and dummy array for retina_net.
     """
     detections = detections.cpu().data.numpy()
     batch_ixs = detections[:, cf.dim*2]
     detections = [detections[batch_ixs == ix] for ix in range(img_shape[0])]
 
     # for test_forward, where no previous list exists.
     if box_results_list is None:
         box_results_list = [[] for _ in range(img_shape[0])]
 
     for ix in range(img_shape[0]):
 
         if 0 not in detections[ix].shape:
 
             boxes = detections[ix][:, :2 * cf.dim].astype(np.int32)
             class_ids = detections[ix][:, 2 * cf.dim + 1].astype(np.int32)
             scores = detections[ix][:, 2 * cf.dim + 2]
 
             # Filter out detections with zero area. Often only happens in early
             # stages of training when the network weights are still a bit random.
             if cf.dim == 2:
                 exclude_ix = np.where((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0]
             else:
                 exclude_ix = np.where(
                     (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 5] - boxes[:, 4]) <= 0)[0]
 
             if exclude_ix.shape[0] > 0:
                 boxes = np.delete(boxes, exclude_ix, axis=0)
                 class_ids = np.delete(class_ids, exclude_ix, axis=0)
                 scores = np.delete(scores, exclude_ix, axis=0)
 
             if 0 not in boxes.shape:
                 for ix2, score in enumerate(scores):
                     if score >= cf.model_min_confidence:
                         box_results_list[ix].append({'box_coords': boxes[ix2],
                                                      'box_score': score,
                                                      'box_type': 'det',
                                                      'box_pred_class_id': class_ids[ix2]})
 
     results_dict = {'boxes': box_results_list}
     if seg_logits is None:
         # output dummy segmentation for retina_net.
         results_dict['seg_preds'] = np.zeros(img_shape)[:, 0][:, np.newaxis]
     else:
         # output label maps for retina_unet.
         results_dict['seg_preds'] = F.softmax(seg_logits, 1).argmax(1).cpu().data.numpy()[:, np.newaxis].astype('uint8')
 
     return results_dict
 
 
 ############################################################
 #  Retina (U-)Net Class
 ############################################################
 
 
 class net(nn.Module):
 
 
     def __init__(self, cf, logger):
 
         super(net, self).__init__()
         self.cf = cf
         self.logger = logger
         self.build()
         if self.cf.weight_init is not None:
             logger.info("using pytorch weight init of type {}".format(self.cf.weight_init))
             mutils.initialize_weights(self)
         else:
             logger.info("using default pytorch weight init")
 
     def build(self):
         """
         Build Retina Net architecture.
         """
 
         # Image size must be dividable by 2 multiple times.
         h, w = self.cf.patch_size[:2]
         if h / 2 ** 5 != int(h / 2 ** 5) or w / 2 ** 5 != int(w / 2 ** 5):
             raise Exception("Image size must be dividable by 2 at least 5 times "
                             "to avoid fractions when downscaling and upscaling."
                             "For example, use 256, 320, 384, 448, 512, ... etc. ")
 
         # instanciate abstract multi dimensional conv class and backbone model.
         conv = mutils.NDConvGenerator(self.cf.dim)
         backbone = utils.import_module('bbone', self.cf.backbone_path)
 
         # build Anchors, FPN, Classifier / Bbox-Regressor -head
         self.np_anchors = mutils.generate_pyramid_anchors(self.logger, self.cf)
         self.anchors = torch.from_numpy(self.np_anchors).float().cuda()
         self.Fpn = backbone.FPN(self.cf, conv, operate_stride1=self.cf.operate_stride1)
         self.Classifier = Classifier(self.cf, conv)
         self.BBRegressor = BBRegressor(self.cf, conv)
         self.final_conv = conv(self.cf.end_filts, self.cf.num_seg_classes, ks=1, pad=0, norm=None, relu=None)
 
 
     def train_forward(self, batch, **kwargs):
         """
         train method (also used for validation monitoring). wrapper around forward pass of network. prepares input data
         for processing, computes losses, and stores outputs in a dictionary.
         :param batch: dictionary containing 'data', 'seg', etc.
         :return: results_dict: dictionary with keys:
                 'boxes': list over batch elements. each batch element is a list of boxes. each box is a dictionary:
                         [[{box_0}, ... {box_n}], [{box_0}, ... {box_n}], ...]
                 'seg_preds': pixelwise segmentation output (b, c, y, x, (z)) with values [0, .., n_classes].
                 'monitor_values': dict of values to be monitored.
         """
         img = batch['data']
         gt_class_ids = batch['roi_labels']
         gt_boxes = batch['bb_target']
         var_seg_ohe = torch.FloatTensor(mutils.get_one_hot_encoding(batch['seg'], self.cf.num_seg_classes)).cuda()
         var_seg = torch.LongTensor(batch['seg']).cuda()
 
         img = torch.from_numpy(img).float().cuda()
         batch_class_loss = torch.FloatTensor([0]).cuda()
         batch_bbox_loss = torch.FloatTensor([0]).cuda()
 
         # list of output boxes for monitoring/plotting. each element is a list of boxes per batch element.
         box_results_list = [[] for _ in range(img.shape[0])]
         detections, class_logits, pred_deltas, seg_logits = self.forward(img)
 
         # loop over batch
         for b in range(img.shape[0]):
 
             # add gt boxes to results dict for monitoring.
             if len(gt_boxes[b]) > 0:
                 for ix in range(len(gt_boxes[b])):
                     box_results_list[b].append({'box_coords': batch['bb_target'][b][ix],
                                                 'box_label': batch['roi_labels'][b][ix], 'box_type': 'gt'})
 
                 # match gt boxes with anchors to generate targets.
                 anchor_class_match, anchor_target_deltas = mutils.gt_anchor_matching(
                     self.cf, self.np_anchors, gt_boxes[b], gt_class_ids[b])
 
                 # add positive anchors used for loss to results_dict for monitoring.
                 pos_anchors = mutils.clip_boxes_numpy(
                     self.np_anchors[np.argwhere(anchor_class_match > 0)][:, 0], img.shape[2:])
                 for p in pos_anchors:
                     box_results_list[b].append({'box_coords': p, 'box_type': 'pos_anchor'})
 
             else:
                 anchor_class_match = np.array([-1]*self.np_anchors.shape[0])
                 anchor_target_deltas = np.array([0])
 
             anchor_class_match = torch.from_numpy(anchor_class_match).cuda()
             anchor_target_deltas = torch.from_numpy(anchor_target_deltas).float().cuda()
 
             # compute losses.
             class_loss, neg_anchor_ix = compute_class_loss(anchor_class_match, class_logits[b])
             bbox_loss = compute_bbox_loss(anchor_target_deltas, pred_deltas[b], anchor_class_match)
 
             # add negative anchors used for loss to results_dict for monitoring.
             neg_anchors = mutils.clip_boxes_numpy(
-                self.np_anchors[np.argwhere(anchor_class_match == -1)][0, neg_anchor_ix], img.shape[2:])
+                self.np_anchors[np.argwhere(anchor_class_match.cpu().numpy() == -1)][neg_anchor_ix, 0],
+                img.shape[2:])
             for n in neg_anchors:
                 box_results_list[b].append({'box_coords': n, 'box_type': 'neg_anchor'})
 
             batch_class_loss += class_loss / img.shape[0]
             batch_bbox_loss += bbox_loss / img.shape[0]
 
         results_dict = get_results(self.cf, img.shape, detections, seg_logits, box_results_list)
         seg_loss_dice = 1 - mutils.batch_dice(F.softmax(seg_logits, dim=1),var_seg_ohe)
         seg_loss_ce = F.cross_entropy(seg_logits, var_seg[:, 0])
         loss = batch_class_loss + batch_bbox_loss + (seg_loss_dice + seg_loss_ce) / 2
         results_dict['torch_loss'] = loss
         results_dict['monitor_values'] = {'loss': loss.item(), 'class_loss': batch_class_loss.item()}
         results_dict['logger_string'] = \
             "loss: {0:.2f}, class: {1:.2f}, bbox: {2:.2f}, seg dice: {3:.3f}, seg ce: {4:.3f}, mean pix. pr.: {5:.5f}"\
             .format(loss.item(), batch_class_loss.item(), batch_bbox_loss.item(), seg_loss_dice.item(),
                     seg_loss_ce.item(), np.mean(results_dict['seg_preds']))
 
         return results_dict
 
 
     def test_forward(self, batch, **kwargs):
         """
         test method. wrapper around forward pass of network without usage of any ground truth information.
         prepares input data for processing and stores outputs in a dictionary.
         :param batch: dictionary containing 'data'
         :return: results_dict: dictionary with keys:
                'boxes': list over batch elements. each batch element is a list of boxes. each box is a dictionary:
                        [[{box_0}, ... {box_n}], [{box_0}, ... {box_n}], ...]
                'seg_preds': pixel-wise class predictions (b, 1, y, x, (z)) with values [0, ..., n_classes] for
                             retina_unet and dummy array for retina_net.
         """
         img = batch['data']
         img = torch.from_numpy(img).float().cuda()
         detections, _, _, seg_logits = self.forward(img)
         results_dict = get_results(self.cf, img.shape, detections, seg_logits)
         return results_dict
 
 
     def forward(self, img):
         """
         forward pass of the model.
         :param img: input img (b, c, y, x, (z)).
         :return: rpn_pred_logits: (b, n_anchors, 2)
         :return: rpn_pred_deltas: (b, n_anchors, (y, x, (z), log(h), log(w), (log(d))))
         :return: batch_proposal_boxes: (b, n_proposals, (y1, x1, y2, x2, (z1), (z2), batch_ix)) only for monitoring/plotting.
         :return: detections: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score)
         :return: detection_masks: (n_final_detections, n_classes, y, x, (z)) raw molded masks as returned by mask-head.
         """
         # Feature extraction
         fpn_outs = self.Fpn(img)
         seg_logits = self.final_conv(fpn_outs[0])
         selected_fmaps = [fpn_outs[i + 1] for i in self.cf.pyramid_levels]
 
         # Loop through pyramid layers
         class_layer_outputs, bb_reg_layer_outputs = [], []  # list of lists
         for p in selected_fmaps:
             class_layer_outputs.append(self.Classifier(p))
             bb_reg_layer_outputs.append(self.BBRegressor(p))
 
         # Concatenate layer outputs
         # Convert from list of lists of level outputs to list of lists
         # of outputs across levels.
         # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
         class_logits = list(zip(*class_layer_outputs))
         class_logits = [torch.cat(list(o), dim=1) for o in class_logits][0]
         bb_outputs = list(zip(*bb_reg_layer_outputs))
         bb_outputs = [torch.cat(list(o), dim=1) for o in bb_outputs][0]
 
         # merge batch_dimension and store info in batch_ixs for re-allocation.
         batch_ixs = torch.arange(class_logits.shape[0]).unsqueeze(1).repeat(1, class_logits.shape[1]).view(-1).cuda()
         flat_class_softmax = F.softmax(class_logits.view(-1, class_logits.shape[-1]), 1)
         flat_bb_outputs = bb_outputs.view(-1, bb_outputs.shape[-1])
         detections = refine_detections(self.anchors, flat_class_softmax, flat_bb_outputs, batch_ixs, self.cf)
 
         return detections, class_logits, bb_outputs, seg_logits
diff --git a/models/ufrcnn.py b/models/ufrcnn.py
index a1dd68a..08c31bf 100644
--- a/models/ufrcnn.py
+++ b/models/ufrcnn.py
@@ -1,1019 +1,1273 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 """
 Parts are based on https://github.com/multimodallearning/pytorch-mask-rcnn
 published under MIT license.
 """
-
-import utils.model_utils as mutils
-import utils.exp_utils as utils
-from cuda_functions.nms_2D.pth_nms import nms_gpu as nms_2D
-from cuda_functions.nms_3D.pth_nms import nms_gpu as nms_3D
-from cuda_functions.roi_align_2D.roi_align.crop_and_resize import CropAndResizeFunction as ra2D
-from cuda_functions.roi_align_3D.roi_align.crop_and_resize import CropAndResizeFunction as ra3D
+import sys
 
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.utils
 
+sys.path.append("..")
+import utils.model_utils as mutils
+import utils.exp_utils as utils
+from custom_extensions.nms import nms
+from custom_extensions.roi_align import roi_align
 
 ############################################################
 # Networks on top of backbone
 ############################################################
 
 class RPN(nn.Module):
     """
     Region Proposal Network.
     """
 
     def __init__(self, cf, conv):
 
         super(RPN, self).__init__()
         self.dim = conv.dim
 
         self.conv_shared = conv(cf.end_filts, cf.n_rpn_features, ks=3, stride=cf.rpn_anchor_stride, pad=1, relu=cf.relu)
         self.conv_class = conv(cf.n_rpn_features, 2 * len(cf.rpn_anchor_ratios), ks=1, stride=1, relu=None)
         self.conv_bbox = conv(cf.n_rpn_features, 2 * self.dim * len(cf.rpn_anchor_ratios), ks=1, stride=1, relu=None)
 
 
     def forward(self, x):
         """
         :param x: input feature maps (b, in_channels, y, x, (z))
         :return: rpn_class_logits (b, 2, n_anchors)
         :return: rpn_probs_logits (b, 2, n_anchors)
         :return: rpn_bbox (b, 2 * dim, n_anchors)
         """
 
         # Shared convolutional base of the RPN.
         x = self.conv_shared(x)
 
         # Anchor Score. (batch, anchors per location * 2, y, x, (z)).
         rpn_class_logits = self.conv_class(x)
         # Reshape to (batch, 2, anchors)
         axes = (0, 2, 3, 1) if self.dim == 2 else (0, 2, 3, 4, 1)
         rpn_class_logits = rpn_class_logits.permute(*axes)
         rpn_class_logits = rpn_class_logits.contiguous()
         rpn_class_logits = rpn_class_logits.view(x.size()[0], -1, 2)
 
         # Softmax on last dimension (fg vs. bg).
         rpn_probs = F.softmax(rpn_class_logits, dim=2)
 
         # Bounding box refinement. (batch, anchors_per_location * (y, x, (z), log(h), log(w), (log(d)), y, x, (z))
         rpn_bbox = self.conv_bbox(x)
 
         # Reshape to (batch, 2*dim, anchors)
         rpn_bbox = rpn_bbox.permute(*axes)
         rpn_bbox = rpn_bbox.contiguous()
         rpn_bbox = rpn_bbox.view(x.size()[0], -1, self.dim * 2)
 
         return [rpn_class_logits, rpn_probs, rpn_bbox]
 
 
 
 class Classifier(nn.Module):
     """
     Head network for classification and bounding box refinement. Performs RoiAlign, processes resulting features through a
     shared convolutional base and finally branches off the classifier- and regression head.
     """
     def __init__(self, cf, conv):
         super(Classifier, self).__init__()
 
         self.dim = conv.dim
         self.in_channels = cf.end_filts
         self.pool_size = cf.pool_size
         self.pyramid_levels = cf.pyramid_levels
         # instance_norm does not work with spatial dims (1, 1, (1))
         norm = cf.norm if cf.norm != 'instance_norm' else None
 
         self.conv1 = conv(cf.end_filts, cf.end_filts * 4, ks=self.pool_size, stride=1, norm=norm, relu=cf.relu)
         self.conv2 = conv(cf.end_filts * 4, cf.end_filts * 4, ks=1, stride=1, norm=norm, relu=cf.relu)
         self.linear_class = nn.Linear(cf.end_filts * 4, cf.head_classes)
         self.linear_bbox = nn.Linear(cf.end_filts * 4, cf.head_classes * 2 * self.dim)
 
     def forward(self, x, rois):
         """
         :param x: input feature maps (b, in_channels, y, x, (z))
         :param rois: normalized box coordinates as proposed by the RPN to be forwarded through
         the second stage (n_proposals, (y1, x1, y2, x2, (z1), (z2), batch_ix). Proposals of all batch elements
         have been merged to one vector, while the origin info has been stored for re-allocation.
         :return: mrcnn_class_logits (n_proposals, n_head_classes)
         :return: mrcnn_bbox (n_proposals, n_head_classes, 2 * dim) predicted corrections to be applied to proposals for refinement.
         """
         x = pyramid_roi_align(x, rois, self.pool_size, self.pyramid_levels, self.dim)
         x = self.conv1(x)
         x = self.conv2(x)
         x = x.view(-1, self.in_channels * 4)
         mrcnn_class_logits = self.linear_class(x)
         mrcnn_bbox = self.linear_bbox(x)
         mrcnn_bbox = mrcnn_bbox.view(mrcnn_bbox.size()[0], -1, self.dim * 2)
 
         return [mrcnn_class_logits, mrcnn_bbox]
 
 
 
 class Mask(nn.Module):
     """
     Head network for proposal-based mask segmentation. Performs RoiAlign, some convolutions and applies sigmoid on the
     output logits to allow for overlapping classes.
     """
     def __init__(self, cf, conv):
         super(Mask, self).__init__()
         self.pool_size = cf.mask_pool_size
         self.pyramid_levels = cf.pyramid_levels
         self.dim = conv.dim
         self.conv1 = conv(cf.end_filts, cf.end_filts, ks=3, stride=1, pad=1, norm=cf.norm, relu=cf.relu)
         self.conv2 = conv(cf.end_filts, cf.end_filts, ks=3, stride=1, pad=1, norm=cf.norm, relu=cf.relu)
         self.conv3 = conv(cf.end_filts, cf.end_filts, ks=3, stride=1, pad=1, norm=cf.norm, relu=cf.relu)
         self.conv4 = conv(cf.end_filts, cf.end_filts, ks=3, stride=1, pad=1, norm=cf.norm, relu=cf.relu)
         if conv.dim == 2:
             self.deconv = nn.ConvTranspose2d(cf.end_filts, cf.end_filts, kernel_size=2, stride=2)
         else:
             self.deconv = nn.ConvTranspose3d(cf.end_filts, cf.end_filts, kernel_size=2, stride=2)
 
         self.relu = nn.ReLU(inplace=True) if cf.relu == 'relu' else nn.LeakyReLU(inplace=True)
         self.conv5 = conv(cf.end_filts, cf.head_classes, ks=1, stride=1, relu=None)
         self.sigmoid = nn.Sigmoid()
 
     def forward(self, x, rois):
         """
         :param x: input feature maps (b, in_channels, y, x, (z))
         :param rois: normalized box coordinates as proposed by the RPN to be forwarded through
         the second stage (n_proposals, (y1, x1, y2, x2, (z1), (z2), batch_ix). Proposals of all batch elements
         have been merged to one vector, while the origin info has been stored for re-allocation.
         :return: x: masks (n_sampled_proposals (n_detections in inference), n_classes, y, x, (z))
         """
         x = pyramid_roi_align(x, rois, self.pool_size, self.pyramid_levels, self.dim)
         x = self.conv1(x)
         x = self.conv2(x)
         x = self.conv3(x)
         x = self.conv4(x)
         x = self.relu(self.deconv(x))
         x = self.conv5(x)
         x = self.sigmoid(x)
         return x
 
 
 ############################################################
 #  Loss Functions
 ############################################################
 
 def compute_rpn_class_loss(rpn_match, rpn_class_logits, shem_poolsize):
     """
     :param rpn_match: (n_anchors). [-1, 0, 1] for negative, neutral, and positive matched anchors.
     :param rpn_class_logits: (n_anchors, 2). logits from RPN classifier.
     :param shem_poolsize: int. factor of top-k candidates to draw from per negative sample
     (stochastic-hard-example-mining).
     :return: loss: torch tensor
     :return: np_neg_ix: 1D array containing indices of the neg_roi_logits, which have been sampled for training.
     """
 
     # filter out neutral anchors.
     pos_indices = torch.nonzero(rpn_match == 1)
     neg_indices = torch.nonzero(rpn_match == -1)
 
     # loss for positive samples
     if 0 not in pos_indices.size():
         pos_indices = pos_indices.squeeze(1)
         roi_logits_pos = rpn_class_logits[pos_indices]
         pos_loss = F.cross_entropy(roi_logits_pos, torch.LongTensor([1] * pos_indices.shape[0]).cuda())
     else:
         pos_loss = torch.FloatTensor([0]).cuda()
 
     # loss for negative samples: draw hard negative examples (SHEM)
     # that match the number of positive samples, but at least 1.
     if 0 not in neg_indices.size():
         neg_indices = neg_indices.squeeze(1)
         roi_logits_neg = rpn_class_logits[neg_indices]
         negative_count = np.max((1, pos_indices.cpu().data.numpy().size))
         roi_probs_neg = F.softmax(roi_logits_neg, dim=1)
         neg_ix = mutils.shem(roi_probs_neg, negative_count, shem_poolsize)
         neg_loss = F.cross_entropy(roi_logits_neg[neg_ix], torch.LongTensor([0] * neg_ix.shape[0]).cuda())
         np_neg_ix = neg_ix.cpu().data.numpy()
     else:
         neg_loss = torch.FloatTensor([0]).cuda()
         np_neg_ix = np.array([]).astype('int32')
 
     loss = (pos_loss + neg_loss) / 2
     return loss, np_neg_ix
 
 
 def compute_rpn_bbox_loss(rpn_target_deltas, rpn_pred_deltas, rpn_match):
     """
     :param rpn_target_deltas:   (b, n_positive_anchors, (dy, dx, (dz), log(dh), log(dw), (log(dd)))).
     Uses 0 padding to fill in unsed bbox deltas.
     :param rpn_pred_deltas: predicted deltas from RPN. (b, n_anchors, (dy, dx, (dz), log(dh), log(dw), (log(dd))))
     :param rpn_match: (n_anchors). [-1, 0, 1] for negative, neutral, and positive matched anchors.
     :return: loss: torch 1D tensor.
     """
     if 0 not in torch.nonzero(rpn_match == 1).size():
 
         indices = torch.nonzero(rpn_match == 1).squeeze(1)
         # Pick bbox deltas that contribute to the loss
         rpn_pred_deltas = rpn_pred_deltas[indices]
         # Trim target bounding box deltas to the same length as rpn_bbox.
         target_deltas = rpn_target_deltas[:rpn_pred_deltas.size()[0], :]
         # Smooth L1 loss
         loss = F.smooth_l1_loss(rpn_pred_deltas, target_deltas)
     else:
         loss = torch.FloatTensor([0]).cuda()
 
     return loss
 
 
 def compute_mrcnn_class_loss(target_class_ids, pred_class_logits):
     """
     :param target_class_ids: (n_sampled_rois) batch dimension was merged into roi dimension.
     :param pred_class_logits: (n_sampled_rois, n_classes)
     :return: loss: torch 1D tensor.
     """
     if 0 not in target_class_ids.size():
         loss = F.cross_entropy(pred_class_logits, target_class_ids.long())
     else:
         loss = torch.FloatTensor([0.]).cuda()
 
     return loss
 
 
 def compute_mrcnn_bbox_loss(mrcnn_target_deltas, mrcnn_pred_deltas, target_class_ids):
     """
     :param mrcnn_target_deltas: (n_sampled_rois, (dy, dx, (dz), log(dh), log(dw), (log(dh)))
     :param mrcnn_pred_deltas: (n_sampled_rois, n_classes, (dy, dx, (dz), log(dh), log(dw), (log(dh)))
     :param target_class_ids: (n_sampled_rois)
     :return: loss: torch 1D tensor.
     """
     if 0 not in torch.nonzero(target_class_ids > 0).size():
         positive_roi_ix = torch.nonzero(target_class_ids > 0)[:, 0]
         positive_roi_class_ids = target_class_ids[positive_roi_ix].long()
         target_bbox = mrcnn_target_deltas[positive_roi_ix, :].detach()
         pred_bbox = mrcnn_pred_deltas[positive_roi_ix, positive_roi_class_ids, :]
         loss = F.smooth_l1_loss(pred_bbox, target_bbox)
     else:
         loss = torch.FloatTensor([0]).cuda()
 
     return loss
 
 
 def compute_mrcnn_mask_loss(target_masks, pred_masks, target_class_ids):
     """
     :param target_masks: (n_sampled_rois, y, x, (z)) A float32 tensor of values 0 or 1. Uses zero padding to fill array.
     :param pred_masks: (n_sampled_rois, n_classes, y, x, (z)) float32 tensor with values between [0, 1].
     :param target_class_ids: (n_sampled_rois)
     :return: loss: torch 1D tensor.
     """
     if 0 not in torch.nonzero(target_class_ids > 0).size():
         # Only positive ROIs contribute to the loss. And only
         # the class specific mask of each ROI.
         positive_ix = torch.nonzero(target_class_ids > 0)[:, 0]
         positive_class_ids = target_class_ids[positive_ix].long()
         y_true = target_masks[positive_ix, :, :].detach()
         y_pred = pred_masks[positive_ix, positive_class_ids, :, :]
         loss = F.binary_cross_entropy(y_pred, y_true)
     else:
         loss = torch.FloatTensor([0]).cuda()
 
     return loss
 
 
 ############################################################
 #  Helper Layers
 ############################################################
 
-def proposal_layer(rpn_pred_probs, rpn_pred_deltas, proposal_count, anchors, cf):
+# def proposal_layer(rpn_pred_probs, rpn_pred_deltas, proposal_count, anchors, cf):
+#     """
+#     Receives anchor scores and selects a subset to pass as proposals
+#     to the second stage. Filtering is done based on anchor scores and
+#     non-max suppression to remove overlaps. It also applies bounding
+#     box refinment detals to anchors.
+#     :param rpn_pred_probs: (b, n_anchors, 2)
+#     :param rpn_pred_deltas: (b, n_anchors, (y, x, (z), log(h), log(w), (log(d))))
+#     :return: batch_normalized_boxes: Proposals in normalized coordinates
+#     (b, proposal_count, (y1, x1, y2, x2, (z1), (z2)))
+#     :return: batch_out_proposals: Box coords + RPN foreground scores
+#     for monitoring/plotting (b, proposal_count, (y1, x1, y2, x2, (z1), (z2), score))
+#     """
+#     batch_scores = rpn_pred_probs[:, :, 1]
+#     batch_deltas = rpn_pred_deltas
+#     batch_anchors = anchors
+#     batch_normalized_boxes = []
+#     batch_out_proposals = []
+#
+#     # loop over batch dimension.
+#     for ix in range(batch_scores.shape[0]):
+#
+#         scores = batch_scores[ix]
+#         deltas = batch_deltas[ix]
+#         anchors = batch_anchors.clone()
+#         # norm deltas
+#         std_dev = torch.from_numpy(cf.rpn_bbox_std_dev[None]).float().cuda()
+#         deltas = deltas * std_dev
+#
+#         # improve performance by trimming to top anchors by score
+#         # and doing the rest on the smaller subset.
+#         pre_nms_limit = min(cf.pre_nms_limit, anchors.size()[0])
+#         scores, order = scores.sort(descending=True)
+#         order = order[:pre_nms_limit]
+#         scores = scores[:pre_nms_limit]
+#         deltas = deltas[order, :]
+#         anchors = anchors[order, :]
+#
+#         # apply deltas to anchors to get refined anchors and filter with non-maximum surpression.
+#         if batch_deltas.shape[-1] == 4:
+#             boxes = mutils.apply_box_deltas_2D(anchors, deltas)
+#             boxes = mutils.clip_boxes_2D(boxes, cf.window)
+#             keep = nms_2D(torch.cat((boxes, scores.unsqueeze(1)), 1), cf.rpn_nms_threshold)
+#             norm = torch.from_numpy(cf.scale).float().cuda()
+#
+#         else:
+#             boxes = mutils.apply_box_deltas_3D(anchors, deltas)
+#             boxes = mutils.clip_boxes_3D(boxes, cf.window)
+#             keep = nms_3D(torch.cat((boxes, scores.unsqueeze(1)), 1), cf.rpn_nms_threshold)
+#             norm = torch.from_numpy(cf.scale).float().cuda()
+#
+#         keep = keep[:proposal_count]
+#         boxes = boxes[keep, :]
+#         rpn_scores = scores[keep][:, None]
+#
+#         # pad missing boxes with 0.
+#         if boxes.shape[0] < proposal_count:
+#             n_pad_boxes = proposal_count - boxes.shape[0]
+#             zeros = torch.zeros([n_pad_boxes, boxes.shape[1]]).cuda()
+#             boxes = torch.cat([boxes, zeros], dim=0)
+#             zeros = torch.zeros([n_pad_boxes, rpn_scores.shape[1]]).cuda()
+#             rpn_scores = torch.cat([rpn_scores, zeros], dim=0)
+#
+#         # concat box and score info for monitoring/plotting.
+#         batch_out_proposals.append(torch.cat((boxes, rpn_scores), 1).cpu().data.numpy())
+#         # normalize dimensions to range of 0 to 1.
+#         normalized_boxes = boxes / norm
+#         # add back batch dimension
+#         batch_normalized_boxes.append(normalized_boxes.unsqueeze(0))
+#
+#     batch_normalized_boxes = torch.cat(batch_normalized_boxes)
+#     batch_out_proposals = np.array(batch_out_proposals)
+#     return batch_normalized_boxes, batch_out_proposals
+
+def refine_proposals(rpn_pred_probs, rpn_pred_deltas, proposal_count, batch_anchors, cf):
     """
     Receives anchor scores and selects a subset to pass as proposals
     to the second stage. Filtering is done based on anchor scores and
     non-max suppression to remove overlaps. It also applies bounding
-    box refinment detals to anchors.
+    box refinment details to anchors.
     :param rpn_pred_probs: (b, n_anchors, 2)
     :param rpn_pred_deltas: (b, n_anchors, (y, x, (z), log(h), log(w), (log(d))))
-    :return: batch_normalized_boxes: Proposals in normalized coordinates
-    (b, proposal_count, (y1, x1, y2, x2, (z1), (z2)))
+    :return: batch_normalized_props: Proposals in normalized coordinates (b, proposal_count, (y1, x1, y2, x2, (z1), (z2), score))
     :return: batch_out_proposals: Box coords + RPN foreground scores
     for monitoring/plotting (b, proposal_count, (y1, x1, y2, x2, (z1), (z2), score))
     """
+    std_dev = torch.from_numpy(cf.rpn_bbox_std_dev[None]).float().cuda()
+    norm = torch.from_numpy(cf.scale).float().cuda()
+    anchors = batch_anchors.clone()
+
+
+
     batch_scores = rpn_pred_probs[:, :, 1]
-    batch_deltas = rpn_pred_deltas
-    batch_anchors = anchors
-    batch_normalized_boxes = []
+    # norm deltas
+    batch_deltas = rpn_pred_deltas * std_dev
+    batch_normalized_props = []
     batch_out_proposals = []
 
     # loop over batch dimension.
     for ix in range(batch_scores.shape[0]):
 
         scores = batch_scores[ix]
         deltas = batch_deltas[ix]
-        anchors = batch_anchors.clone()
-        # norm deltas
-        std_dev = torch.from_numpy(cf.rpn_bbox_std_dev[None]).float().cuda()
-        deltas = deltas * std_dev
 
         # improve performance by trimming to top anchors by score
         # and doing the rest on the smaller subset.
         pre_nms_limit = min(cf.pre_nms_limit, anchors.size()[0])
         scores, order = scores.sort(descending=True)
         order = order[:pre_nms_limit]
         scores = scores[:pre_nms_limit]
         deltas = deltas[order, :]
-        anchors = anchors[order, :]
 
-        # apply deltas to anchors to get refined anchors and filter with non-maximum surpression.
+        # apply deltas to anchors to get refined anchors and filter with non-maximum suppression.
         if batch_deltas.shape[-1] == 4:
-            boxes = mutils.apply_box_deltas_2D(anchors, deltas)
+            boxes = mutils.apply_box_deltas_2D(anchors[order, :], deltas)
             boxes = mutils.clip_boxes_2D(boxes, cf.window)
-            keep = nms_2D(torch.cat((boxes, scores.unsqueeze(1)), 1), cf.rpn_nms_threshold)
-            norm = torch.from_numpy(cf.scale).float().cuda()
-
         else:
-            boxes = mutils.apply_box_deltas_3D(anchors, deltas)
+            boxes = mutils.apply_box_deltas_3D(anchors[order, :], deltas)
             boxes = mutils.clip_boxes_3D(boxes, cf.window)
-            keep = nms_3D(torch.cat((boxes, scores.unsqueeze(1)), 1), cf.rpn_nms_threshold)
-            norm = torch.from_numpy(cf.scale).float().cuda()
+        # boxes are y1,x1,y2,x2, torchvision-nms requires x1,y1,x2,y2, but consistent swap x<->y is irrelevant.
+        keep = nms.nms(boxes, scores, cf.rpn_nms_threshold)
+
 
         keep = keep[:proposal_count]
         boxes = boxes[keep, :]
         rpn_scores = scores[keep][:, None]
 
         # pad missing boxes with 0.
         if boxes.shape[0] < proposal_count:
             n_pad_boxes = proposal_count - boxes.shape[0]
             zeros = torch.zeros([n_pad_boxes, boxes.shape[1]]).cuda()
             boxes = torch.cat([boxes, zeros], dim=0)
             zeros = torch.zeros([n_pad_boxes, rpn_scores.shape[1]]).cuda()
             rpn_scores = torch.cat([rpn_scores, zeros], dim=0)
 
         # concat box and score info for monitoring/plotting.
         batch_out_proposals.append(torch.cat((boxes, rpn_scores), 1).cpu().data.numpy())
         # normalize dimensions to range of 0 to 1.
         normalized_boxes = boxes / norm
-        # add back batch dimension
-        batch_normalized_boxes.append(normalized_boxes.unsqueeze(0))
+        assert torch.all(normalized_boxes <= 1), "normalized box coords >1 found"
 
-    batch_normalized_boxes = torch.cat(batch_normalized_boxes)
+        # add again batch dimension
+        batch_normalized_props.append(normalized_boxes.unsqueeze(0))
+
+    batch_normalized_props = torch.cat(batch_normalized_props)
     batch_out_proposals = np.array(batch_out_proposals)
-    return batch_normalized_boxes, batch_out_proposals
+
+    return batch_normalized_props, batch_out_proposals
 
 
 
+# def pyramid_roi_align(feature_maps, rois, pool_size, pyramid_levels, dim):
+#     """
+#     Implements ROI Pooling on multiple levels of the feature pyramid.
+#     :param feature_maps: list of feature maps, each of shape (b, c, y, x , (z))
+#     :param rois: proposals (normalized coords.) as returned by RPN. contain info about original batch element allocation.
+#     (n_proposals, (y1, x1, y2, x2, (z1), (z2), batch_ixs)
+#     :param pool_size: list of poolsizes in dims: [x, y, (z)]
+#     :param pyramid_levels: list. [0, 1, 2, ...]
+#     :return: pooled: pooled feature map rois (n_proposals, c, poolsize_y, poolsize_x, (poolsize_z))
+#
+#     Output:
+#     Pooled regions in the shape: [num_boxes, height, width, channels].
+#     The width and height are those specific in the pool_shape in the layer
+#     constructor.
+#     """
+#     boxes = rois[:, :dim*2]
+#     batch_ixs = rois[:, dim*2]
+#
+#     # Assign each ROI to a level in the pyramid based on the ROI area.
+#     if dim == 2:
+#         y1, x1, y2, x2 = boxes.chunk(4, dim=1)
+#     else:
+#         y1, x1, y2, x2, z1, z2 = boxes.chunk(6, dim=1)
+#
+#     h = y2 - y1
+#     w = x2 - x1
+#
+#     # Equation 1 in https://arxiv.org/abs/1612.03144. Account for
+#     # the fact that our coordinates are normalized here.
+#     # divide sqrt(h*w) by 1 instead image_area.
+#     roi_level = (4 + mutils.log2(torch.sqrt(h*w))).round().int().clamp(pyramid_levels[0], pyramid_levels[-1])
+#     # if Pyramid contains additional level P6, adapt the roi_level assignemnt accordingly.
+#     if len(pyramid_levels) == 5:
+#         roi_level[h*w > 0.65] = 5
+#
+#     # Loop through levels and apply ROI pooling to each.
+#     pooled = []
+#     box_to_level = []
+#     for level_ix, level in enumerate(pyramid_levels):
+#         ix = roi_level == level
+#         if not ix.any():
+#             continue
+#         ix = torch.nonzero(ix)[:, 0]
+#         level_boxes = boxes[ix, :]
+#         # re-assign rois to feature map of original batch element.
+#         ind = batch_ixs[ix].int()
+#
+#         # Keep track of which box is mapped to which level
+#         box_to_level.append(ix)
+#
+#         # Stop gradient propogation to ROI proposals
+#         level_boxes = level_boxes.detach()
+#
+#         # Crop and Resize
+#         # From Mask R-CNN paper: "We sample four regular locations, so
+#         # that we can evaluate either max or average pooling. In fact,
+#         # interpolating only a single value at each bin center (without
+#         # pooling) is nearly as effective."
+#         #
+#         # Here we use the simplified approach of a single value per bin,
+#         # which is how is done in tf.crop_and_resize()
+#         #
+#         # Also fixed a bug from original implementation, reported in:
+#         # https://hackernoon.com/how-tensorflows-tf-image-resize-stole-60-days-of-my-life-aba5eb093f35
+#
+#         if len(pool_size) == 2:
+#             pooled_features = ra2D(pool_size[0], pool_size[1], 0)(feature_maps[level_ix], level_boxes, ind)
+#         else:
+#             pooled_features = ra3D(pool_size[0], pool_size[1], pool_size[2], 0)(feature_maps[level_ix], level_boxes, ind)
+#
+#         pooled.append(pooled_features)
+#
+#
+#     # Pack pooled features into one tensor
+#     pooled = torch.cat(pooled, dim=0)
+#
+#     # Pack box_to_level mapping into one array and add another
+#     # column representing the order of pooled boxes
+#     box_to_level = torch.cat(box_to_level, dim=0)
+#
+#     # Rearrange pooled features to match the order of the original boxes
+#     _, box_to_level = torch.sort(box_to_level)
+#     pooled = pooled[box_to_level, :, :]
+#
+#     return pooled
+
 def pyramid_roi_align(feature_maps, rois, pool_size, pyramid_levels, dim):
     """
     Implements ROI Pooling on multiple levels of the feature pyramid.
     :param feature_maps: list of feature maps, each of shape (b, c, y, x , (z))
     :param rois: proposals (normalized coords.) as returned by RPN. contain info about original batch element allocation.
     (n_proposals, (y1, x1, y2, x2, (z1), (z2), batch_ixs)
     :param pool_size: list of poolsizes in dims: [x, y, (z)]
     :param pyramid_levels: list. [0, 1, 2, ...]
     :return: pooled: pooled feature map rois (n_proposals, c, poolsize_y, poolsize_x, (poolsize_z))
 
     Output:
     Pooled regions in the shape: [num_boxes, height, width, channels].
     The width and height are those specific in the pool_shape in the layer
     constructor.
     """
     boxes = rois[:, :dim*2]
     batch_ixs = rois[:, dim*2]
 
     # Assign each ROI to a level in the pyramid based on the ROI area.
     if dim == 2:
         y1, x1, y2, x2 = boxes.chunk(4, dim=1)
     else:
         y1, x1, y2, x2, z1, z2 = boxes.chunk(6, dim=1)
 
     h = y2 - y1
     w = x2 - x1
 
     # Equation 1 in https://arxiv.org/abs/1612.03144. Account for
     # the fact that our coordinates are normalized here.
     # divide sqrt(h*w) by 1 instead image_area.
-    roi_level = (4 + mutils.log2(torch.sqrt(h*w))).round().int().clamp(pyramid_levels[0], pyramid_levels[-1])
-    # if Pyramid contains additional level P6, adapt the roi_level assignemnt accordingly.
+    roi_level = (4 + torch.log2(torch.sqrt(h*w))).round().int().clamp(pyramid_levels[0], pyramid_levels[-1])
+    # if Pyramid contains additional level P6, adapt the roi_level assignment accordingly.
     if len(pyramid_levels) == 5:
         roi_level[h*w > 0.65] = 5
 
     # Loop through levels and apply ROI pooling to each.
     pooled = []
     box_to_level = []
+    fmap_shapes = [f.shape for f in feature_maps]
     for level_ix, level in enumerate(pyramid_levels):
         ix = roi_level == level
         if not ix.any():
             continue
         ix = torch.nonzero(ix)[:, 0]
         level_boxes = boxes[ix, :]
         # re-assign rois to feature map of original batch element.
         ind = batch_ixs[ix].int()
 
         # Keep track of which box is mapped to which level
         box_to_level.append(ix)
 
         # Stop gradient propogation to ROI proposals
         level_boxes = level_boxes.detach()
-
-        # Crop and Resize
-        # From Mask R-CNN paper: "We sample four regular locations, so
-        # that we can evaluate either max or average pooling. In fact,
-        # interpolating only a single value at each bin center (without
-        # pooling) is nearly as effective."
-        #
-        # Here we use the simplified approach of a single value per bin,
-        # which is how is done in tf.crop_and_resize()
-        #
-        # Also fixed a bug from original implementation, reported in:
-        # https://hackernoon.com/how-tensorflows-tf-image-resize-stole-60-days-of-my-life-aba5eb093f35
-
         if len(pool_size) == 2:
-            pooled_features = ra2D(pool_size[0], pool_size[1], 0)(feature_maps[level_ix], level_boxes, ind)
+            # remap to feature map coordinate system
+            y_exp, x_exp = fmap_shapes[level_ix][2:]  # exp = expansion
+            level_boxes.mul_(torch.tensor([y_exp, x_exp, y_exp, x_exp], dtype=torch.float32).cuda())
+            pooled_features = roi_align.roi_align_2d(feature_maps[level_ix],
+                                                     torch.cat((ind.unsqueeze(1).float(), level_boxes), dim=1),
+                                                     pool_size)
         else:
-            pooled_features = ra3D(pool_size[0], pool_size[1], pool_size[2], 0)(feature_maps[level_ix], level_boxes, ind)
-
+            y_exp, x_exp, z_exp = fmap_shapes[level_ix][2:]
+            level_boxes.mul_(torch.tensor([y_exp, x_exp, y_exp, x_exp, z_exp, z_exp], dtype=torch.float32).cuda())
+            pooled_features = roi_align.roi_align_3d(feature_maps[level_ix],
+                                                     torch.cat((ind.unsqueeze(1).float(), level_boxes), dim=1),
+                                                     pool_size)
         pooled.append(pooled_features)
 
 
     # Pack pooled features into one tensor
     pooled = torch.cat(pooled, dim=0)
 
     # Pack box_to_level mapping into one array and add another
     # column representing the order of pooled boxes
     box_to_level = torch.cat(box_to_level, dim=0)
 
     # Rearrange pooled features to match the order of the original boxes
     _, box_to_level = torch.sort(box_to_level)
     pooled = pooled[box_to_level, :, :]
 
     return pooled
 
 
 
 def detection_target_layer(batch_proposals, batch_mrcnn_class_scores, batch_gt_class_ids, batch_gt_boxes, cf):
     """
     Subsamples proposals for mrcnn losses and generates targets. Sampling is done per batch element, seems to have positive
     effects on training, as opposed to sampling over entire batch. Negatives are sampled via stochastic-hard-example-mining
     (SHEM), where a number of negative proposals are drawn from larger pool of highest scoring proposals for stochasticity.
     Scoring is obtained here as the max over all foreground probabilities as returned by mrcnn_classifier (worked better than
     loss-based class balancing methods like "online-hard-example-mining" or "focal loss".)
 
     :param batch_proposals: (n_proposals, (y1, x1, y2, x2, (z1), (z2), batch_ixs).
     boxes as proposed by RPN. n_proposals here is determined by batch_size * POST_NMS_ROIS.
     :param batch_mrcnn_class_scores: (n_proposals, n_classes)
     :param batch_gt_class_ids: list over batch elements. Each element is a list over the corresponding roi target labels.
     :param batch_gt_boxes: list over batch elements. Each element is a list over the corresponding roi target coordinates.
     :param batch_gt_masks: list over batch elements. Each element is binary mask of shape (n_gt_rois, y, x, (z), c)
     :return: sample_indices: (n_sampled_rois) indices of sampled proposals to be used for loss functions.
     :return: target_class_ids: (n_sampled_rois)containing target class labels of sampled proposals.
     :return: target_deltas: (n_sampled_rois, 2 * dim) containing target deltas of sampled proposals for box refinement.
     :return: target_masks: (n_sampled_rois, y, x, (z)) containing target masks of sampled proposals.
     """
     # normalization of target coordinates
     if cf.dim == 2:
         h, w = cf.patch_size
         scale = torch.from_numpy(np.array([h, w, h, w])).float().cuda()
     else:
         h, w, z = cf.patch_size
         scale = torch.from_numpy(np.array([h, w, h, w, z, z])).float().cuda()
 
 
     positive_count = 0
     negative_count = 0
     sample_positive_indices = []
     sample_negative_indices = []
     sample_deltas = []
     sample_class_ids = []
 
+    std_dev = torch.from_numpy(cf.bbox_std_dev).float().cuda()
+
     # loop over batch and get positive and negative sample rois.
     for b in range(len(batch_gt_class_ids)):
 
         gt_class_ids = torch.from_numpy(batch_gt_class_ids[b]).int().cuda()
         if np.any(batch_gt_class_ids[b] > 0):  # skip roi selection for no gt images.
             gt_boxes = torch.from_numpy(batch_gt_boxes[b]).float().cuda() / scale
         else:
             gt_boxes = torch.FloatTensor().cuda()
 
         # get proposals and indices of current batch element.
         proposals = batch_proposals[batch_proposals[:, -1] == b][:, :-1]
         batch_element_indices = torch.nonzero(batch_proposals[:, -1] == b).squeeze(1)
 
         # Compute overlaps matrix [proposals, gt_boxes]
         if 0 not in gt_boxes.size():
             if gt_boxes.shape[1] == 4:
                 overlaps = mutils.bbox_overlaps_2D(proposals, gt_boxes)
             else:
                 overlaps = mutils.bbox_overlaps_3D(proposals, gt_boxes)
 
             # Determine postive and negative ROIs
             roi_iou_max = torch.max(overlaps, dim=1)[0]
             # 1. Positive ROIs are those with >= 0.5 IoU with a GT box
             positive_roi_bool = roi_iou_max >= (0.5 if cf.dim == 2 else 0.3)
             # 2. Negative ROIs are those with < 0.1 with every GT box.
             negative_roi_bool = roi_iou_max < (0.1 if cf.dim == 2 else 0.01)
         else:
             positive_roi_bool = torch.FloatTensor().cuda()
             negative_roi_bool = torch.from_numpy(np.array([1]*proposals.shape[0])).cuda()
 
         # Sample Positive ROIs
         if 0 not in torch.nonzero(positive_roi_bool).size():
             positive_indices = torch.nonzero(positive_roi_bool).squeeze(1)
             positive_samples = int(cf.train_rois_per_image * cf.roi_positive_ratio)
             rand_idx = torch.randperm(positive_indices.size()[0])
             rand_idx = rand_idx[:positive_samples].cuda()
             positive_indices = positive_indices[rand_idx]
             positive_samples = positive_indices.size()[0]
             positive_rois = proposals[positive_indices, :]
             # Assign positive ROIs to GT boxes.
             positive_overlaps = overlaps[positive_indices, :]
             roi_gt_box_assignment = torch.max(positive_overlaps, dim=1)[1]
             roi_gt_boxes = gt_boxes[roi_gt_box_assignment, :]
             roi_gt_class_ids = gt_class_ids[roi_gt_box_assignment]
 
             # Compute bbox refinement targets for positive ROIs
             deltas = mutils.box_refinement(positive_rois, roi_gt_boxes)
-            std_dev = torch.from_numpy(cf.bbox_std_dev).float().cuda()
             deltas /= std_dev
 
             sample_positive_indices.append(batch_element_indices[positive_indices])
             sample_deltas.append(deltas)
             sample_class_ids.append(roi_gt_class_ids)
             positive_count += positive_samples
         else:
             positive_samples = 0
 
         # Negative ROIs. Add enough to maintain positive:negative ratio, but at least 1. Sample via SHEM.
         if 0 not in torch.nonzero(negative_roi_bool).size():
             negative_indices = torch.nonzero(negative_roi_bool).squeeze(1)
             r = 1.0 / cf.roi_positive_ratio
             b_neg_count = np.max((int(r * positive_samples - positive_samples), 1))
             roi_probs_neg = batch_mrcnn_class_scores[batch_element_indices[negative_indices]]
             raw_sampled_indices = mutils.shem(roi_probs_neg, b_neg_count, cf.shem_poolsize)
             sample_negative_indices.append(batch_element_indices[negative_indices[raw_sampled_indices]])
             negative_count += raw_sampled_indices.size()[0]
 
     if len(sample_positive_indices) > 0:
         target_deltas = torch.cat(sample_deltas)
         target_class_ids = torch.cat(sample_class_ids)
 
     # Pad target information with zeros for negative ROIs.
     if positive_count > 0 and negative_count > 0:
         sample_indices = torch.cat((torch.cat(sample_positive_indices), torch.cat(sample_negative_indices)), dim=0)
         zeros = torch.zeros(negative_count).int().cuda()
         target_class_ids = torch.cat([target_class_ids, zeros], dim=0)
         zeros = torch.zeros(negative_count, cf.dim * 2).cuda()
         target_deltas = torch.cat([target_deltas, zeros], dim=0)
     elif positive_count > 0:
         sample_indices = torch.cat(sample_positive_indices)
     elif negative_count > 0:
         sample_indices = torch.cat(sample_negative_indices)
         zeros = torch.zeros(negative_count).int().cuda()
         target_class_ids = zeros
         zeros = torch.zeros(negative_count, cf.dim * 2).cuda()
         target_deltas = zeros
     else:
         sample_indices = torch.LongTensor().cuda()
         target_class_ids = torch.IntTensor().cuda()
         target_deltas = torch.FloatTensor().cuda()
 
     return sample_indices, target_class_ids, target_deltas
 
 
 ############################################################
 #  Output Handler
 ############################################################
 
-def refine_detections(rois, probs, deltas, batch_ixs, cf):
+# def refine_detections(rois, probs, deltas, batch_ixs, cf):
+#     """
+#     Refine classified proposals, filter overlaps and return final detections.
+#
+#     :param rois: (n_proposals, 2 * dim) normalized boxes as proposed by RPN. n_proposals = batch_size * POST_NMS_ROIS
+#     :param probs: (n_proposals, n_classes) softmax probabilities for all rois as predicted by mrcnn classifier.
+#     :param deltas: (n_proposals, n_classes, 2 * dim) box refinement deltas as predicted by mrcnn bbox regressor.
+#     :param batch_ixs: (n_proposals) batch element assignemnt info for re-allocation.
+#     :return: result: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score))
+#     """
+#     # class IDs per ROI. Since scores of all classes are of interest (not just max class), all are kept at this point.
+#     class_ids = []
+#     fg_classes = cf.head_classes - 1
+#     # repeat vectors to fill in predictions for all foreground classes.
+#     for ii in range(1, fg_classes + 1):
+#         class_ids += [ii] * rois.shape[0]
+#     class_ids = torch.from_numpy(np.array(class_ids)).cuda()
+#
+#     rois = rois.repeat(fg_classes, 1)
+#     probs = probs.repeat(fg_classes, 1)
+#     deltas = deltas.repeat(fg_classes, 1, 1)
+#     batch_ixs = batch_ixs.repeat(fg_classes)
+#
+#     # get class-specific scores and  bounding box deltas
+#     idx = torch.arange(class_ids.size()[0]).long().cuda()
+#     class_scores = probs[idx, class_ids]
+#     deltas_specific = deltas[idx, class_ids]
+#     batch_ixs = batch_ixs[idx]
+#
+#     # apply bounding box deltas. re-scale to image coordinates.
+#     std_dev = torch.from_numpy(np.reshape(cf.rpn_bbox_std_dev, [1, cf.dim * 2])).float().cuda()
+#     scale = torch.from_numpy(cf.scale).float().cuda()
+#     refined_rois = mutils.apply_box_deltas_2D(rois, deltas_specific * std_dev) * scale if cf.dim == 2 else \
+#         mutils.apply_box_deltas_3D(rois, deltas_specific * std_dev) * scale
+#
+#     # round and cast to int since we're deadling with pixels now
+#     refined_rois = mutils.clip_to_window(cf.window, refined_rois)
+#     refined_rois = torch.round(refined_rois)
+#
+#     # filter out low confidence boxes
+#     keep = idx
+#     keep_bool = (class_scores >= cf.model_min_confidence)
+#     if 0 not in torch.nonzero(keep_bool).size():
+#
+#         score_keep = torch.nonzero(keep_bool)[:, 0]
+#         pre_nms_class_ids = class_ids[score_keep]
+#         pre_nms_rois = refined_rois[score_keep]
+#         pre_nms_scores = class_scores[score_keep]
+#         pre_nms_batch_ixs = batch_ixs[score_keep]
+#
+#         for j, b in enumerate(mutils.unique1d(pre_nms_batch_ixs)):
+#
+#             bixs = torch.nonzero(pre_nms_batch_ixs == b)[:, 0]
+#             bix_class_ids = pre_nms_class_ids[bixs]
+#             bix_rois = pre_nms_rois[bixs]
+#             bix_scores = pre_nms_scores[bixs]
+#
+#             for i, class_id in enumerate(mutils.unique1d(bix_class_ids)):
+#
+#                 ixs = torch.nonzero(bix_class_ids == class_id)[:, 0]
+#                 # nms expects boxes sorted by score.
+#                 ix_rois = bix_rois[ixs]
+#                 ix_scores = bix_scores[ixs]
+#                 ix_scores, order = ix_scores.sort(descending=True)
+#                 ix_rois = ix_rois[order, :]
+#
+#                 if cf.dim == 2:
+#                     class_keep = nms_2D(torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1), cf.detection_nms_threshold)
+#                 else:
+#                     class_keep = nms_3D(torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1), cf.detection_nms_threshold)
+#
+#                 # map indices back.
+#                 class_keep = keep[score_keep[bixs[ixs[order[class_keep]]]]]
+#                 # merge indices over classes for current batch element
+#                 b_keep = class_keep if i == 0 else mutils.unique1d(torch.cat((b_keep, class_keep)))
+#
+#             # only keep top-k boxes of current batch-element
+#             top_ids = class_scores[b_keep].sort(descending=True)[1][:cf.model_max_instances_per_batch_element]
+#             b_keep = b_keep[top_ids]
+#
+#             # merge indices over batch elements.
+#             batch_keep = b_keep if j == 0 else mutils.unique1d(torch.cat((batch_keep, b_keep)))
+#
+#         keep = batch_keep
+#
+#     else:
+#         keep = torch.tensor([0]).long().cuda()
+#
+#     # arrange output
+#     result = torch.cat((refined_rois[keep],
+#                         batch_ixs[keep].unsqueeze(1),
+#                         class_ids[keep].unsqueeze(1).float(),
+#                         class_scores[keep].unsqueeze(1)), dim=1)
+#
+#     return result
+
+
+def refine_detections(cf, batch_ixs, rois, deltas, scores):
     """
-    Refine classified proposals, filter overlaps and return final detections.
+    Refine classified proposals (apply deltas to rpn rois), filter overlaps (nms) and return final detections.
 
     :param rois: (n_proposals, 2 * dim) normalized boxes as proposed by RPN. n_proposals = batch_size * POST_NMS_ROIS
-    :param probs: (n_proposals, n_classes) softmax probabilities for all rois as predicted by mrcnn classifier.
     :param deltas: (n_proposals, n_classes, 2 * dim) box refinement deltas as predicted by mrcnn bbox regressor.
-    :param batch_ixs: (n_proposals) batch element assignemnt info for re-allocation.
-    :return: result: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score))
+    :param batch_ixs: (n_proposals) batch element assignment info for re-allocation.
+    :param scores: (n_proposals, n_classes) probabilities for all classes per roi as predicted by mrcnn classifier.
+    :return: result: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score, *regression vector features))
     """
     # class IDs per ROI. Since scores of all classes are of interest (not just max class), all are kept at this point.
     class_ids = []
     fg_classes = cf.head_classes - 1
     # repeat vectors to fill in predictions for all foreground classes.
     for ii in range(1, fg_classes + 1):
         class_ids += [ii] * rois.shape[0]
     class_ids = torch.from_numpy(np.array(class_ids)).cuda()
 
+    batch_ixs = batch_ixs.repeat(fg_classes)
     rois = rois.repeat(fg_classes, 1)
-    probs = probs.repeat(fg_classes, 1)
     deltas = deltas.repeat(fg_classes, 1, 1)
-    batch_ixs = batch_ixs.repeat(fg_classes)
+    scores = scores.repeat(fg_classes, 1)
 
     # get class-specific scores and  bounding box deltas
     idx = torch.arange(class_ids.size()[0]).long().cuda()
-    class_scores = probs[idx, class_ids]
-    deltas_specific = deltas[idx, class_ids]
+    # using idx instead of slice [:,] squashes first dimension.
+    #len(class_ids)>scores.shape[1] --> probs is broadcasted by expansion from fg_classes-->len(class_ids)
     batch_ixs = batch_ixs[idx]
+    deltas_specific = deltas[idx, class_ids]
+    class_scores = scores[idx, class_ids]
 
     # apply bounding box deltas. re-scale to image coordinates.
     std_dev = torch.from_numpy(np.reshape(cf.rpn_bbox_std_dev, [1, cf.dim * 2])).float().cuda()
     scale = torch.from_numpy(cf.scale).float().cuda()
     refined_rois = mutils.apply_box_deltas_2D(rois, deltas_specific * std_dev) * scale if cf.dim == 2 else \
         mutils.apply_box_deltas_3D(rois, deltas_specific * std_dev) * scale
 
-    # round and cast to int since we're deadling with pixels now
+    # round and cast to int since we're dealing with pixels now
     refined_rois = mutils.clip_to_window(cf.window, refined_rois)
     refined_rois = torch.round(refined_rois)
 
     # filter out low confidence boxes
     keep = idx
     keep_bool = (class_scores >= cf.model_min_confidence)
-    if 0 not in torch.nonzero(keep_bool).size():
+    if not 0 in torch.nonzero(keep_bool).size():
 
         score_keep = torch.nonzero(keep_bool)[:, 0]
         pre_nms_class_ids = class_ids[score_keep]
         pre_nms_rois = refined_rois[score_keep]
         pre_nms_scores = class_scores[score_keep]
         pre_nms_batch_ixs = batch_ixs[score_keep]
 
         for j, b in enumerate(mutils.unique1d(pre_nms_batch_ixs)):
 
             bixs = torch.nonzero(pre_nms_batch_ixs == b)[:, 0]
             bix_class_ids = pre_nms_class_ids[bixs]
             bix_rois = pre_nms_rois[bixs]
             bix_scores = pre_nms_scores[bixs]
 
             for i, class_id in enumerate(mutils.unique1d(bix_class_ids)):
 
                 ixs = torch.nonzero(bix_class_ids == class_id)[:, 0]
                 # nms expects boxes sorted by score.
                 ix_rois = bix_rois[ixs]
                 ix_scores = bix_scores[ixs]
                 ix_scores, order = ix_scores.sort(descending=True)
                 ix_rois = ix_rois[order, :]
 
-                if cf.dim == 2:
-                    class_keep = nms_2D(torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1), cf.detection_nms_threshold)
-                else:
-                    class_keep = nms_3D(torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1), cf.detection_nms_threshold)
+                class_keep = nms.nms(ix_rois, ix_scores, cf.detection_nms_threshold)
 
                 # map indices back.
                 class_keep = keep[score_keep[bixs[ixs[order[class_keep]]]]]
                 # merge indices over classes for current batch element
                 b_keep = class_keep if i == 0 else mutils.unique1d(torch.cat((b_keep, class_keep)))
 
             # only keep top-k boxes of current batch-element
             top_ids = class_scores[b_keep].sort(descending=True)[1][:cf.model_max_instances_per_batch_element]
             b_keep = b_keep[top_ids]
 
             # merge indices over batch elements.
-            batch_keep = b_keep if j == 0 else mutils.unique1d(torch.cat((batch_keep, b_keep)))
+            batch_keep = b_keep  if j == 0 else mutils.unique1d(torch.cat((batch_keep, b_keep)))
 
         keep = batch_keep
 
     else:
         keep = torch.tensor([0]).long().cuda()
 
     # arrange output
-    result = torch.cat((refined_rois[keep],
-                        batch_ixs[keep].unsqueeze(1),
-                        class_ids[keep].unsqueeze(1).float(),
-                        class_scores[keep].unsqueeze(1)), dim=1)
+    output = [refined_rois[keep], batch_ixs[keep].unsqueeze(1)]
+    output += [class_ids[keep].unsqueeze(1).float(), class_scores[keep].unsqueeze(1)]
 
+    result = torch.cat(output, dim=1)
+    # shape: (n_keeps, catted feats), catted feats: [0:dim*2] are box_coords, [dim*2] are batch_ics,
+    # [dim*2+1] are class_ids, [dim*2+2] are scores, [dim*2+3:] are regression vector features (incl uncertainty)
     return result
 
 
 def get_results(cf, img_shape, detections, seg_logits, box_results_list=None):
     """
     Restores batch dimension of merged detections, unmolds detections, creates and fills results dict.
     :param img_shape:
     :param detections: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score)
     :param detection_masks: (n_final_detections, n_classes, y, x, (z)) raw molded masks as returned by mask-head.
     :param box_results_list: None or list of output boxes for monitoring/plotting.
     each element is a list of boxes per batch element.
     :param return_masks: boolean. If True, full resolution masks are returned for all proposals (speed trade-off).
     :return: results_dict: dictionary with keys:
              'boxes': list over batch elements. each batch element is a list of boxes. each box is a dictionary:
                       [[{box_0}, ... {box_n}], [{box_0}, ... {box_n}], ...]
              'seg_preds': pixel-wise class predictions (b, 1, y, x, (z)) with values [0, 1] only fg. vs. bg for now.
              class-specific return of masks will come with implementation of instance segmentation evaluation.
     """
     detections = detections.cpu().data.numpy()
 
     # restore batch dimension of merged detections using the batch_ix info.
     batch_ixs = detections[:, cf.dim*2]
     detections = [detections[batch_ixs == ix] for ix in range(img_shape[0])]
 
     # for test_forward, where no previous list exists.
     if box_results_list is None:
         box_results_list = [[] for _ in range(img_shape[0])]
 
     seg_preds = []
     # loop over batch and unmold detections.
     for ix in range(img_shape[0]):
 
         if 0 not in detections[ix].shape:
             boxes = detections[ix][:, :2 * cf.dim].astype(np.int32)
             class_ids = detections[ix][:, 2 * cf.dim + 1].astype(np.int32)
             scores = detections[ix][:, 2 * cf.dim + 2]
 
             # Filter out detections with zero area. Often only happens in early
             # stages of training when the network weights are still a bit random.
             if cf.dim == 2:
                 exclude_ix = np.where((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0]
             else:
                 exclude_ix = np.where(
                     (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 5] - boxes[:, 4]) <= 0)[0]
 
             if exclude_ix.shape[0] > 0:
                 boxes = np.delete(boxes, exclude_ix, axis=0)
                 class_ids = np.delete(class_ids, exclude_ix, axis=0)
                 scores = np.delete(scores, exclude_ix, axis=0)
 
             # add final perdictions to results.
             if 0 not in boxes.shape:
                 for ix2, score in enumerate(scores):
                     if score >= cf.model_min_confidence:
                         box_results_list[ix].append({'box_coords': boxes[ix2], 'box_score': score,
                                                      'box_type': 'det', 'box_pred_class_id': class_ids[ix2]})
 
     # create and fill results dictionary.
     results_dict = {'boxes': box_results_list}
     if seg_logits is None:
         # output dummy segmentation for retina_net.
         results_dict['seg_preds'] = np.zeros(img_shape)[:, 0][:, np.newaxis]
     else:
         # output label maps for retina_unet.
         results_dict['seg_preds'] = F.softmax(seg_logits, 1).argmax(1).cpu().data.numpy()[:, np.newaxis].astype('uint8')
 
     return results_dict
 
 
 ############################################################
 #  Mask R-CNN Class
 ############################################################
 
 class net(nn.Module):
 
 
     def __init__(self, cf, logger):
 
         super(net, self).__init__()
         self.cf = cf
         self.logger = logger
         self.build()
 
         if self.cf.weight_init is not None:
             logger.info("using pytorch weight init of type {}".format(self.cf.weight_init))
             mutils.initialize_weights(self)
         else:
             logger.info("using default pytorch weight init")
 
 
     def build(self):
         """Build Mask R-CNN architecture."""
 
         # Image size must be dividable by 2 multiple times.
         h, w = self.cf.patch_size[:2]
         if h / 2**5 != int(h / 2**5) or w / 2**5 != int(w / 2**5):
             raise Exception("Image size must be dividable by 2 at least 5 times "
                             "to avoid fractions when downscaling and upscaling."
                             "For example, use 256, 320, 384, 448, 512, ... etc. ")
 
         # instanciate abstract multi dimensional conv class and backbone class.
         conv = mutils.NDConvGenerator(self.cf.dim)
         backbone = utils.import_module('bbone', self.cf.backbone_path)
 
         # build Anchors, FPN, RPN, Classifier / Bbox-Regressor -head, Mask-head
         self.np_anchors = mutils.generate_pyramid_anchors(self.logger, self.cf)
         self.anchors = torch.from_numpy(self.np_anchors).float().cuda()
         self.fpn = backbone.FPN(self.cf, conv, operate_stride1=True)
         self.rpn = RPN(self.cf, conv)
         self.classifier = Classifier(self.cf, conv)
         self.mask = Mask(self.cf, conv)
         self.final_conv = conv(self.cf.end_filts, self.cf.num_seg_classes, ks=1, pad=0, norm=self.cf.norm, relu=None)
 
 
     def train_forward(self, batch, is_validation=False):
         """
         train method (also used for validation monitoring). wrapper around forward pass of network. prepares input data
         for processing, computes losses, and stores outputs in a dictionary.
         :param batch: dictionary containing 'data', 'seg', etc.
         :return: results_dict: dictionary with keys:
                 'boxes': list over batch elements. each batch element is a list of boxes. each box is a dictionary:
                         [[{box_0}, ... {box_n}], [{box_0}, ... {box_n}], ...]
                 'seg_preds': pixel-wise class predictions (b, 1, y, x, (z)) with values [0, n_classes].
                 'torch_loss': 1D torch tensor for backprop.
                 'class_loss': classification loss for monitoring.
         """
         img = batch['data']
         gt_class_ids = batch['roi_labels']
         gt_boxes = batch['bb_target']
         axes = (0, 2, 3, 1) if self.cf.dim == 2 else (0, 2, 3, 4, 1)
         var_seg_ohe = torch.FloatTensor(mutils.get_one_hot_encoding(batch['seg'], self.cf.num_seg_classes)).cuda()
         var_seg = torch.LongTensor(batch['seg']).cuda()
 
 
         img = torch.from_numpy(img).float().cuda()
         batch_rpn_class_loss = torch.FloatTensor([0]).cuda()
         batch_rpn_bbox_loss = torch.FloatTensor([0]).cuda()
 
         # list of output boxes for monitoring/plotting. each element is a list of boxes per batch element.
         box_results_list = [[] for _ in range(img.shape[0])]
 
         #forward passes. 1. general forward pass, where no activations are saved in second stage (for performance
         # monitoring and loss sampling). 2. second stage forward pass of sampled rois with stored activations for backprop.
         rpn_class_logits, rpn_pred_deltas, proposal_boxes, detections, seg_logits = self.forward(img)
         mrcnn_class_logits, mrcnn_pred_deltas, target_class_ids, mrcnn_target_deltas,  \
         sample_proposals = self.loss_samples_forward(gt_class_ids, gt_boxes)
 
         # loop over batch
         for b in range(img.shape[0]):
             if len(gt_boxes[b]) > 0:
 
                 # add gt boxes to output list for monitoring.
                 for ix in range(len(gt_boxes[b])):
                     box_results_list[b].append({'box_coords': batch['bb_target'][b][ix],
                                                 'box_label': batch['roi_labels'][b][ix], 'box_type': 'gt'})
 
                 # match gt boxes with anchors to generate targets for RPN losses.
                 rpn_match, rpn_target_deltas = mutils.gt_anchor_matching(self.cf, self.np_anchors, gt_boxes[b])
 
                 # add positive anchors used for loss to output list for monitoring.
                 pos_anchors = mutils.clip_boxes_numpy(self.np_anchors[np.argwhere(rpn_match == 1)][:, 0], img.shape[2:])
                 for p in pos_anchors:
                     box_results_list[b].append({'box_coords': p, 'box_type': 'pos_anchor'})
 
             else:
                 rpn_match = np.array([-1]*self.np_anchors.shape[0])
                 rpn_target_deltas = np.array([0])
 
-            rpn_match = torch.from_numpy(rpn_match).cuda()
+            rpn_match_gpu = torch.from_numpy(rpn_match).cuda()
             rpn_target_deltas = torch.from_numpy(rpn_target_deltas).float().cuda()
 
             # compute RPN losses.
-            rpn_class_loss, neg_anchor_ix = compute_rpn_class_loss(rpn_match, rpn_class_logits[b], self.cf.shem_poolsize)
-            rpn_bbox_loss = compute_rpn_bbox_loss(rpn_target_deltas, rpn_pred_deltas[b], rpn_match)
+            rpn_class_loss, neg_anchor_ix = compute_rpn_class_loss(rpn_match_gpu, rpn_class_logits[b], self.cf.shem_poolsize)
+            rpn_bbox_loss = compute_rpn_bbox_loss(rpn_target_deltas, rpn_pred_deltas[b], rpn_match_gpu)
             batch_rpn_class_loss += rpn_class_loss / img.shape[0]
             batch_rpn_bbox_loss += rpn_bbox_loss / img.shape[0]
 
             # add negative anchors used for loss to output list for monitoring.
-            neg_anchors = mutils.clip_boxes_numpy(self.np_anchors[np.argwhere(rpn_match == -1)][0, neg_anchor_ix], img.shape[2:])
+            neg_anchors = mutils.clip_boxes_numpy(self.np_anchors[rpn_match == -1][neg_anchor_ix], img.shape[2:])
             for n in neg_anchors:
                 box_results_list[b].append({'box_coords': n, 'box_type': 'neg_anchor'})
 
             # add highest scoring proposals to output list for monitoring.
             rpn_proposals = proposal_boxes[b][proposal_boxes[b, :, -1].argsort()][::-1]
             for r in rpn_proposals[:self.cf.n_plot_rpn_props, :-1]:
                 box_results_list[b].append({'box_coords': r, 'box_type': 'prop'})
 
         # add positive and negative roi samples used for mrcnn losses to output list for monitoring.
         if 0 not in sample_proposals.shape:
             rois = mutils.clip_to_window(self.cf.window, sample_proposals).cpu().data.numpy()
             for ix, r in enumerate(rois):
                 box_results_list[int(r[-1])].append({'box_coords': r[:-1] * self.cf.scale,
                                             'box_type': 'pos_class' if target_class_ids[ix] > 0 else 'neg_class'})
 
         batch_rpn_class_loss = batch_rpn_class_loss
         batch_rpn_bbox_loss = batch_rpn_bbox_loss
 
         # compute mrcnn losses.
         mrcnn_class_loss = compute_mrcnn_class_loss(target_class_ids, mrcnn_class_logits)
         mrcnn_bbox_loss = compute_mrcnn_bbox_loss(mrcnn_target_deltas, mrcnn_pred_deltas, target_class_ids)
 
         # mrcnn can be run without pixelwise annotations available (Faster R-CNN mode).
         # In this case, the mask_loss is taken out of training.
         # if not self.cf.frcnn_mode:
         #     mrcnn_mask_loss = compute_mrcnn_mask_loss(target_mask, mrcnn_pred_mask, target_class_ids)
         # else:
         #     mrcnn_mask_loss = torch.FloatTensor([0]).cuda()
 
         seg_loss_dice = 1 - mutils.batch_dice(F.softmax(seg_logits, dim=1), var_seg_ohe)
         seg_loss_ce = F.cross_entropy(seg_logits, var_seg[:, 0])
 
         loss = batch_rpn_class_loss + batch_rpn_bbox_loss + mrcnn_class_loss + mrcnn_bbox_loss + (seg_loss_dice + seg_loss_ce) / 2
 
         # monitor RPN performance: detection count = the number of correctly matched proposals per fg-class.
         dcount = [list(target_class_ids.cpu().data.numpy()).count(c) for c in np.arange(self.cf.head_classes)[1:]]
 
         # run unmolding of predictions for monitoring and merge all results to one dictionary.
         results_dict = get_results(self.cf, img.shape, detections, seg_logits, box_results_list)
         results_dict['torch_loss'] = loss
         results_dict['monitor_values'] = {'loss': loss.item(), 'class_loss': mrcnn_class_loss.item()}
         results_dict['logger_string'] = "loss: {0:.2f}, rpn_class: {1:.2f}, rpn_bbox: {2:.2f}, mrcnn_class: {3:.2f}, " \
                                         "mrcnn_bbox: {4:.2f}, dice_loss: {5:.2f}, dcount {6}"\
             .format(loss.item(), batch_rpn_class_loss.item(), batch_rpn_bbox_loss.item(), mrcnn_class_loss.item(),
                     mrcnn_bbox_loss.item(), seg_loss_dice.item(), dcount)
 
         return results_dict
 
 
     def test_forward(self, batch, return_masks=True):
         """
         test method. wrapper around forward pass of network without usage of any ground truth information.
         prepares input data for processing and stores outputs in a dictionary.
         :param batch: dictionary containing 'data'
         :param return_masks: boolean. If True, full resolution masks are returned for all proposals (speed trade-off).
         :return: results_dict: dictionary with keys:
                'boxes': list over batch elements. each batch element is a list of boxes. each box is a dictionary:
                        [[{box_0}, ... {box_n}], [{box_0}, ... {box_n}], ...]
                'seg_preds': pixel-wise class predictions (b, 1, y, x, (z)) with values [0, n_classes]
         """
         img = batch['data']
         img = torch.from_numpy(img).float().cuda()
         _, _, _, detections, seg_logits = self.forward(img)
         results_dict = get_results(self.cf, img.shape, detections, seg_logits)
         return results_dict
 
 
     def forward(self, img, is_training=True):
         """
         :param img: input images (b, c, y, x, (z)).
         :return: rpn_pred_logits: (b, n_anchors, 2)
         :return: rpn_pred_deltas: (b, n_anchors, (y, x, (z), log(h), log(w), (log(d))))
         :return: batch_proposal_boxes: (b, n_proposals, (y1, x1, y2, x2, (z1), (z2), batch_ix)) only for monitoring/plotting.
         :return: detections: (n_final_detections, (y1, x1, y2, x2, (z1), (z2), batch_ix, pred_class_id, pred_score)
         :return: detection_masks: (n_final_detections, n_classes, y, x, (z)) raw molded masks as returned by mask-head.
         """
         # extract features.
         fpn_outs = self.fpn(img)
         seg_logits = self.final_conv(fpn_outs[0])
         rpn_feature_maps = [fpn_outs[i + 1] for i in self.cf.pyramid_levels]
         self.mrcnn_feature_maps = rpn_feature_maps
 
         # loop through pyramid layers and apply RPN.
         layer_outputs = []  # list of lists
         for p in rpn_feature_maps:
             layer_outputs.append(self.rpn(p))
 
         # concatenate layer outputs.
         # convert from list of lists of level outputs to list of lists of outputs across levels.
         # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
         outputs = list(zip(*layer_outputs))
         outputs = [torch.cat(list(o), dim=1) for o in outputs]
         rpn_pred_logits, rpn_pred_probs, rpn_pred_deltas = outputs
 
         # generate proposals: apply predicted deltas to anchors and filter by foreground scores from RPN classifier.
         proposal_count = self.cf.post_nms_rois_training if is_training else self.cf.post_nms_rois_inference
-        batch_rpn_rois, batch_proposal_boxes = proposal_layer(rpn_pred_probs, rpn_pred_deltas, proposal_count, self.anchors, self.cf)
+        batch_rpn_rois, batch_proposal_boxes = refine_proposals(rpn_pred_probs, rpn_pred_deltas, proposal_count, self.anchors, self.cf)
 
         # merge batch dimension of proposals while storing allocation info in coordinate dimension.
         batch_ixs = torch.from_numpy(np.repeat(np.arange(batch_rpn_rois.shape[0]), batch_rpn_rois.shape[1])).float().cuda()
         rpn_rois = batch_rpn_rois.view(-1, batch_rpn_rois.shape[2])
         self.rpn_rois_batch_info = torch.cat((rpn_rois, batch_ixs.unsqueeze(1)), dim=1)
 
         # this is the first of two forward passes in the second stage, where no activations are stored for backprop.
         # here, all proposals are forwarded (with virtual_batch_size = batch_size * post_nms_rois.)
         # for inference/monitoring as well as sampling of rois for the loss functions.
         # processed in chunks of roi_chunk_size to re-adjust to gpu-memory.
         chunked_rpn_rois = self.rpn_rois_batch_info.split(self.cf.roi_chunk_size)
         class_logits_list, bboxes_list = [], []
         with torch.no_grad():
             for chunk in chunked_rpn_rois:
                 chunk_class_logits, chunk_bboxes = self.classifier(self.mrcnn_feature_maps, chunk)
                 class_logits_list.append(chunk_class_logits)
                 bboxes_list.append(chunk_bboxes)
         batch_mrcnn_class_logits = torch.cat(class_logits_list, 0)
         batch_mrcnn_bbox = torch.cat(bboxes_list, 0)
         self.batch_mrcnn_class_scores = F.softmax(batch_mrcnn_class_logits, dim=1)
 
         # refine classified proposals, filter and return final detections.
-        detections = refine_detections(rpn_rois, self.batch_mrcnn_class_scores, batch_mrcnn_bbox, batch_ixs, self.cf, )
+        detections = refine_detections(self.cf, batch_ixs, rpn_rois, batch_mrcnn_bbox, batch_ixs)
 
         return [rpn_pred_logits, rpn_pred_deltas, batch_proposal_boxes, detections, seg_logits]
 
 
     def loss_samples_forward(self, batch_gt_class_ids, batch_gt_boxes):
         """
         this is the second forward pass through the second stage (features from stage one are re-used).
         samples few rois in detection_target_layer and forwards only those for loss computation.
         :param batch_gt_class_ids: list over batch elements. Each element is a list over the corresponding roi target labels.
         :param batch_gt_boxes: list over batch elements. Each element is a list over the corresponding roi target coordinates.
         :param batch_gt_masks: list over batch elements. Each element is binary mask of shape (n_gt_rois, y, x, (z), c)
         :return: sample_logits: (n_sampled_rois, n_classes) predicted class scores.
         :return: sample_boxes: (n_sampled_rois, n_classes, 2 * dim) predicted corrections to be applied to proposals for refinement.
         :return: sample_mask: (n_sampled_rois, n_classes, y, x, (z)) predicted masks per class and proposal.
         :return: sample_target_class_ids: (n_sampled_rois) target class labels of sampled proposals.
         :return: sample_target_deltas: (n_sampled_rois, 2 * dim) target deltas of sampled proposals for box refinement.
         :return: sample_target_masks: (n_sampled_rois, y, x, (z)) target masks of sampled proposals.
         :return: sample_proposals: (n_sampled_rois, 2 * dim) RPN output for sampled proposals. only for monitoring/plotting.
         """
         # sample rois for loss and get corresponding targets for all Mask R-CNN head network losses.
         sample_ix, sample_target_class_ids, sample_target_deltas = \
             detection_target_layer(self.rpn_rois_batch_info, self.batch_mrcnn_class_scores,
                                    batch_gt_class_ids, batch_gt_boxes, self.cf)
 
         # re-use feature maps and RPN output from first forward pass.
         sample_proposals = self.rpn_rois_batch_info[sample_ix]
         if 0 not in sample_proposals.size():
             sample_logits, sample_boxes = self.classifier(self.mrcnn_feature_maps, sample_proposals)
         else:
             sample_logits = torch.FloatTensor().cuda()
             sample_boxes = torch.FloatTensor().cuda()
 
         return [sample_logits, sample_boxes, sample_target_class_ids, sample_target_deltas, sample_proposals]
diff --git a/utils/model_utils.py b/utils/model_utils.py
index 0453d9a..dfd40d3 100644
--- a/utils/model_utils.py
+++ b/utils/model_utils.py
@@ -1,891 +1,891 @@
 #!/usr/bin/env python
 # Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 """
 Parts are based on https://github.com/multimodallearning/pytorch-mask-rcnn
 published under MIT license.
 """
 
 import numpy as np
 import scipy.misc
 import scipy.ndimage
 import torch
 from torch.autograd import Variable
 import torch.nn as nn
 
 
 ############################################################
 #  Bounding Boxes
 ############################################################
 
 
 def compute_iou_2D(box, boxes, box_area, boxes_area):
     """Calculates IoU of the given box with the array of the given boxes.
     box: 1D vector [y1, x1, y2, x2] THIS IS THE GT BOX
     boxes: [boxes_count, (y1, x1, y2, x2)]
     box_area: float. the area of 'box'
     boxes_area: array of length boxes_count.
 
     Note: the areas are passed in rather than calculated here for
           efficency. Calculate once in the caller to avoid duplicate work.
     """
     # Calculate intersection areas
     y1 = np.maximum(box[0], boxes[:, 0])
     y2 = np.minimum(box[2], boxes[:, 2])
     x1 = np.maximum(box[1], boxes[:, 1])
     x2 = np.minimum(box[3], boxes[:, 3])
     intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
     union = box_area + boxes_area[:] - intersection[:]
     iou = intersection / union
 
     return iou
 
 
 
 def compute_iou_3D(box, boxes, box_volume, boxes_volume):
     """Calculates IoU of the given box with the array of the given boxes.
     box: 1D vector [y1, x1, y2, x2, z1, z2] (typically gt box)
     boxes: [boxes_count, (y1, x1, y2, x2, z1, z2)]
     box_area: float. the area of 'box'
     boxes_area: array of length boxes_count.
 
     Note: the areas are passed in rather than calculated here for
           efficency. Calculate once in the caller to avoid duplicate work.
     """
     # Calculate intersection areas
     y1 = np.maximum(box[0], boxes[:, 0])
     y2 = np.minimum(box[2], boxes[:, 2])
     x1 = np.maximum(box[1], boxes[:, 1])
     x2 = np.minimum(box[3], boxes[:, 3])
     z1 = np.maximum(box[4], boxes[:, 4])
     z2 = np.minimum(box[5], boxes[:, 5])
     intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0) * np.maximum(z2 - z1, 0)
     union = box_volume + boxes_volume[:] - intersection[:]
     iou = intersection / union
 
     return iou
 
 
 
 def compute_overlaps(boxes1, boxes2):
     """Computes IoU overlaps between two sets of boxes.
     boxes1, boxes2: [N, (y1, x1, y2, x2)]. / 3D: (z1, z2))
     For better performance, pass the largest set first and the smaller second.
     """
     # Areas of anchors and GT boxes
     if boxes1.shape[1] == 4:
         area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
         area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
         # Compute overlaps to generate matrix [boxes1 count, boxes2 count]
         # Each cell contains the IoU value.
         overlaps = np.zeros((boxes1.shape[0], boxes2.shape[0]))
         for i in range(overlaps.shape[1]):
             box2 = boxes2[i] #this is the gt box
             overlaps[:, i] = compute_iou_2D(box2, boxes1, area2[i], area1)
         return overlaps
 
     else:
         # Areas of anchors and GT boxes
         volume1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1]) * (boxes1[:, 5] - boxes1[:, 4])
         volume2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1]) * (boxes2[:, 5] - boxes2[:, 4])
         # Compute overlaps to generate matrix [boxes1 count, boxes2 count]
         # Each cell contains the IoU value.
         overlaps = np.zeros((boxes1.shape[0], boxes2.shape[0]))
         for i in range(overlaps.shape[1]):
             box2 = boxes2[i]  # this is the gt box
             overlaps[:, i] = compute_iou_3D(box2, boxes1, volume2[i], volume1)
         return overlaps
 
 
 
 def box_refinement(box, gt_box):
     """Compute refinement needed to transform box to gt_box.
     box and gt_box are [N, (y1, x1, y2, x2)] / 3D: (z1, z2))
     """
     height = box[:, 2] - box[:, 0]
     width = box[:, 3] - box[:, 1]
     center_y = box[:, 0] + 0.5 * height
     center_x = box[:, 1] + 0.5 * width
 
     gt_height = gt_box[:, 2] - gt_box[:, 0]
     gt_width = gt_box[:, 3] - gt_box[:, 1]
     gt_center_y = gt_box[:, 0] + 0.5 * gt_height
     gt_center_x = gt_box[:, 1] + 0.5 * gt_width
 
     dy = (gt_center_y - center_y) / height
     dx = (gt_center_x - center_x) / width
     dh = torch.log(gt_height / height)
     dw = torch.log(gt_width / width)
     result = torch.stack([dy, dx, dh, dw], dim=1)
 
     if box.shape[1] > 4:
         depth = box[:, 5] - box[:, 4]
         center_z = box[:, 4] + 0.5 * depth
         gt_depth = gt_box[:, 5] - gt_box[:, 4]
         gt_center_z = gt_box[:, 4] + 0.5 * gt_depth
         dz = (gt_center_z - center_z) / depth
         dd = torch.log(gt_depth / depth)
         result = torch.stack([dy, dx, dz, dh, dw, dd], dim=1)
 
     return result
 
 
 
 def unmold_mask_2D(mask, bbox, image_shape):
     """Converts a mask generated by the neural network into a format similar
     to it's original shape.
     mask: [height, width] of type float. A small, typically 28x28 mask.
     bbox: [y1, x1, y2, x2]. The box to fit the mask in.
 
     Returns a binary mask with the same size as the original image.
     """
     y1, x1, y2, x2 = bbox
     out_zoom = [y2 - y1, x2 - x1]
     zoom_factor = [i / j for i, j in zip(out_zoom, mask.shape)]
     mask = scipy.ndimage.zoom(mask, zoom_factor, order=1).astype(np.float32)
 
     # Put the mask in the right location.
     full_mask = np.zeros(image_shape[:2])
     full_mask[y1:y2, x1:x2] = mask
     return full_mask
 
 
 
 def unmold_mask_3D(mask, bbox, image_shape):
     """Converts a mask generated by the neural network into a format similar
     to it's original shape.
     mask: [height, width] of type float. A small, typically 28x28 mask.
     bbox: [y1, x1, y2, x2, z1, z2]. The box to fit the mask in.
 
     Returns a binary mask with the same size as the original image.
     """
     y1, x1, y2, x2, z1, z2 = bbox
     out_zoom = [y2 - y1, x2 - x1, z2 - z1]
     zoom_factor = [i/j for i,j in zip(out_zoom, mask.shape)]
     mask = scipy.ndimage.zoom(mask, zoom_factor, order=1).astype(np.float32)
 
     # Put the mask in the right location.
     full_mask = np.zeros(image_shape[:3])
     full_mask[y1:y2, x1:x2, z1:z2] = mask
     return full_mask
 
 
 ############################################################
 #  Anchors
 ############################################################
 
 def generate_anchors(scales, ratios, shape, feature_stride, anchor_stride):
     """
     scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
     ratios: 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
     shape: [height, width] spatial shape of the feature map over which
             to generate anchors.
     feature_stride: Stride of the feature map relative to the image in pixels.
     anchor_stride: Stride of anchors on the feature map. For example, if the
         value is 2 then generate anchors for every other feature map pixel.
     """
     # Get all combinations of scales and ratios
     scales, ratios = np.meshgrid(np.array(scales), np.array(ratios))
     scales = scales.flatten()
     ratios = ratios.flatten()
 
     # Enumerate heights and widths from scales and ratios
     heights = scales / np.sqrt(ratios)
     widths = scales * np.sqrt(ratios)
 
     # Enumerate shifts in feature space
     shifts_y = np.arange(0, shape[0], anchor_stride) * feature_stride
     shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride
     shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)
 
     # Enumerate combinations of shifts, widths, and heights
     box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
     box_heights, box_centers_y = np.meshgrid(heights, shifts_y)
 
     # Reshape to get a list of (y, x) and a list of (h, w)
     box_centers = np.stack(
         [box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
     box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])
 
     # Convert to corner coordinates (y1, x1, y2, x2)
     boxes = np.concatenate([box_centers - 0.5 * box_sizes,
                             box_centers + 0.5 * box_sizes], axis=1)
     return boxes
 
 
 
 def generate_anchors_3D(scales_xy, scales_z, ratios, shape, feature_stride_xy, feature_stride_z, anchor_stride):
     """
     scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
     ratios: 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
     shape: [height, width] spatial shape of the feature map over which
             to generate anchors.
     feature_stride: Stride of the feature map relative to the image in pixels.
     anchor_stride: Stride of anchors on the feature map. For example, if the
         value is 2 then generate anchors for every other feature map pixel.
     """
     # Get all combinations of scales and ratios
 
     scales_xy, ratios_meshed = np.meshgrid(np.array(scales_xy), np.array(ratios))
     scales_xy = scales_xy.flatten()
     ratios_meshed = ratios_meshed.flatten()
 
     # Enumerate heights and widths from scales and ratios
     heights = scales_xy / np.sqrt(ratios_meshed)
     widths = scales_xy * np.sqrt(ratios_meshed)
     depths = np.tile(np.array(scales_z), len(ratios_meshed)//np.array(scales_z)[..., None].shape[0])
 
     # Enumerate shifts in feature space
     shifts_y = np.arange(0, shape[0], anchor_stride) * feature_stride_xy #translate from fm positions to input coords.
     shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride_xy
     shifts_z = np.arange(0, shape[2], anchor_stride) * (feature_stride_z)
     shifts_x, shifts_y, shifts_z = np.meshgrid(shifts_x, shifts_y, shifts_z)
 
     # Enumerate combinations of shifts, widths, and heights
     box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
     box_heights, box_centers_y = np.meshgrid(heights, shifts_y)
     box_depths, box_centers_z = np.meshgrid(depths, shifts_z)
 
     # Reshape to get a list of (y, x, z) and a list of (h, w, d)
     box_centers = np.stack(
         [box_centers_y, box_centers_x, box_centers_z], axis=2).reshape([-1, 3])
     box_sizes = np.stack([box_heights, box_widths, box_depths], axis=2).reshape([-1, 3])
 
     # Convert to corner coordinates (y1, x1, y2, x2, z1, z2)
     boxes = np.concatenate([box_centers - 0.5 * box_sizes,
                             box_centers + 0.5 * box_sizes], axis=1)
 
     boxes = np.transpose(np.array([boxes[:, 0], boxes[:, 1], boxes[:, 3], boxes[:, 4], boxes[:, 2], boxes[:, 5]]), axes=(1, 0))
     return boxes
 
 
 def generate_pyramid_anchors(logger, cf):
     """Generate anchors at different levels of a feature pyramid. Each scale
     is associated with a level of the pyramid, but each ratio is used in
     all levels of the pyramid.
 
     from configs:
     :param scales: cf.RPN_ANCHOR_SCALES , e.g. [4, 8, 16, 32]
     :param ratios: cf.RPN_ANCHOR_RATIOS , e.g. [0.5, 1, 2]
     :param feature_shapes: cf.BACKBONE_SHAPES , e.g.  [array of shapes per feature map] [80, 40, 20, 10, 5]
     :param feature_strides: cf.BACKBONE_STRIDES , e.g. [2, 4, 8, 16, 32, 64]
     :param anchors_stride: cf.RPN_ANCHOR_STRIDE , e.g. 1
     :return anchors: (N, (y1, x1, y2, x2, (z1), (z2)). All generated anchors in one array. Sorted
     with the same order of the given scales. So, anchors of scale[0] come first, then anchors of scale[1], and so on.
     """
     scales = cf.rpn_anchor_scales
     ratios = cf.rpn_anchor_ratios
     feature_shapes = cf.backbone_shapes
     anchor_stride = cf.rpn_anchor_stride
     pyramid_levels = cf.pyramid_levels
     feature_strides = cf.backbone_strides
 
     anchors = []
     logger.info("feature map shapes: {}".format(feature_shapes))
     logger.info("anchor scales: {}".format(scales))
 
     expected_anchors = [np.prod(feature_shapes[ii]) * len(ratios) * len(scales['xy'][ii]) for ii in pyramid_levels]
 
     for lix, level in enumerate(pyramid_levels):
         if len(feature_shapes[level]) == 2:
             anchors.append(generate_anchors(scales['xy'][level], ratios, feature_shapes[level],
                                             feature_strides['xy'][level], anchor_stride))
         else:
             anchors.append(generate_anchors_3D(scales['xy'][level], scales['z'][level], ratios, feature_shapes[level],
                                             feature_strides['xy'][level], feature_strides['z'][level], anchor_stride))
 
         logger.info("level {}: built anchors {} / expected anchors {} ||| total build {} / total expected {}".format(
             level, anchors[-1].shape, expected_anchors[lix], np.concatenate(anchors).shape, np.sum(expected_anchors)))
 
     out_anchors = np.concatenate(anchors, axis=0)
     return out_anchors
 
 
 
 def apply_box_deltas_2D(boxes, deltas):
     """Applies the given deltas to the given boxes.
     boxes: [N, 4] where each row is y1, x1, y2, x2
     deltas: [N, 4] where each row is [dy, dx, log(dh), log(dw)]
     """
     # Convert to y, x, h, w
     height = boxes[:, 2] - boxes[:, 0]
     width = boxes[:, 3] - boxes[:, 1]
     center_y = boxes[:, 0] + 0.5 * height
     center_x = boxes[:, 1] + 0.5 * width
     # Apply deltas
     center_y += deltas[:, 0] * height
     center_x += deltas[:, 1] * width
     height *= torch.exp(deltas[:, 2])
     width *= torch.exp(deltas[:, 3])
     # Convert back to y1, x1, y2, x2
     y1 = center_y - 0.5 * height
     x1 = center_x - 0.5 * width
     y2 = y1 + height
     x2 = x1 + width
     result = torch.stack([y1, x1, y2, x2], dim=1)
     return result
 
 
 
 def apply_box_deltas_3D(boxes, deltas):
     """Applies the given deltas to the given boxes.
     boxes: [N, 6] where each row is y1, x1, y2, x2, z1, z2
     deltas: [N, 6] where each row is [dy, dx, dz, log(dh), log(dw), log(dd)]
     """
     # Convert to y, x, h, w
     height = boxes[:, 2] - boxes[:, 0]
     width = boxes[:, 3] - boxes[:, 1]
     depth = boxes[:, 5] - boxes[:, 4]
     center_y = boxes[:, 0] + 0.5 * height
     center_x = boxes[:, 1] + 0.5 * width
     center_z = boxes[:, 4] + 0.5 * depth
     # Apply deltas
     center_y += deltas[:, 0] * height
     center_x += deltas[:, 1] * width
     center_z += deltas[:, 2] * depth
     height *= torch.exp(deltas[:, 3])
     width *= torch.exp(deltas[:, 4])
     depth *= torch.exp(deltas[:, 5])
     # Convert back to y1, x1, y2, x2
     y1 = center_y - 0.5 * height
     x1 = center_x - 0.5 * width
     z1 = center_z - 0.5 * depth
     y2 = y1 + height
     x2 = x1 + width
     z2 = z1 + depth
     result = torch.stack([y1, x1, y2, x2, z1, z2], dim=1)
     return result
 
 
 
 def clip_boxes_2D(boxes, window):
     """
     boxes: [N, 4] each col is y1, x1, y2, x2
     window: [4] in the form y1, x1, y2, x2
     """
     boxes = torch.stack( \
         [boxes[:, 0].clamp(float(window[0]), float(window[2])),
          boxes[:, 1].clamp(float(window[1]), float(window[3])),
          boxes[:, 2].clamp(float(window[0]), float(window[2])),
          boxes[:, 3].clamp(float(window[1]), float(window[3]))], 1)
     return boxes
 
 def clip_boxes_3D(boxes, window):
     """
     boxes: [N, 6] each col is y1, x1, y2, x2, z1, z2
     window: [6] in the form y1, x1, y2, x2, z1, z2
     """
     boxes = torch.stack( \
         [boxes[:, 0].clamp(float(window[0]), float(window[2])),
          boxes[:, 1].clamp(float(window[1]), float(window[3])),
          boxes[:, 2].clamp(float(window[0]), float(window[2])),
          boxes[:, 3].clamp(float(window[1]), float(window[3])),
          boxes[:, 4].clamp(float(window[4]), float(window[5])),
          boxes[:, 5].clamp(float(window[4]), float(window[5]))], 1)
     return boxes
 
 
 
 def clip_boxes_numpy(boxes, window):
     """
     boxes: [N, 4] each col is y1, x1, y2, x2 / [N, 6] in 3D.
     window: iamge shape (y, x, (z))
     """
     if boxes.shape[1] == 4:
         boxes = np.concatenate(
             (np.clip(boxes[:, 0], 0, window[0])[:, None],
             np.clip(boxes[:, 1], 0, window[0])[:, None],
             np.clip(boxes[:, 2], 0, window[1])[:, None],
             np.clip(boxes[:, 3], 0, window[1])[:, None]), 1
         )
 
     else:
         boxes = np.concatenate(
             (np.clip(boxes[:, 0], 0, window[0])[:, None],
              np.clip(boxes[:, 1], 0, window[0])[:, None],
              np.clip(boxes[:, 2], 0, window[1])[:, None],
              np.clip(boxes[:, 3], 0, window[1])[:, None],
              np.clip(boxes[:, 4], 0, window[2])[:, None],
              np.clip(boxes[:, 5], 0, window[2])[:, None]), 1
         )
 
     return boxes
 
 
 
 def bbox_overlaps_2D(boxes1, boxes2):
     """Computes IoU overlaps between two sets of boxes.
     boxes1, boxes2: [N, (y1, x1, y2, x2)].
     """
     # 1. Tile boxes2 and repeate boxes1. This allows us to compare
     # every boxes1 against every boxes2 without loops.
     # TF doesn't have an equivalent to np.repeate() so simulate it
     # using tf.tile() and tf.reshape.
     boxes1_repeat = boxes2.size()[0]
     boxes2_repeat = boxes1.size()[0]
     boxes1 = boxes1.repeat(1,boxes1_repeat).view(-1,4)
     boxes2 = boxes2.repeat(boxes2_repeat,1)
 
     # 2. Compute intersections
     b1_y1, b1_x1, b1_y2, b1_x2 = boxes1.chunk(4, dim=1)
     b2_y1, b2_x1, b2_y2, b2_x2 = boxes2.chunk(4, dim=1)
     y1 = torch.max(b1_y1, b2_y1)[:, 0]
     x1 = torch.max(b1_x1, b2_x1)[:, 0]
     y2 = torch.min(b1_y2, b2_y2)[:, 0]
     x2 = torch.min(b1_x2, b2_x2)[:, 0]
     zeros = Variable(torch.zeros(y1.size()[0]), requires_grad=False)
     if y1.is_cuda:
         zeros = zeros.cuda()
     intersection = torch.max(x2 - x1, zeros) * torch.max(y2 - y1, zeros)
 
     # 3. Compute unions
     b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)
     b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)
     union = b1_area[:,0] + b2_area[:,0] - intersection
 
     # 4. Compute IoU and reshape to [boxes1, boxes2]
     iou = intersection / union
     overlaps = iou.view(boxes2_repeat, boxes1_repeat)
     return overlaps
 
 
 
 def bbox_overlaps_3D(boxes1, boxes2):
     """Computes IoU overlaps between two sets of boxes.
     boxes1, boxes2: [N, (y1, x1, y2, x2, z1, z2)].
     """
     # 1. Tile boxes2 and repeate boxes1. This allows us to compare
     # every boxes1 against every boxes2 without loops.
     # TF doesn't have an equivalent to np.repeate() so simulate it
     # using tf.tile() and tf.reshape.
     boxes1_repeat = boxes2.size()[0]
     boxes2_repeat = boxes1.size()[0]
     boxes1 = boxes1.repeat(1,boxes1_repeat).view(-1,6)
     boxes2 = boxes2.repeat(boxes2_repeat,1)
 
     # 2. Compute intersections
     b1_y1, b1_x1, b1_y2, b1_x2, b1_z1, b1_z2 = boxes1.chunk(6, dim=1)
     b2_y1, b2_x1, b2_y2, b2_x2, b2_z1, b2_z2 = boxes2.chunk(6, dim=1)
     y1 = torch.max(b1_y1, b2_y1)[:, 0]
     x1 = torch.max(b1_x1, b2_x1)[:, 0]
     y2 = torch.min(b1_y2, b2_y2)[:, 0]
     x2 = torch.min(b1_x2, b2_x2)[:, 0]
     z1 = torch.max(b1_z1, b2_z1)[:, 0]
     z2 = torch.min(b1_z2, b2_z2)[:, 0]
     zeros = Variable(torch.zeros(y1.size()[0]), requires_grad=False)
     if y1.is_cuda:
         zeros = zeros.cuda()
     intersection = torch.max(x2 - x1, zeros) * torch.max(y2 - y1, zeros) * torch.max(z2 - z1, zeros)
 
     # 3. Compute unions
     b1_volume = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)  * (b1_z2 - b1_z1)
     b2_volume = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)  * (b2_z2 - b2_z1)
     union = b1_volume[:,0] + b2_volume[:,0] - intersection
 
     # 4. Compute IoU and reshape to [boxes1, boxes2]
     iou = intersection / union
     overlaps = iou.view(boxes2_repeat, boxes1_repeat)
     return overlaps
 
 
 
 def gt_anchor_matching(cf, anchors, gt_boxes, gt_class_ids=None):
     """Given the anchors and GT boxes, compute overlaps and identify positive
     anchors and deltas to refine them to match their corresponding GT boxes.
 
     anchors: [num_anchors, (y1, x1, y2, x2, (z1), (z2))]
     gt_boxes: [num_gt_boxes, (y1, x1, y2, x2, (z1), (z2))]
     gt_class_ids (optional): [num_gt_boxes] Integer class IDs for one stage detectors. in RPN case of Mask R-CNN,
     set all positive matches to 1 (foreground)
 
     Returns:
     anchor_class_matches: [N] (int32) matches between anchors and GT boxes.
                1 = positive anchor, -1 = negative anchor, 0 = neutral.
                In case of one stage detectors like RetinaNet/RetinaUNet this flag takes
                class_ids as positive anchor values, i.e. values >= 1!
     anchor_delta_targets: [N, (dy, dx, (dz), log(dh), log(dw), (log(dd)))] Anchor bbox deltas.
     """
 
     anchor_class_matches = np.zeros([anchors.shape[0]], dtype=np.int32)
     anchor_delta_targets = np.zeros((cf.rpn_train_anchors_per_image, 2*cf.dim))
     anchor_matching_iou = cf.anchor_matching_iou
 
     if gt_boxes is None:
         anchor_class_matches = np.full(anchor_class_matches.shape, fill_value=-1)
         return anchor_class_matches, anchor_delta_targets
 
     # for mrcnn: anchor matching is done for RPN loss, so positive labels are all 1 (foreground)
     if gt_class_ids is None:
         gt_class_ids = np.array([1] * len(gt_boxes))
 
     # Compute overlaps [num_anchors, num_gt_boxes]
     overlaps = compute_overlaps(anchors, gt_boxes)
 
     # Match anchors to GT Boxes
     # If an anchor overlaps a GT box with IoU >= anchor_matching_iou then it's positive.
     # If an anchor overlaps a GT box with IoU < 0.1 then it's negative.
     # Neutral anchors are those that don't match the conditions above,
     # and they don't influence the loss function.
     # However, don't keep any GT box unmatched (rare, but happens). Instead,
     # match it to the closest anchor (even if its max IoU is < 0.1).
 
     # 1. Set negative anchors first. They get overwritten below if a GT box is
     # matched to them. Skip boxes in crowd areas.
     anchor_iou_argmax = np.argmax(overlaps, axis=1)
     anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]
     if anchors.shape[1] == 4:
         anchor_class_matches[(anchor_iou_max < 0.1)] = -1
     elif anchors.shape[1] == 6:
         anchor_class_matches[(anchor_iou_max < 0.01)] = -1
     else:
         raise ValueError('anchor shape wrong {}'.format(anchors.shape))
 
     # 2. Set an anchor for each GT box (regardless of IoU value).
     gt_iou_argmax = np.argmax(overlaps, axis=0)
     for ix, ii in enumerate(gt_iou_argmax):
         anchor_class_matches[ii] = gt_class_ids[ix]
 
     # 3. Set anchors with high overlap as positive.
     above_trhesh_ixs = np.argwhere(anchor_iou_max >= anchor_matching_iou)
     anchor_class_matches[above_trhesh_ixs] = gt_class_ids[anchor_iou_argmax[above_trhesh_ixs]]
 
     # Subsample to balance positive anchors.
     ids = np.where(anchor_class_matches > 0)[0]
     extra = len(ids) - (cf.rpn_train_anchors_per_image // 2)
     if extra > 0:
         # Reset the extra ones to neutral
         ids = np.random.choice(ids, extra, replace=False)
         anchor_class_matches[ids] = 0
 
     # Leave all negative proposals negative now and sample from them in online hard example mining.
     # For positive anchors, compute shift and scale needed to transform them to match the corresponding GT boxes.
     ids = np.where(anchor_class_matches > 0)[0]
     ix = 0  # index into anchor_delta_targets
     for i, a in zip(ids, anchors[ids]):
         # closest gt box (it might have IoU < anchor_matching_iou)
         gt = gt_boxes[anchor_iou_argmax[i]]
 
         # convert coordinates to center plus width/height.
         gt_h = gt[2] - gt[0]
         gt_w = gt[3] - gt[1]
         gt_center_y = gt[0] + 0.5 * gt_h
         gt_center_x = gt[1] + 0.5 * gt_w
         # Anchor
         a_h = a[2] - a[0]
         a_w = a[3] - a[1]
         a_center_y = a[0] + 0.5 * a_h
         a_center_x = a[1] + 0.5 * a_w
 
         if cf.dim == 2:
             anchor_delta_targets[ix] = [
                 (gt_center_y - a_center_y) / a_h,
                 (gt_center_x - a_center_x) / a_w,
                 np.log(gt_h / a_h),
                 np.log(gt_w / a_w),
             ]
 
         else:
             gt_d = gt[5] - gt[4]
             gt_center_z = gt[4] + 0.5 * gt_d
             a_d = a[5] - a[4]
             a_center_z = a[4] + 0.5 * a_d
 
             anchor_delta_targets[ix] = [
                 (gt_center_y - a_center_y) / a_h,
                 (gt_center_x - a_center_x) / a_w,
                 (gt_center_z - a_center_z) / a_d,
                 np.log(gt_h / a_h),
                 np.log(gt_w / a_w),
                 np.log(gt_d / a_d)
             ]
 
         # normalize.
         anchor_delta_targets[ix] /= cf.rpn_bbox_std_dev
         ix += 1
 
     return anchor_class_matches, anchor_delta_targets
 
 
 
 def clip_to_window(window, boxes):
     """
         window: (y1, x1, y2, x2) / 3D: (z1, z2). The window in the image we want to clip to.
         boxes: [N, (y1, x1, y2, x2)]  / 3D: (z1, z2)
     """
     boxes[:, 0] = boxes[:, 0].clamp(float(window[0]), float(window[2]))
     boxes[:, 1] = boxes[:, 1].clamp(float(window[1]), float(window[3]))
     boxes[:, 2] = boxes[:, 2].clamp(float(window[0]), float(window[2]))
     boxes[:, 3] = boxes[:, 3].clamp(float(window[1]), float(window[3]))
 
     if boxes.shape[1] > 5:
         boxes[:, 4] = boxes[:, 4].clamp(float(window[4]), float(window[5]))
         boxes[:, 5] = boxes[:, 5].clamp(float(window[4]), float(window[5]))
 
     return boxes
 
 
 ############################################################
 #  Pytorch Utility Functions
 ############################################################
 
 
 def unique1d(tensor):
     if tensor.size()[0] == 0 or tensor.size()[0] == 1:
         return tensor
     tensor = tensor.sort()[0]
     unique_bool = tensor[1:] != tensor [:-1]
-    first_element = Variable(torch.ByteTensor([True]), requires_grad=False)
+    first_element = torch.tensor([True], dtype=torch.bool, requires_grad=False)
     if tensor.is_cuda:
         first_element = first_element.cuda()
     unique_bool = torch.cat((first_element, unique_bool),dim=0)
     return tensor[unique_bool.data]
 
 
 
 def log2(x):
     """Implementatin of Log2. Pytorch doesn't have a native implemenation."""
     ln2 = Variable(torch.log(torch.FloatTensor([2.0])), requires_grad=False)
     if x.is_cuda:
         ln2 = ln2.cuda()
     return torch.log(x) / ln2
 
 
 
 def intersect1d(tensor1, tensor2):
     aux = torch.cat((tensor1, tensor2), dim=0)
     aux = aux.sort(descending=True)[0]
     return aux[:-1][(aux[1:] == aux[:-1]).data]
 
 
 
 def shem(roi_probs_neg, negative_count, ohem_poolsize):
     """
     stochastic hard example mining: from a list of indices (referring to non-matched predictions),
     determine a pool of highest scoring (worst false positives) of size negative_count*ohem_poolsize.
     Then, sample n (= negative_count) predictions of this pool as negative examples for loss.
     :param roi_probs_neg: tensor of shape (n_predictions, n_classes).
     :param negative_count: int.
     :param ohem_poolsize: int.
     :return: (negative_count).  indices refer to the positions in roi_probs_neg. If pool smaller than expected due to
     limited negative proposals availabel, this function will return sampled indices of number < negative_count without
     throwing an error.
     """
     # sort according to higehst foreground score.
     probs, order = roi_probs_neg[:, 1:].max(1)[0].sort(descending=True)
     select = torch.tensor((ohem_poolsize * int(negative_count), order.size()[0])).min().int()
     pool_indices = order[:select]
     rand_idx = torch.randperm(pool_indices.size()[0])
     return pool_indices[rand_idx[:negative_count].cuda()]
 
 
 
 def initialize_weights(net):
     """
    Initialize model weights. Current Default in Pytorch (version 0.4.1) is initialization from a uniform distriubtion.
    Will expectably be changed to kaiming_uniform in future versions.
    """
     init_type = net.cf.weight_init
 
     for m in [module for module in net.modules() if type(module) in [nn.Conv2d, nn.Conv3d,
                                                                      nn.ConvTranspose2d,
                                                                      nn.ConvTranspose3d,
                                                                      nn.Linear]]:
         if init_type == 'xavier_uniform':
             nn.init.xavier_uniform_(m.weight.data)
             if m.bias is not None:
                 m.bias.data.zero_()
 
         elif init_type == 'xavier_normal':
             nn.init.xavier_normal_(m.weight.data)
             if m.bias is not None:
                 m.bias.data.zero_()
 
         elif init_type == "kaiming_uniform":
             nn.init.kaiming_uniform_(m.weight.data, mode='fan_out', nonlinearity=net.cf.relu, a=0)
             if m.bias is not None:
                 fan_in, fan_out = nn.init._calculate_fan_in_and_fan_out(m.weight.data)
                 bound = 1 / np.sqrt(fan_out)
                 nn.init.uniform_(m.bias, -bound, bound)
 
         elif init_type == "kaiming_normal":
             nn.init.kaiming_normal_(m.weight.data, mode='fan_out', nonlinearity=net.cf.relu, a=0)
             if m.bias is not None:
                 fan_in, fan_out = nn.init._calculate_fan_in_and_fan_out(m.weight.data)
                 bound = 1 / np.sqrt(fan_out)
                 nn.init.normal_(m.bias, -bound, bound)
 
 
 
 class NDConvGenerator(object):
     """
     generic wrapper around conv-layers to avoid 2D vs. 3D distinguishing in code.
     """
     def __init__(self, dim):
         self.dim = dim
 
     def __call__(self, c_in, c_out, ks, pad=0, stride=1, norm=None, relu='relu'):
         """
         :param c_in: number of in_channels.
         :param c_out: number of out_channels.
         :param ks: kernel size.
         :param pad: pad size.
         :param stride: kernel stride.
         :param norm: string specifying type of feature map normalization. If None, no normalization is applied.
         :param relu: string specifying type of nonlinearity. If None, no nonlinearity is applied.
         :return: convolved feature_map.
         """
         if self.dim == 2:
             conv = nn.Conv2d(c_in, c_out, kernel_size=ks, padding=pad, stride=stride)
             if norm is not None:
                 if norm == 'instance_norm':
                     norm_layer = nn.InstanceNorm2d(c_out)
                 elif norm == 'batch_norm':
                     norm_layer = nn.BatchNorm2d(c_out)
                 else:
                     raise ValueError('norm type as specified in configs is not implemented...')
                 conv = nn.Sequential(conv, norm_layer)
 
         else:
             conv = nn.Conv3d(c_in, c_out, kernel_size=ks, padding=pad, stride=stride)
             if norm is not None:
                 if norm == 'instance_norm':
                     norm_layer = nn.InstanceNorm3d(c_out)
                 elif norm == 'batch_norm':
                     norm_layer = nn.BatchNorm3d(c_out)
                 else:
                     raise ValueError('norm type as specified in configs is not implemented... {}'.format(norm))
                 conv = nn.Sequential(conv, norm_layer)
 
         if relu is not None:
             if relu == 'relu':
                 relu_layer = nn.ReLU(inplace=True)
             elif relu == 'leaky_relu':
                 relu_layer = nn.LeakyReLU(inplace=True)
             else:
                 raise ValueError('relu type as specified in configs is not implemented...')
             conv = nn.Sequential(conv, relu_layer)
 
         return conv
 
 
 
 def get_one_hot_encoding(y, n_classes):
     """
     transform a numpy label array to a one-hot array of the same shape.
     :param y: array of shape (b, 1, y, x, (z)).
     :param n_classes: int, number of classes to unfold in one-hot encoding.
     :return y_ohe: array of shape (b, n_classes, y, x, (z))
     """
     dim = len(y.shape) - 2
     if dim == 2:
         y_ohe = np.zeros((y.shape[0], n_classes, y.shape[2], y.shape[3])).astype('int32')
     if dim ==3:
         y_ohe = np.zeros((y.shape[0], n_classes, y.shape[2], y.shape[3], y.shape[4])).astype('int32')
     for cl in range(n_classes):
         y_ohe[:, cl][y[:, 0] == cl] = 1
     return y_ohe
 
 
 
 def get_dice_per_batch_and_class(pred, y, n_classes):
     '''
     computes dice scores per batch instance and class.
     :param pred: prediction array of shape (b, 1, y, x, (z)) (e.g. softmax prediction with argmax over dim 1)
     :param y: ground truth array of shape (b, 1, y, x, (z)) (contains int [0, ..., n_classes]
     :param n_classes: int
     :return: dice scores of shape (b, c)
     '''
     pred = get_one_hot_encoding(pred, n_classes)
     y = get_one_hot_encoding(y, n_classes)
     axes = tuple(range(2, len(pred.shape)))
     intersect = np.sum(pred*y, axis=axes)
     denominator = np.sum(pred, axis=axes)+np.sum(y, axis=axes) + 1e-8
     dice = 2.0*intersect / denominator
     return dice
 
 
 
 def sum_tensor(input, axes, keepdim=False):
     axes = np.unique(axes)
     if keepdim:
         for ax in axes:
             input = input.sum(ax, keepdim=True)
     else:
         for ax in sorted(axes, reverse=True):
             input = input.sum(int(ax))
     return input
 
 
 
 def batch_dice(pred, y, false_positive_weight=1.0, smooth=1e-6):
     '''
     compute soft dice over batch. this is a differentiable score and can be used as a loss function.
     only dice scores of foreground classes are returned, since training typically
     does not benefit from explicit background optimization. Pixels of the entire batch are considered a pseudo-volume to compute dice scores of.
     This way, single patches with missing foreground classes can not produce faulty gradients.
     :param pred: (b, c, y, x, (z)), softmax probabilities (network output). (c==classes)
     :param y: (b, c, y, x, (z)), one-hot-encoded segmentation mask.
     :param false_positive_weight: float [0,1]. For weighting of imbalanced classes,
     reduces the penalty for false-positive pixels. Can be beneficial sometimes in data with heavy fg/bg imbalances.
     :return: soft dice score (float). This function discards the background score and returns the mean of foreground scores.
     '''
     if len(pred.size()) == 4:
         axes = (0, 2, 3)
         intersect = sum_tensor(pred * y, axes, keepdim=False)
         denom = sum_tensor(false_positive_weight*pred + y, axes, keepdim=False)
         return torch.mean(( (2 * intersect + smooth) / (denom + smooth) )[1:]) # only fg dice here.
 
     elif len(pred.size()) == 5:
         axes = (0, 2, 3, 4)
         intersect = sum_tensor(pred * y, axes, keepdim=False)
         denom = sum_tensor(false_positive_weight*pred + y, axes, keepdim=False)
         return torch.mean(( (2*intersect + smooth) / (denom + smooth) )[1:]) # only fg dice here.
 
     else:
         raise ValueError('wrong input dimension in dice loss')
 
 
 
 
 def batch_dice_mask(pred, y, mask, false_positive_weight=1.0, smooth=1e-6):
     '''
     compute soft dice over batch. this is a diffrentiable score and can be used as a loss function.
     only dice scores of foreground classes are returned, since training typically
     does not benefit from explicit background optimization. Pixels of the entire batch are considered a pseudo-volume to compute dice scores of.
     This way, single patches with missing foreground classes can not produce faulty gradients.
     :param pred: (b, c, y, x, (z)), softmax probabilities (network output).
     :param y: (b, c, y, x, (z)), one hote encoded segmentation mask.
     :param false_positive_weight: float [0,1]. For weighting of imbalanced classes,
     reduces the penalty for false-positive pixels. Can be beneficial sometimes in data with heavy fg/bg imbalances.
     :return: soft dice score (float). This function discards the background score and returns the mean of foreground scores.
     '''
 
     mask = mask.unsqueeze(1).repeat(1, 2, 1, 1)
 
     if len(pred.size()) == 4:
         axes = (0, 2, 3)
         intersect = sum_tensor(pred * y * mask, axes, keepdim=False)
         denom = sum_tensor(false_positive_weight*pred * mask + y * mask, axes, keepdim=False)
         return torch.mean(( (2*intersect + smooth) / (denom + smooth))[1:]) # only fg dice here.
 
     elif len(pred.size()) == 5:
         axes = (0, 2, 3, 4)
         intersect = sum_tensor(pred * y, axes, keepdim=False)
         denom = sum_tensor(false_positive_weight*pred + y, axes, keepdim=False)
         return torch.mean(( (2*intersect + smooth) / (denom + smooth) )[1:]) # only fg dice here.
 
     else:
         raise ValueError('wrong input dimension in dice loss')
\ No newline at end of file