diff --git a/cuda_functions/nms_2D/__init__.py b/cuda_functions/nms_2D/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/cuda_functions/nms_2D/_ext/__init__.py b/cuda_functions/nms_2D/_ext/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/cuda_functions/nms_2D/_ext/nms/__init__.py b/cuda_functions/nms_2D/_ext/nms/__init__.py
deleted file mode 100644
index d71786f..0000000
--- a/cuda_functions/nms_2D/_ext/nms/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-
-from torch.utils.ffi import _wrap_function
-from ._nms import lib as _lib, ffi as _ffi
-
-__all__ = []
-def _import_symbols(locals):
-    for symbol in dir(_lib):
-        fn = getattr(_lib, symbol)
-        if callable(fn):
-            locals[symbol] = _wrap_function(fn, _ffi)
-        else:
-            locals[symbol] = fn
-        __all__.append(symbol)
-
-_import_symbols(locals())
diff --git a/cuda_functions/nms_2D/_ext/nms/_nms.so b/cuda_functions/nms_2D/_ext/nms/_nms.so
deleted file mode 100755
index 1856faf..0000000
Binary files a/cuda_functions/nms_2D/_ext/nms/_nms.so and /dev/null differ
diff --git a/cuda_functions/nms_2D/build.py b/cuda_functions/nms_2D/build.py
deleted file mode 100644
index 4d9a96b..0000000
--- a/cuda_functions/nms_2D/build.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import os
-import torch
-from torch.utils.ffi import create_extension
-
-
-sources = ['src/nms.c']
-headers = ['src/nms.h']
-defines = []
-with_cuda = False
-
-if torch.cuda.is_available():
-    print('Including CUDA code.')
-    sources += ['src/nms_cuda.c']
-    headers += ['src/nms_cuda.h']
-    defines += [('WITH_CUDA', None)]
-    with_cuda = True
-
-this_file = os.path.dirname(os.path.realpath(__file__))
-print(this_file)
-extra_objects = ['src/cuda/nms_kernel.cu.o']
-extra_objects = [os.path.join(this_file, fname) for fname in extra_objects]
-
-ffi = create_extension(
-    '_ext.nms',
-    headers=headers,
-    sources=sources,
-    define_macros=defines,
-    relative_to=__file__,
-    with_cuda=with_cuda,
-    extra_objects=extra_objects
-)
-
-if __name__ == '__main__':
-    ffi.build()
diff --git a/cuda_functions/nms_2D/pth_nms.py b/cuda_functions/nms_2D/pth_nms.py
deleted file mode 100644
index bfdc29a..0000000
--- a/cuda_functions/nms_2D/pth_nms.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import torch
-from ._ext import nms
-
-
-def nms_gpu(dets, thresh):
-  """
-  dets has to be a tensor
-  """
-
-  scores = dets[:, 4]
-  order = scores.sort(0, descending=True)[1]
-  dets = dets[order].contiguous()
-
-  keep = torch.LongTensor(dets.size(0))
-  num_out = torch.LongTensor(1)
-  nms.gpu_nms(keep, num_out, dets, thresh)
-  return order[keep[:num_out[0]].cuda()].contiguous()
-
-
-
-def nms_cpu(dets, thresh):
-
-  dets = dets.cpu()
-  x1 = dets[:, 0]
-  y1 = dets[:, 1]
-  x2 = dets[:, 2]
-  y2 = dets[:, 3]
-  scores = dets[:, 4]
-
-  areas = (x2 - x1 + 1) * (y2 - y1 + 1)
-  order = scores.sort(0, descending=True)[1]
-  # order = torch.from_numpy(np.ascontiguousarray(scores.numpy().argsort()[::-1])).long()
-
-  keep = torch.LongTensor(dets.size(0))
-  num_out = torch.LongTensor(1)
-  nms.cpu_nms(keep, num_out, dets, order, areas, thresh)
-
-  return keep[:num_out[0]]
-
diff --git a/cuda_functions/nms_2D/src/cuda/nms_kernel.cu b/cuda_functions/nms_2D/src/cuda/nms_kernel.cu
deleted file mode 100644
index 1174f22..0000000
--- a/cuda_functions/nms_2D/src/cuda/nms_kernel.cu
+++ /dev/null
@@ -1,87 +0,0 @@
-// ------------------------------------------------------------------
-// Faster R-CNN
-// Copyright (c) 2015 Microsoft
-// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
-// Written by Shaoqing Ren
-// ------------------------------------------------------------------
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <math.h>
-#include <stdio.h>
-#include <float.h>
-#include "nms_kernel.h"
-
-__device__ inline float devIoU(float const * const a, float const * const b) {
-  float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
-  float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
-  float width = fmaxf(right - left + 1, 0.f), height = fmaxf(bottom - top + 1, 0.f);
-  float interS = width * height;
-  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
-  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
-  return interS / (Sa + Sb - interS);
-}
-
-__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
-                           const float *dev_boxes, unsigned long long *dev_mask) {
-  const int row_start = blockIdx.y;
-  const int col_start = blockIdx.x;
-
-  // if (row_start > col_start) return;
-
-  const int row_size =
-        fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
-  const int col_size =
-        fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
-
-  __shared__ float block_boxes[threadsPerBlock * 5];
-  if (threadIdx.x < col_size) {
-    block_boxes[threadIdx.x * 5 + 0] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
-    block_boxes[threadIdx.x * 5 + 1] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
-    block_boxes[threadIdx.x * 5 + 2] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
-    block_boxes[threadIdx.x * 5 + 3] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
-    block_boxes[threadIdx.x * 5 + 4] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
-  }
-  __syncthreads();
-
-  if (threadIdx.x < row_size) {
-    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
-    const float *cur_box = dev_boxes + cur_box_idx * 5;
-    int i = 0;
-    unsigned long long t = 0;
-    int start = 0;
-    if (row_start == col_start) {
-      start = threadIdx.x + 1;
-    }
-    for (i = start; i < col_size; i++) {
-      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
-        t |= 1ULL << i;
-      }
-    }
-    const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
-    dev_mask[cur_box_idx * col_blocks + col_start] = t;
-  }
-}
-
-
-void _nms(int boxes_num, float * boxes_dev,
-          unsigned long long * mask_dev, float nms_overlap_thresh) {
-
-  dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
-              DIVUP(boxes_num, threadsPerBlock));
-  dim3 threads(threadsPerBlock);
-  nms_kernel<<<blocks, threads>>>(boxes_num,
-                                  nms_overlap_thresh,
-                                  boxes_dev,
-                                  mask_dev);
-}
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/cuda_functions/nms_2D/src/cuda/nms_kernel.cu.o b/cuda_functions/nms_2D/src/cuda/nms_kernel.cu.o
deleted file mode 100644
index 00135bf..0000000
Binary files a/cuda_functions/nms_2D/src/cuda/nms_kernel.cu.o and /dev/null differ
diff --git a/cuda_functions/nms_2D/src/cuda/nms_kernel.h b/cuda_functions/nms_2D/src/cuda/nms_kernel.h
deleted file mode 100644
index 2f40582..0000000
--- a/cuda_functions/nms_2D/src/cuda/nms_kernel.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef _NMS_KERNEL
-#define _NMS_KERNEL
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
-int const threadsPerBlock = sizeof(unsigned long long) * 8;
-
-void _nms(int boxes_num, float * boxes_dev,
-          unsigned long long * mask_dev, float nms_overlap_thresh);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
-
diff --git a/cuda_functions/nms_2D/src/nms.c b/cuda_functions/nms_2D/src/nms.c
deleted file mode 100644
index 4795cc1..0000000
--- a/cuda_functions/nms_2D/src/nms.c
+++ /dev/null
@@ -1,69 +0,0 @@
-#include <TH/TH.h>
-#include <math.h>
-
-int cpu_nms(THLongTensor * keep_out, THLongTensor * num_out, THFloatTensor * boxes, THLongTensor * order, THFloatTensor * areas, float nms_overlap_thresh) {
-    // boxes has to be sorted
-    THArgCheck(THLongTensor_isContiguous(keep_out), 0, "keep_out must be contiguous");
-    THArgCheck(THLongTensor_isContiguous(boxes), 2, "boxes must be contiguous");
-    THArgCheck(THLongTensor_isContiguous(order), 3, "order must be contiguous");
-    THArgCheck(THLongTensor_isContiguous(areas), 4, "areas must be contiguous");
-    // Number of ROIs
-    long boxes_num = THFloatTensor_size(boxes, 0);
-    long boxes_dim = THFloatTensor_size(boxes, 1);
-
-    long * keep_out_flat = THLongTensor_data(keep_out);
-    float * boxes_flat = THFloatTensor_data(boxes);
-    long * order_flat = THLongTensor_data(order);
-    float * areas_flat = THFloatTensor_data(areas);
-
-    THByteTensor* suppressed = THByteTensor_newWithSize1d(boxes_num);
-    THByteTensor_fill(suppressed, 0);
-    unsigned char * suppressed_flat =  THByteTensor_data(suppressed);
-
-    // nominal indices
-    int i, j;
-    // sorted indices
-    int _i, _j;
-    // temp variables for box i's (the box currently under consideration)
-    float ix1, iy1, ix2, iy2, iarea;
-    // variables for computing overlap with box j (lower scoring box)
-    float xx1, yy1, xx2, yy2;
-    float w, h;
-    float inter, ovr;
-
-    long num_to_keep = 0;
-    for (_i=0; _i < boxes_num; ++_i) {
-        i = order_flat[_i];
-        if (suppressed_flat[i] == 1) {
-            continue;
-        }
-        keep_out_flat[num_to_keep++] = i;
-        ix1 = boxes_flat[i * boxes_dim];
-        iy1 = boxes_flat[i * boxes_dim + 1];
-        ix2 = boxes_flat[i * boxes_dim + 2];
-        iy2 = boxes_flat[i * boxes_dim + 3];
-        iarea = areas_flat[i];
-        for (_j = _i + 1; _j < boxes_num; ++_j) {
-            j = order_flat[_j];
-            if (suppressed_flat[j] == 1) {
-                continue;
-            }
-            xx1 = fmaxf(ix1, boxes_flat[j * boxes_dim]);
-            yy1 = fmaxf(iy1, boxes_flat[j * boxes_dim + 1]);
-            xx2 = fminf(ix2, boxes_flat[j * boxes_dim + 2]);
-            yy2 = fminf(iy2, boxes_flat[j * boxes_dim + 3]);
-            w = fmaxf(0.0, xx2 - xx1 + 1);
-            h = fmaxf(0.0, yy2 - yy1 + 1);
-            inter = w * h;
-            ovr = inter / (iarea + areas_flat[j] - inter);
-            if (ovr >= nms_overlap_thresh) {
-                suppressed_flat[j] = 1;
-            }
-        }
-    }
-
-    long *num_out_flat = THLongTensor_data(num_out);
-    *num_out_flat = num_to_keep;
-    THByteTensor_free(suppressed);
-    return 1;
-}
\ No newline at end of file
diff --git a/cuda_functions/nms_2D/src/nms.h b/cuda_functions/nms_2D/src/nms.h
deleted file mode 100644
index 25ca0a3..0000000
--- a/cuda_functions/nms_2D/src/nms.h
+++ /dev/null
@@ -1 +0,0 @@
-int cpu_nms(THLongTensor * keep_out, THLongTensor * num_out, THFloatTensor * boxes, THLongTensor * order, THFloatTensor * areas, float nms_overlap_thresh);
\ No newline at end of file
diff --git a/cuda_functions/nms_2D/src/nms_cuda.c b/cuda_functions/nms_2D/src/nms_cuda.c
deleted file mode 100644
index 5a9a70f..0000000
--- a/cuda_functions/nms_2D/src/nms_cuda.c
+++ /dev/null
@@ -1,67 +0,0 @@
-// ------------------------------------------------------------------
-// Faster R-CNN
-// Copyright (c) 2015 Microsoft
-// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
-// Written by Shaoqing Ren
-// ------------------------------------------------------------------
-#include <THC/THC.h>
-#include <TH/TH.h>
-#include <math.h>
-#include <stdio.h>
-
-#include "cuda/nms_kernel.h"
-
-
-extern THCState *state;
-
-int gpu_nms(THLongTensor * keep, THLongTensor* num_out, THCudaTensor * boxes, float nms_overlap_thresh) {
-  // boxes has to be sorted
-  THArgCheck(THLongTensor_isContiguous(keep), 0, "boxes must be contiguous");
-  THArgCheck(THCudaTensor_isContiguous(state, boxes), 2, "boxes must be contiguous");
-  // Number of ROIs
-  int boxes_num = THCudaTensor_size(state, boxes, 0);
-  int boxes_dim = THCudaTensor_size(state, boxes, 1);
-
-  float* boxes_flat = THCudaTensor_data(state, boxes);
-
-  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
-  THCudaLongTensor * mask = THCudaLongTensor_newWithSize2d(state, boxes_num, col_blocks);
-  unsigned long long* mask_flat = THCudaLongTensor_data(state, mask);
-
-  _nms(boxes_num, boxes_flat, mask_flat, nms_overlap_thresh);
-
-  THLongTensor * mask_cpu = THLongTensor_newWithSize2d(boxes_num, col_blocks);
-  THLongTensor_copyCuda(state, mask_cpu, mask);
-  THCudaLongTensor_free(state, mask);
-
-  unsigned long long * mask_cpu_flat = THLongTensor_data(mask_cpu);
-
-  THLongTensor * remv_cpu = THLongTensor_newWithSize1d(col_blocks);
-  unsigned long long* remv_cpu_flat = THLongTensor_data(remv_cpu);
-  THLongTensor_fill(remv_cpu, 0);
-
-  long * keep_flat = THLongTensor_data(keep);
-  long num_to_keep = 0;
-
-  int i, j;
-  for (i = 0; i < boxes_num; i++) {
-    int nblock = i / threadsPerBlock;
-    int inblock = i % threadsPerBlock;
-
-    if (!(remv_cpu_flat[nblock] & (1ULL << inblock))) {
-      keep_flat[num_to_keep++] = i;
-      unsigned long long *p = &mask_cpu_flat[0] + i * col_blocks;
-      for (j = nblock; j < col_blocks; j++) {
-        remv_cpu_flat[j] |= p[j];
-      }
-    }
-  }
-
-  long * num_out_flat = THLongTensor_data(num_out);
-  * num_out_flat = num_to_keep;
-
-  THLongTensor_free(mask_cpu);
-  THLongTensor_free(remv_cpu);
-
-  return 1;
-}
diff --git a/cuda_functions/nms_2D/src/nms_cuda.h b/cuda_functions/nms_2D/src/nms_cuda.h
deleted file mode 100644
index 0826111..0000000
--- a/cuda_functions/nms_2D/src/nms_cuda.h
+++ /dev/null
@@ -1 +0,0 @@
-int gpu_nms(THLongTensor * keep_out, THLongTensor* num_out, THCudaTensor * boxes, float nms_overlap_thresh);
\ No newline at end of file
diff --git a/cuda_functions/nms_3D/__init__.py b/cuda_functions/nms_3D/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/cuda_functions/nms_3D/_ext/__init__.py b/cuda_functions/nms_3D/_ext/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/cuda_functions/nms_3D/_ext/nms/__init__.py b/cuda_functions/nms_3D/_ext/nms/__init__.py
deleted file mode 100644
index d71786f..0000000
--- a/cuda_functions/nms_3D/_ext/nms/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-
-from torch.utils.ffi import _wrap_function
-from ._nms import lib as _lib, ffi as _ffi
-
-__all__ = []
-def _import_symbols(locals):
-    for symbol in dir(_lib):
-        fn = getattr(_lib, symbol)
-        if callable(fn):
-            locals[symbol] = _wrap_function(fn, _ffi)
-        else:
-            locals[symbol] = fn
-        __all__.append(symbol)
-
-_import_symbols(locals())
diff --git a/cuda_functions/nms_3D/_ext/nms/_nms.so b/cuda_functions/nms_3D/_ext/nms/_nms.so
deleted file mode 100755
index c8498a0..0000000
Binary files a/cuda_functions/nms_3D/_ext/nms/_nms.so and /dev/null differ
diff --git a/cuda_functions/nms_3D/build.py b/cuda_functions/nms_3D/build.py
deleted file mode 100644
index 4d9a96b..0000000
--- a/cuda_functions/nms_3D/build.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import os
-import torch
-from torch.utils.ffi import create_extension
-
-
-sources = ['src/nms.c']
-headers = ['src/nms.h']
-defines = []
-with_cuda = False
-
-if torch.cuda.is_available():
-    print('Including CUDA code.')
-    sources += ['src/nms_cuda.c']
-    headers += ['src/nms_cuda.h']
-    defines += [('WITH_CUDA', None)]
-    with_cuda = True
-
-this_file = os.path.dirname(os.path.realpath(__file__))
-print(this_file)
-extra_objects = ['src/cuda/nms_kernel.cu.o']
-extra_objects = [os.path.join(this_file, fname) for fname in extra_objects]
-
-ffi = create_extension(
-    '_ext.nms',
-    headers=headers,
-    sources=sources,
-    define_macros=defines,
-    relative_to=__file__,
-    with_cuda=with_cuda,
-    extra_objects=extra_objects
-)
-
-if __name__ == '__main__':
-    ffi.build()
diff --git a/cuda_functions/nms_3D/pth_nms.py b/cuda_functions/nms_3D/pth_nms.py
deleted file mode 100644
index 3639b5b..0000000
--- a/cuda_functions/nms_3D/pth_nms.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import torch
-from ._ext import nms
-
-
-def nms_gpu(dets, thresh):
-  """
-  dets has to be a tensor
-  """
-
-  scores = dets[:, -1]
-  order = scores.sort(0, descending=True)[1]
-  dets = dets[order].contiguous()
-
-  keep = torch.LongTensor(dets.size(0))
-  num_out = torch.LongTensor(1)
-  nms.gpu_nms(keep, num_out, dets, thresh)
-  return order[keep[:num_out[0]].cuda()].contiguous()
-
-
-def nms_cpu(dets, thresh):
-
-  dets = dets.cpu()
-  x1 = dets[:, 0]
-  y1 = dets[:, 1]
-  x2 = dets[:, 2]
-  y2 = dets[:, 3]
-  z1 = dets[:, 4]
-  z2 = dets[:, 5]
-  scores = dets[:, 6]
-  areas = (x2 - x1 +1) * (y2 - y1 +1) * (z2 - z1 +1)
-  order = scores.sort(0, descending=True)[1]
-
-  keep = torch.LongTensor(dets.size(0))
-  num_out = torch.LongTensor(1)
-  nms.cpu_nms(keep, num_out, dets, order, areas, thresh)
-
-  return keep[:num_out[0]]
-
diff --git a/cuda_functions/nms_3D/src/cuda/nms_kernel.cu b/cuda_functions/nms_3D/src/cuda/nms_kernel.cu
deleted file mode 100644
index 5692de8..0000000
--- a/cuda_functions/nms_3D/src/cuda/nms_kernel.cu
+++ /dev/null
@@ -1,96 +0,0 @@
-// ------------------------------------------------------------------
-// Faster R-CNN
-// Copyright (c) 2015 Microsoft
-// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
-// Written by Shaoqing Ren
-// ------------------------------------------------------------------
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <math.h>
-#include <stdio.h>
-#include <float.h>
-#include "nms_kernel.h"
-
-__device__ inline float devIoU(float const * const a, float const * const b) {
-  float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
-  float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
-  float front = fmaxf(a[4], b[4]), back = fminf(a[5], b[5]);
-
-  float width = fmaxf(right - left + 1, 0.f), height = fmaxf(bottom - top + 1, 0.f), depth = fmaxf(back - front + 1, 0.f);
-  float interS = width * height * depth;
-  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1) * (a[5] - a[4] + 1);
-  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1) * (b[5] - b[4] + 1);
-  //printf("IoU 3D %f \n", interS / (Sa + Sb - interS));
-
-  return interS / (Sa + Sb - interS);
-}
-
-__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
-                           const float *dev_boxes, unsigned long long *dev_mask) {
-  const int row_start = blockIdx.y;
-  const int col_start = blockIdx.x;
-
-  // if (row_start > col_start) return;
-
-  const int row_size =
-        fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
-  const int col_size =
-        fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
-
-  __shared__ float block_boxes[threadsPerBlock * 7];
-  if (threadIdx.x < col_size) {
-    block_boxes[threadIdx.x * 7 + 0] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 7 + 0];
-    block_boxes[threadIdx.x * 7 + 1] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 7 + 1];
-    block_boxes[threadIdx.x * 7 + 2] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 7 + 2];
-    block_boxes[threadIdx.x * 7 + 3] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 7 + 3];
-    block_boxes[threadIdx.x * 7 + 4] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 7 + 4];
-    block_boxes[threadIdx.x * 7 + 5] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 7 + 5];
-    block_boxes[threadIdx.x * 7 + 6] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 7 + 6];
-  }
-  __syncthreads();
-
-  if (threadIdx.x < row_size) {
-    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
-    const float *cur_box = dev_boxes + cur_box_idx * 7;
-    int i = 0;
-    unsigned long long t = 0;
-    int start = 0;
-    if (row_start == col_start) {
-      start = threadIdx.x + 1;
-    }
-    for (i = start; i < col_size; i++) {
-      if (devIoU(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
-        t |= 1ULL << i;
-      }
-    }
-    const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
-    dev_mask[cur_box_idx * col_blocks + col_start] = t;
-  }
-}
-
-
-void _nms(int boxes_num, float * boxes_dev,
-          unsigned long long * mask_dev, float nms_overlap_thresh) {
-
-
-  dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
-              DIVUP(boxes_num, threadsPerBlock));
-  dim3 threads(threadsPerBlock);
-  nms_kernel<<<blocks, threads>>>(boxes_num,
-                                  nms_overlap_thresh,
-                                  boxes_dev,
-                                  mask_dev);
-}
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/cuda_functions/nms_3D/src/cuda/nms_kernel.cu.o b/cuda_functions/nms_3D/src/cuda/nms_kernel.cu.o
deleted file mode 100644
index ee3ed41..0000000
Binary files a/cuda_functions/nms_3D/src/cuda/nms_kernel.cu.o and /dev/null differ
diff --git a/cuda_functions/nms_3D/src/cuda/nms_kernel.h b/cuda_functions/nms_3D/src/cuda/nms_kernel.h
deleted file mode 100644
index 2f40582..0000000
--- a/cuda_functions/nms_3D/src/cuda/nms_kernel.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef _NMS_KERNEL
-#define _NMS_KERNEL
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
-int const threadsPerBlock = sizeof(unsigned long long) * 8;
-
-void _nms(int boxes_num, float * boxes_dev,
-          unsigned long long * mask_dev, float nms_overlap_thresh);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
-
diff --git a/cuda_functions/nms_3D/src/nms.c b/cuda_functions/nms_3D/src/nms.c
deleted file mode 100644
index dd64336..0000000
--- a/cuda_functions/nms_3D/src/nms.c
+++ /dev/null
@@ -1,74 +0,0 @@
-#include <TH/TH.h>
-#include <math.h>
-
-
-int cpu_nms(THLongTensor * keep_out, THLongTensor * num_out, THFloatTensor * boxes, THLongTensor * order, THFloatTensor * areas, float nms_overlap_thresh) {
-    // boxes has to be sorted
-    THArgCheck(THLongTensor_isContiguous(keep_out), 0, "keep_out must be contiguous");
-    THArgCheck(THLongTensor_isContiguous(boxes), 2, "boxes must be contiguous");
-    THArgCheck(THLongTensor_isContiguous(order), 3, "order must be contiguous");
-    THArgCheck(THLongTensor_isContiguous(areas), 4, "areas must be contiguous");
-    // Number of ROIs
-    long boxes_num = THFloatTensor_size(boxes, 0);
-    long boxes_dim = THFloatTensor_size(boxes, 1);
-
-    long * keep_out_flat = THLongTensor_data(keep_out);
-    float * boxes_flat = THFloatTensor_data(boxes);
-    long * order_flat = THLongTensor_data(order);
-    float * areas_flat = THFloatTensor_data(areas);
-
-    THByteTensor* suppressed = THByteTensor_newWithSize1d(boxes_num);
-    THByteTensor_fill(suppressed, 0);
-    unsigned char * suppressed_flat =  THByteTensor_data(suppressed);
-    // nominal indices
-    int i, j;
-    // sorted indices
-    int _i, _j;
-    // temp variables for box i's (the box currently under consideration)
-    float ix1, iy1, ix2, iy2, iz1, iz2, iarea;
-    // variables for computing overlap with box j (lower scoring box)
-    float xx1, yy1, xx2, yy2, zz1, zz2;
-    float w, h, d;
-    float inter, ovr;
-
-    long num_to_keep = 0;
-    for (_i=0; _i < boxes_num; ++_i) {
-        i = order_flat[_i]; // from sorted index to nominal index in boxes list.
-        if (suppressed_flat[i] == 1) { //maybe flag for later. overlapping boxes are surpressed.
-            continue;
-        }
-        keep_out_flat[num_to_keep++] = i; //num to keep is read and then increased. the box index i is saved in keep_out.
-        ix1 = boxes_flat[i * boxes_dim];
-        iy1 = boxes_flat[i * boxes_dim + 1];
-        ix2 = boxes_flat[i * boxes_dim + 2];
-        iy2 = boxes_flat[i * boxes_dim + 3];
-        iz1 = boxes_flat[i * boxes_dim + 4];
-        iz2 = boxes_flat[i * boxes_dim + 5];
-        iarea = areas_flat[i];
-        for (_j = _i + 1; _j < boxes_num; ++_j) {
-            j = order_flat[_j];
-            if (suppressed_flat[j] == 1) {
-                continue;
-            }
-            xx1 = fmaxf(ix1, boxes_flat[j * boxes_dim]);
-            yy1 = fmaxf(iy1, boxes_flat[j * boxes_dim + 1]);
-            xx2 = fminf(ix2, boxes_flat[j * boxes_dim + 2]);
-            yy2 = fminf(iy2, boxes_flat[j * boxes_dim + 3]);
-            zz1 = fmaxf(iz1, boxes_flat[j * boxes_dim + 4]);
-            zz2 = fminf(iz2, boxes_flat[j * boxes_dim + 5]);
-            w = fmaxf(0.0, xx2 - xx1 + 1);
-            h = fmaxf(0.0, yy2 - yy1 + 1);
-            d = fmaxf(0.0, zz2 - zz1 + 1);
-            inter = w * h * d;
-            ovr = inter / (iarea + areas_flat[j] - inter);
-            if (ovr >= nms_overlap_thresh) {
-                suppressed_flat[j] = 1; // can be surpressed because score j < score i (from order: _j = _i + 1 ...)
-            }
-        }
-    }
-
-    long *num_out_flat = THLongTensor_data(num_out);
-    *num_out_flat = num_to_keep;
-    THByteTensor_free(suppressed);
-    return 1;
-}
\ No newline at end of file
diff --git a/cuda_functions/nms_3D/src/nms.h b/cuda_functions/nms_3D/src/nms.h
deleted file mode 100644
index d17d9c9..0000000
--- a/cuda_functions/nms_3D/src/nms.h
+++ /dev/null
@@ -1 +0,0 @@
-int cpu_nms(THLongTensor * keep_out, THLongTensor * num_out, THFloatTensor * boxes, THLongTensor * order, THFloatTensor * areas, float nms_overlap_thresh);
diff --git a/cuda_functions/nms_3D/src/nms_cuda.c b/cuda_functions/nms_3D/src/nms_cuda.c
deleted file mode 100644
index 5a9a70f..0000000
--- a/cuda_functions/nms_3D/src/nms_cuda.c
+++ /dev/null
@@ -1,67 +0,0 @@
-// ------------------------------------------------------------------
-// Faster R-CNN
-// Copyright (c) 2015 Microsoft
-// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
-// Written by Shaoqing Ren
-// ------------------------------------------------------------------
-#include <THC/THC.h>
-#include <TH/TH.h>
-#include <math.h>
-#include <stdio.h>
-
-#include "cuda/nms_kernel.h"
-
-
-extern THCState *state;
-
-int gpu_nms(THLongTensor * keep, THLongTensor* num_out, THCudaTensor * boxes, float nms_overlap_thresh) {
-  // boxes has to be sorted
-  THArgCheck(THLongTensor_isContiguous(keep), 0, "boxes must be contiguous");
-  THArgCheck(THCudaTensor_isContiguous(state, boxes), 2, "boxes must be contiguous");
-  // Number of ROIs
-  int boxes_num = THCudaTensor_size(state, boxes, 0);
-  int boxes_dim = THCudaTensor_size(state, boxes, 1);
-
-  float* boxes_flat = THCudaTensor_data(state, boxes);
-
-  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
-  THCudaLongTensor * mask = THCudaLongTensor_newWithSize2d(state, boxes_num, col_blocks);
-  unsigned long long* mask_flat = THCudaLongTensor_data(state, mask);
-
-  _nms(boxes_num, boxes_flat, mask_flat, nms_overlap_thresh);
-
-  THLongTensor * mask_cpu = THLongTensor_newWithSize2d(boxes_num, col_blocks);
-  THLongTensor_copyCuda(state, mask_cpu, mask);
-  THCudaLongTensor_free(state, mask);
-
-  unsigned long long * mask_cpu_flat = THLongTensor_data(mask_cpu);
-
-  THLongTensor * remv_cpu = THLongTensor_newWithSize1d(col_blocks);
-  unsigned long long* remv_cpu_flat = THLongTensor_data(remv_cpu);
-  THLongTensor_fill(remv_cpu, 0);
-
-  long * keep_flat = THLongTensor_data(keep);
-  long num_to_keep = 0;
-
-  int i, j;
-  for (i = 0; i < boxes_num; i++) {
-    int nblock = i / threadsPerBlock;
-    int inblock = i % threadsPerBlock;
-
-    if (!(remv_cpu_flat[nblock] & (1ULL << inblock))) {
-      keep_flat[num_to_keep++] = i;
-      unsigned long long *p = &mask_cpu_flat[0] + i * col_blocks;
-      for (j = nblock; j < col_blocks; j++) {
-        remv_cpu_flat[j] |= p[j];
-      }
-    }
-  }
-
-  long * num_out_flat = THLongTensor_data(num_out);
-  * num_out_flat = num_to_keep;
-
-  THLongTensor_free(mask_cpu);
-  THLongTensor_free(remv_cpu);
-
-  return 1;
-}
diff --git a/cuda_functions/nms_3D/src/nms_cuda.h b/cuda_functions/nms_3D/src/nms_cuda.h
deleted file mode 100644
index 08bf147..0000000
--- a/cuda_functions/nms_3D/src/nms_cuda.h
+++ /dev/null
@@ -1 +0,0 @@
-int gpu_nms(THLongTensor * keep_out, THLongTensor* num_out, THCudaTensor * boxes, float nms_overlap_thresh);
diff --git a/cuda_functions/roi_align_2D/__init__.py b/cuda_functions/roi_align_2D/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/cuda_functions/roi_align_2D/roi_align/__init__.py b/cuda_functions/roi_align_2D/roi_align/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/cuda_functions/roi_align_2D/roi_align/_ext/__init__.py b/cuda_functions/roi_align_2D/roi_align/_ext/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/cuda_functions/roi_align_2D/roi_align/_ext/crop_and_resize/__init__.py b/cuda_functions/roi_align_2D/roi_align/_ext/crop_and_resize/__init__.py
deleted file mode 100644
index 4486c09..0000000
--- a/cuda_functions/roi_align_2D/roi_align/_ext/crop_and_resize/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-
-from torch.utils.ffi import _wrap_function
-from ._crop_and_resize import lib as _lib, ffi as _ffi
-
-__all__ = []
-def _import_symbols(locals):
-    for symbol in dir(_lib):
-        fn = getattr(_lib, symbol)
-        if callable(fn):
-            locals[symbol] = _wrap_function(fn, _ffi)
-        else:
-            locals[symbol] = fn
-        __all__.append(symbol)
-
-_import_symbols(locals())
diff --git a/cuda_functions/roi_align_2D/roi_align/_ext/crop_and_resize/_crop_and_resize.so b/cuda_functions/roi_align_2D/roi_align/_ext/crop_and_resize/_crop_and_resize.so
deleted file mode 100755
index e852f11..0000000
Binary files a/cuda_functions/roi_align_2D/roi_align/_ext/crop_and_resize/_crop_and_resize.so and /dev/null differ
diff --git a/cuda_functions/roi_align_2D/roi_align/build.py b/cuda_functions/roi_align_2D/roi_align/build.py
deleted file mode 100755
index 3798d82..0000000
--- a/cuda_functions/roi_align_2D/roi_align/build.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import os
-import torch
-from torch.utils.ffi import create_extension
-
-
-sources = ['src/crop_and_resize.c']
-headers = ['src/crop_and_resize.h']
-defines = []
-with_cuda = False
-
-extra_objects = []
-if torch.cuda.is_available():
-    print('Including CUDA code.')
-    sources += ['src/crop_and_resize_gpu.c']
-    headers += ['src/crop_and_resize_gpu.h']
-    defines += [('WITH_CUDA', None)]
-    extra_objects += ['src/cuda/crop_and_resize_kernel.cu.o']
-    with_cuda = True
-
-extra_compile_args = ['-fopenmp', '-std=c99']
-
-this_file = os.path.dirname(os.path.realpath(__file__))
-print(this_file)
-sources = [os.path.join(this_file, fname) for fname in sources]
-headers = [os.path.join(this_file, fname) for fname in headers]
-extra_objects = [os.path.join(this_file, fname) for fname in extra_objects]
-
-ffi = create_extension(
-    '_ext.crop_and_resize',
-    headers=headers,
-    sources=sources,
-    define_macros=defines,
-    relative_to=__file__,
-    with_cuda=with_cuda,
-    extra_objects=extra_objects,
-    extra_compile_args=extra_compile_args
-)
-
-if __name__ == '__main__':
-    ffi.build()
diff --git a/cuda_functions/roi_align_2D/roi_align/crop_and_resize.py b/cuda_functions/roi_align_2D/roi_align/crop_and_resize.py
deleted file mode 100755
index 4291ae4..0000000
--- a/cuda_functions/roi_align_2D/roi_align/crop_and_resize.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.autograd import Function
-
-from ._ext import crop_and_resize as _backend
-
-
-class CropAndResizeFunction(Function):
-
-    def __init__(self, crop_height, crop_width, extrapolation_value=0):
-        self.crop_height = crop_height
-        self.crop_width = crop_width
-        self.extrapolation_value = extrapolation_value
-
-    def forward(self, image, boxes, box_ind):
-        crops = torch.zeros_like(image)
-        if image.is_cuda:
-            _backend.crop_and_resize_gpu_forward(
-                image, boxes, box_ind,
-                self.extrapolation_value, self.crop_height, self.crop_width, crops)
-        else:
-            _backend.crop_and_resize_forward(
-                image, boxes, box_ind,
-                self.extrapolation_value, self.crop_height, self.crop_width, crops)
-
-        # save for backward
-        self.im_size = image.size()
-        self.save_for_backward(boxes, box_ind)
-
-        return crops
-
-    def backward(self, grad_outputs):
-        boxes, box_ind = self.saved_tensors
-
-        grad_outputs = grad_outputs.contiguous()
-        grad_image = torch.zeros_like(grad_outputs).resize_(*self.im_size)
-
-        if grad_outputs.is_cuda:
-            _backend.crop_and_resize_gpu_backward(
-                grad_outputs, boxes, box_ind, grad_image
-            )
-        else:
-            _backend.crop_and_resize_backward(
-                grad_outputs, boxes, box_ind, grad_image
-            )
-
-        return grad_image, None, None
-
-
-class CropAndResize(nn.Module):
-    """
-    Crop and resize ported from tensorflow
-    See more details on https://www.tensorflow.org/api_docs/python/tf/image/crop_and_resize
-    """
-
-    def __init__(self, crop_height, crop_width, extrapolation_value=0):
-        super(CropAndResize, self).__init__()
-
-        self.crop_height = crop_height
-        self.crop_width = crop_width
-        self.extrapolation_value = extrapolation_value
-
-    def forward(self, image, boxes, box_ind):
-        return CropAndResizeFunction(self.crop_height, self.crop_width, self.extrapolation_value)(image, boxes, box_ind)
diff --git a/cuda_functions/roi_align_2D/roi_align/roi_align.py b/cuda_functions/roi_align_2D/roi_align/roi_align.py
deleted file mode 100644
index 6931539..0000000
--- a/cuda_functions/roi_align_2D/roi_align/roi_align.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import torch
-from torch import nn
-
-from .crop_and_resize import CropAndResizeFunction, CropAndResize
-
-
-class RoIAlign(nn.Module):
-
-    def __init__(self, crop_height, crop_width, extrapolation_value=0, transform_fpcoor=True):
-        super(RoIAlign, self).__init__()
-
-        self.crop_height = crop_height
-        self.crop_width = crop_width
-        self.extrapolation_value = extrapolation_value
-        self.transform_fpcoor = transform_fpcoor
-
-    def forward(self, featuremap, boxes, box_ind):
-        """
-        RoIAlign based on crop_and_resize.
-        See more details on https://github.com/ppwwyyxx/tensorpack/blob/6d5ba6a970710eaaa14b89d24aace179eb8ee1af/examples/FasterRCNN/model.py#L301
-        :param featuremap: NxCxHxW
-        :param boxes: Mx4 float box with (x1, y1, x2, y2) **without normalization**
-        :param box_ind: M
-        :return: MxCxoHxoW
-        """
-        x1, y1, x2, y2 = torch.split(boxes, 1, dim=1)
-        image_height, image_width = featuremap.size()[2:4]
-
-        if self.transform_fpcoor:
-            spacing_w = (x2 - x1) / float(self.crop_width)
-            spacing_h = (y2 - y1) / float(self.crop_height)
-
-            nx0 = (x1 + spacing_w / 2 - 0.5) / float(image_width - 1)
-            ny0 = (y1 + spacing_h / 2 - 0.5) / float(image_height - 1)
-            nw = spacing_w * float(self.crop_width - 1) / float(image_width - 1)
-            nh = spacing_h * float(self.crop_height - 1) / float(image_height - 1)
-
-            boxes = torch.cat((ny0, nx0, ny0 + nh, nx0 + nw), 1)
-        else:
-            x1 = x1 / float(image_width - 1)
-            x2 = x2 / float(image_width - 1)
-            y1 = y1 / float(image_height - 1)
-            y2 = y2 / float(image_height - 1)
-            boxes = torch.cat((y1, x1, y2, x2), 1)
-
-        boxes = boxes.detach().contiguous()
-        box_ind = box_ind.detach()
-        return CropAndResizeFunction(self.crop_height, self.crop_width, self.extrapolation_value)(featuremap, boxes, box_ind)
diff --git a/cuda_functions/roi_align_2D/roi_align/src/crop_and_resize.c b/cuda_functions/roi_align_2D/roi_align/src/crop_and_resize.c
deleted file mode 100644
index e1fce67..0000000
--- a/cuda_functions/roi_align_2D/roi_align/src/crop_and_resize.c
+++ /dev/null
@@ -1,252 +0,0 @@
-#include <TH/TH.h>
-#include <stdio.h>
-#include <math.h>
-
-
-void CropAndResizePerBox(
-    const float * image_data, 
-    const int batch_size,
-    const int depth,
-    const int image_height,
-    const int image_width,
-
-    const float * boxes_data, 
-    const int * box_index_data,
-    const int start_box, 
-    const int limit_box,
-
-    float * corps_data,
-    const int crop_height,
-    const int crop_width,
-    const float extrapolation_value
-) {
-    const int image_channel_elements = image_height * image_width;
-    const int image_elements = depth * image_channel_elements;
-
-    const int channel_elements = crop_height * crop_width;
-    const int crop_elements = depth * channel_elements;
-
-    int b;
-    #pragma omp parallel for
-    for (b = start_box; b < limit_box; ++b) {
-        const float * box = boxes_data + b * 4;
-        const float y1 = box[0];
-        const float x1 = box[1];
-        const float y2 = box[2];
-        const float x2 = box[3];
-
-        const int b_in = box_index_data[b];
-        if (b_in < 0 || b_in >= batch_size) {
-            printf("Error: batch_index %d out of range [0, %d)\n", b_in, batch_size);
-            exit(-1);
-        }
-
-        const float height_scale =
-            (crop_height > 1)
-                ? (y2 - y1) * (image_height - 1) / (crop_height - 1)
-                : 0;
-        const float width_scale =
-            (crop_width > 1) ? (x2 - x1) * (image_width - 1) / (crop_width - 1)
-                             : 0;
-
-        for (int y = 0; y < crop_height; ++y)
-        {
-            const float in_y = (crop_height > 1)
-                                   ? y1 * (image_height - 1) + y * height_scale
-                                   : 0.5 * (y1 + y2) * (image_height - 1);
-
-            if (in_y < 0 || in_y > image_height - 1)
-            {
-                for (int x = 0; x < crop_width; ++x)
-                {
-                    for (int d = 0; d < depth; ++d)
-                    {
-                        // crops(b, y, x, d) = extrapolation_value;
-                        corps_data[crop_elements * b + channel_elements * d + y * crop_width + x] = extrapolation_value;
-                    }
-                }
-                continue;
-            }
-            
-            const int top_y_index = floorf(in_y);
-            const int bottom_y_index = ceilf(in_y);
-            const float y_lerp = in_y - top_y_index;
-
-            for (int x = 0; x < crop_width; ++x)
-            {
-                const float in_x = (crop_width > 1)
-                                       ? x1 * (image_width - 1) + x * width_scale
-                                       : 0.5 * (x1 + x2) * (image_width - 1);
-                if (in_x < 0 || in_x > image_width - 1)
-                {
-                    for (int d = 0; d < depth; ++d)
-                    {
-                        corps_data[crop_elements * b + channel_elements * d + y * crop_width + x] = extrapolation_value;
-                    }
-                    continue;
-                }
-            
-                const int left_x_index = floorf(in_x);
-                const int right_x_index = ceilf(in_x);
-                const float x_lerp = in_x - left_x_index;
-
-                for (int d = 0; d < depth; ++d)
-                {   
-                    const float *pimage = image_data + b_in * image_elements + d * image_channel_elements;
-
-                    const float top_left = pimage[top_y_index * image_width + left_x_index];
-                    const float top_right = pimage[top_y_index * image_width + right_x_index];
-                    const float bottom_left = pimage[bottom_y_index * image_width + left_x_index];
-                    const float bottom_right = pimage[bottom_y_index * image_width + right_x_index];
-                    
-                    const float top = top_left + (top_right - top_left) * x_lerp;
-                    const float bottom =
-                        bottom_left + (bottom_right - bottom_left) * x_lerp;
-                        
-                    corps_data[crop_elements * b + channel_elements * d + y * crop_width + x] = top + (bottom - top) * y_lerp;
-                }
-            }   // end for x
-        }   // end for y
-    }   // end for b
-
-}
-
-
-void crop_and_resize_forward(
-    THFloatTensor * image,
-    THFloatTensor * boxes,      // [y1, x1, y2, x2]
-    THIntTensor * box_index,    // range in [0, batch_size)
-    const float extrapolation_value,
-    const int crop_height,
-    const int crop_width,
-    THFloatTensor * crops
-) {
-    const int batch_size = image->size[0];
-    const int depth = image->size[1];
-    const int image_height = image->size[2];
-    const int image_width = image->size[3];
-
-    const int num_boxes = boxes->size[0];
-
-    // init output space
-    THFloatTensor_resize4d(crops, num_boxes, depth, crop_height, crop_width);
-    THFloatTensor_zero(crops);
-
-    // crop_and_resize for each box
-    CropAndResizePerBox(
-        THFloatTensor_data(image),
-        batch_size,
-        depth,
-        image_height,
-        image_width,
-
-        THFloatTensor_data(boxes),
-        THIntTensor_data(box_index),
-        0,
-        num_boxes,
-
-        THFloatTensor_data(crops),
-        crop_height,
-        crop_width,
-        extrapolation_value
-    );
-
-}
-
-
-void crop_and_resize_backward(
-    THFloatTensor * grads,
-    THFloatTensor * boxes,      // [y1, x1, y2, x2]
-    THIntTensor * box_index,    // range in [0, batch_size)
-    THFloatTensor * grads_image // resize to [bsize, c, hc, wc]
-)
-{   
-    // shape
-    const int batch_size = grads_image->size[0];
-    const int depth = grads_image->size[1];
-    const int image_height = grads_image->size[2];
-    const int image_width = grads_image->size[3];
-
-    const int num_boxes = grads->size[0];
-    const int crop_height = grads->size[2];
-    const int crop_width = grads->size[3];
-
-    // n_elements
-    const int image_channel_elements = image_height * image_width;
-    const int image_elements = depth * image_channel_elements;
-
-    const int channel_elements = crop_height * crop_width;
-    const int crop_elements = depth * channel_elements;
-
-    // init output space
-    THFloatTensor_zero(grads_image);
-
-    // data pointer
-    const float * grads_data = THFloatTensor_data(grads);
-    const float * boxes_data = THFloatTensor_data(boxes);
-    const int * box_index_data = THIntTensor_data(box_index);
-    float * grads_image_data = THFloatTensor_data(grads_image);
-
-    for (int b = 0; b < num_boxes; ++b) {
-        const float * box = boxes_data + b * 4;
-        const float y1 = box[0];
-        const float x1 = box[1];
-        const float y2 = box[2];
-        const float x2 = box[3];
-
-        const int b_in = box_index_data[b];
-        if (b_in < 0 || b_in >= batch_size) {
-            printf("Error: batch_index %d out of range [0, %d)\n", b_in, batch_size);
-            exit(-1);
-        }
-
-        const float height_scale =
-            (crop_height > 1) ? (y2 - y1) * (image_height - 1) / (crop_height - 1)
-                              : 0;
-        const float width_scale =
-            (crop_width > 1) ? (x2 - x1) * (image_width - 1) / (crop_width - 1)
-                             : 0;
-
-        for (int y = 0; y < crop_height; ++y)
-        {
-            const float in_y = (crop_height > 1)
-                                   ? y1 * (image_height - 1) + y * height_scale
-                                   : 0.5 * (y1 + y2) * (image_height - 1);
-            if (in_y < 0 || in_y > image_height - 1)
-            {
-                continue;
-            }
-            const int top_y_index = floorf(in_y);
-            const int bottom_y_index = ceilf(in_y);
-            const float y_lerp = in_y - top_y_index;
-
-            for (int x = 0; x < crop_width; ++x)
-            {
-                const float in_x = (crop_width > 1)
-                                       ? x1 * (image_width - 1) + x * width_scale
-                                       : 0.5 * (x1 + x2) * (image_width - 1);
-                if (in_x < 0 || in_x > image_width - 1)
-                {
-                    continue;
-                }
-                const int left_x_index = floorf(in_x);
-                const int right_x_index = ceilf(in_x);
-                const float x_lerp = in_x - left_x_index;
-
-                for (int d = 0; d < depth; ++d)
-                {   
-                    float *pimage = grads_image_data + b_in * image_elements + d * image_channel_elements;
-                    const float grad_val = grads_data[crop_elements * b + channel_elements * d + y * crop_width + x];
-
-                    const float dtop = (1 - y_lerp) * grad_val;
-                    pimage[top_y_index * image_width + left_x_index] += (1 - x_lerp) * dtop;
-                    pimage[top_y_index * image_width + right_x_index] += x_lerp * dtop;
-
-                    const float dbottom = y_lerp * grad_val;
-                    pimage[bottom_y_index * image_width + left_x_index] += (1 - x_lerp) * dbottom;
-                    pimage[bottom_y_index * image_width + right_x_index] += x_lerp * dbottom;
-                }   // end d
-            }   // end x
-        }   // end y
-    }   // end b
-}
\ No newline at end of file
diff --git a/cuda_functions/roi_align_2D/roi_align/src/crop_and_resize.h b/cuda_functions/roi_align_2D/roi_align/src/crop_and_resize.h
deleted file mode 100644
index d494865..0000000
--- a/cuda_functions/roi_align_2D/roi_align/src/crop_and_resize.h
+++ /dev/null
@@ -1,16 +0,0 @@
-void crop_and_resize_forward(
-    THFloatTensor * image,
-    THFloatTensor * boxes,      // [y1, x1, y2, x2]
-    THIntTensor * box_index,    // range in [0, batch_size)
-    const float extrapolation_value,
-    const int crop_height,
-    const int crop_width,
-    THFloatTensor * crops
-);
-
-void crop_and_resize_backward(
-    THFloatTensor * grads,
-    THFloatTensor * boxes,      // [y1, x1, y2, x2]
-    THIntTensor * box_index,    // range in [0, batch_size)
-    THFloatTensor * grads_image // resize to [bsize, c, hc, wc]
-);
\ No newline at end of file
diff --git a/cuda_functions/roi_align_2D/roi_align/src/crop_and_resize_gpu.c b/cuda_functions/roi_align_2D/roi_align/src/crop_and_resize_gpu.c
deleted file mode 100644
index dd347c6..0000000
--- a/cuda_functions/roi_align_2D/roi_align/src/crop_and_resize_gpu.c
+++ /dev/null
@@ -1,68 +0,0 @@
-#include <THC/THC.h>
-#include "cuda/crop_and_resize_kernel.h"
-
-extern THCState *state;
-
-
-void crop_and_resize_gpu_forward(
-    THCudaTensor * image,
-    THCudaTensor * boxes,           // [y1, x1, y2, x2]
-    THCudaIntTensor * box_index,    // range in [0, batch_size)
-    const float extrapolation_value,
-    const int crop_height,
-    const int crop_width,
-    THCudaTensor * crops
-) {
-    const int batch_size = THCudaTensor_size(state, image, 0);
-    const int depth = THCudaTensor_size(state, image, 1);
-    const int image_height = THCudaTensor_size(state, image, 2);
-    const int image_width = THCudaTensor_size(state, image, 3);
-
-    const int num_boxes = THCudaTensor_size(state, boxes, 0);
-
-    // init output space
-    THCudaTensor_resize4d(state, crops, num_boxes, depth, crop_height, crop_width);
-    THCudaTensor_zero(state, crops);
-    cudaStream_t stream = THCState_getCurrentStream(state);
-    CropAndResizeLaucher(
-        THCudaTensor_data(state, image),
-        THCudaTensor_data(state, boxes),
-        THCudaIntTensor_data(state, box_index),
-        num_boxes, batch_size, image_height, image_width,
-        crop_height, crop_width, depth, extrapolation_value,
-        THCudaTensor_data(state, crops),
-        stream
-    );
-}
-
-
-void crop_and_resize_gpu_backward(
-    THCudaTensor * grads,
-    THCudaTensor * boxes,      // [y1, x1, y2, x2]
-    THCudaIntTensor * box_index,    // range in [0, batch_size)
-    THCudaTensor * grads_image // resize to [bsize, c, hc, wc]
-) {
-    // shape
-    const int batch_size = THCudaTensor_size(state, grads_image, 0);
-    const int depth = THCudaTensor_size(state, grads_image, 1);
-    const int image_height = THCudaTensor_size(state, grads_image, 2);
-    const int image_width = THCudaTensor_size(state, grads_image, 3);
-
-    const int num_boxes = THCudaTensor_size(state, grads, 0);
-    const int crop_height = THCudaTensor_size(state, grads, 2);
-    const int crop_width = THCudaTensor_size(state, grads, 3);
-
-    // init output space
-    THCudaTensor_zero(state, grads_image);
-
-    cudaStream_t stream = THCState_getCurrentStream(state);
-    CropAndResizeBackpropImageLaucher(
-        THCudaTensor_data(state, grads),
-        THCudaTensor_data(state, boxes),
-        THCudaIntTensor_data(state, box_index),
-        num_boxes, batch_size, image_height, image_width,
-        crop_height, crop_width, depth,
-        THCudaTensor_data(state, grads_image),
-        stream
-    );
-}
\ No newline at end of file
diff --git a/cuda_functions/roi_align_2D/roi_align/src/crop_and_resize_gpu.h b/cuda_functions/roi_align_2D/roi_align/src/crop_and_resize_gpu.h
deleted file mode 100644
index c2a64cf..0000000
--- a/cuda_functions/roi_align_2D/roi_align/src/crop_and_resize_gpu.h
+++ /dev/null
@@ -1,16 +0,0 @@
-void crop_and_resize_gpu_forward(
-    THCudaTensor * image,
-    THCudaTensor * boxes,           // [y1, x1, y2, x2]
-    THCudaIntTensor * box_index,    // range in [0, batch_size)
-    const float extrapolation_value,
-    const int crop_height,
-    const int crop_width,
-    THCudaTensor * crops
-);
-
-void crop_and_resize_gpu_backward(
-    THCudaTensor * grads,
-    THCudaTensor * boxes,      // [y1, x1, y2, x2]
-    THCudaIntTensor * box_index,    // range in [0, batch_size)
-    THCudaTensor * grads_image // resize to [bsize, c, hc, wc]
-);
\ No newline at end of file
diff --git a/cuda_functions/roi_align_2D/roi_align/src/cuda/backup.cu b/cuda_functions/roi_align_2D/roi_align/src/cuda/backup.cu
deleted file mode 100644
index 3a1ab8b..0000000
--- a/cuda_functions/roi_align_2D/roi_align/src/cuda/backup.cu
+++ /dev/null
@@ -1,243 +0,0 @@
-#include <math.h>
-#include <stdio.h>
-#include "crop_and_resize_kernel.h"
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                            \
-for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
-     i += blockDim.x * gridDim.x)
-
-
-__global__
-void CropAndResizeKernel(
-    const int nthreads, const float *image_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int crop_height, int crop_width, int depth,
-    float extrapolation_value, float *crops_ptr)
-{
-    CUDA_1D_KERNEL_LOOP(out_idx, nthreads)
-    {
-        // NHWC: out_idx = d + depth * (w + crop_width * (h + crop_height * b))
-        // NCHW: out_idx = w + crop_width * (h + crop_height * (d + depth * b))
-        int idx = out_idx;
-        const int x = idx % crop_width;
-        idx /= crop_width;
-        const int y = idx % crop_height;
-        idx /= crop_height;
-        const int d = idx % depth;
-        const int b = idx / depth;
-
-        const float y1 = boxes_ptr[b * 4];
-        const float x1 = boxes_ptr[b * 4 + 1];
-        const float y2 = boxes_ptr[b * 4 + 2];
-        const float x2 = boxes_ptr[b * 4 + 3];
-
- //       printf("INIT CUDA SCRIPT %f \n", idx);
-
-        const int b_in = box_ind_ptr[b];
-        if (b_in < 0 || b_in >= batch)
-        {
-            continue;
-        }
-
-        const float height_scale =
-            (crop_height > 1) ? (y2 - y1) * (image_height - 1) / (crop_height - 1)
-                                : 0;
-        const float width_scale =
-            (crop_width > 1) ? (x2 - x1) * (image_width - 1) / (crop_width - 1) : 0;
-
-        const float in_y = (crop_height > 1)
-                                ? y1 * (image_height - 1) + y * height_scale
-                                : 0.5 * (y1 + y2) * (image_height - 1);
-        if (in_y < 0 || in_y > image_height - 1)
-        {
-            crops_ptr[out_idx] = extrapolation_value;
-            continue;
-        }
-
-        const float in_x = (crop_width > 1)
-                                ? x1 * (image_width - 1) + x * width_scale
-                                : 0.5 * (x1 + x2) * (image_width - 1);
-        if (in_x < 0 || in_x > image_width - 1)
-        {
-            crops_ptr[out_idx] = extrapolation_value;
-            continue;
-        }
-
-        const int top_y_index = floorf(in_y);
-        const int bottom_y_index = ceilf(in_y);
-        const float y_lerp = in_y - top_y_index;
-
-        const int left_x_index = floorf(in_x);
-        const int right_x_index = ceilf(in_x);
-        const float x_lerp = in_x - left_x_index;
-
-        const float *pimage = image_ptr + (b_in * depth + d) * image_height * image_width;
-        const float top_left = pimage[top_y_index * image_width + left_x_index];
-        const float top_right = pimage[top_y_index * image_width + right_x_index];
-        const float bottom_left = pimage[bottom_y_index * image_width + left_x_index];
-        const float bottom_right = pimage[bottom_y_index * image_width + right_x_index];
-       // if (top_left == 0){
-         //   const float top = top_right}
-       // elif (top_right == 0){
-         //   const float top = top_left}
-       // else{
-            const float top = top_left + (top_right - top_left) * x_lerp;
-            //}
-
-        //if (bottom_left == 0){
-       //     const float bottom = bottom_right}
-       // elif (bottom_right == 0){
-        //    const float bottom = bottom_left}
-       // else{
-        const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
-        //}
-
-        //if (top == 0){
-         //    crops_ptr[out_idx] = bottom }
-       // elif (bottom == 0){
-        //    crops_ptr[out_idx] = top
-            //}
-       // else{
-        crops_ptr[out_idx] = top + (bottom - top) * y_lerp;
-        //}
-    }
-}
-
-__global__
-void CropAndResizeBackpropImageKernel(
-    const int nthreads, const float *grads_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int crop_height, int crop_width, int depth,
-    float *grads_image_ptr)
-{
-    CUDA_1D_KERNEL_LOOP(out_idx, nthreads)
-    {
-        // NHWC: out_idx = d + depth * (w + crop_width * (h + crop_height * b))
-        // NCHW: out_idx = w + crop_width * (h + crop_height * (d + depth * b))
-        int idx = out_idx;
-        const int x = idx % crop_width;
-        idx /= crop_width;
-        const int y = idx % crop_height;
-        idx /= crop_height;
-        const int d = idx % depth;
-        const int b = idx / depth;
-
-        const float y1 = boxes_ptr[b * 4];
-        const float x1 = boxes_ptr[b * 4 + 1];
-        const float y2 = boxes_ptr[b * 4 + 2];
-        const float x2 = boxes_ptr[b * 4 + 3];
-
-        const int b_in = box_ind_ptr[b];
-        if (b_in < 0 || b_in >= batch)
-        {
-            continue;
-        }
-
-        const float height_scale =
-            (crop_height > 1) ? (y2 - y1) * (image_height - 1) / (crop_height - 1)
-                                : 0;
-        const float width_scale =
-            (crop_width > 1) ? (x2 - x1) * (image_width - 1) / (crop_width - 1) : 0;
-
-        const float in_y = (crop_height > 1)
-                                ? y1 * (image_height - 1) + y * height_scale
-                                : 0.5 * (y1 + y2) * (image_height - 1);
-        if (in_y < 0 || in_y > image_height - 1)
-        {
-            continue;
-        }
-
-        const float in_x = (crop_width > 1)
-                                ? x1 * (image_width - 1) + x * width_scale
-                                : 0.5 * (x1 + x2) * (image_width - 1);
-        if (in_x < 0 || in_x > image_width - 1)
-        {
-            continue;
-        }
-
-        const int top_y_index = floorf(in_y);
-        const int bottom_y_index = ceilf(in_y);
-        const float y_lerp = in_y - top_y_index;
-
-        const int left_x_index = floorf(in_x);
-        const int right_x_index = ceilf(in_x);
-        const float x_lerp = in_x - left_x_index;
-
-        float *pimage = grads_image_ptr + (b_in * depth + d) * image_height * image_width;
-        const float dtop = (1 - y_lerp) * grads_ptr[out_idx];
-        atomicAdd(
-            pimage + top_y_index * image_width + left_x_index,
-            (1 - x_lerp) * dtop
-        );
-        atomicAdd(
-            pimage + top_y_index * image_width + right_x_index,
-            x_lerp * dtop
-        );
-
-        const float dbottom = y_lerp * grads_ptr[out_idx];
-        atomicAdd(
-            pimage + bottom_y_index * image_width + left_x_index,
-            (1 - x_lerp) * dbottom
-        );
-        atomicAdd(
-            pimage + bottom_y_index * image_width + right_x_index,
-            x_lerp * dbottom
-        );
-    }
-}
-
-
-void CropAndResizeLaucher(
-    const float *image_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int crop_height, int crop_width, int depth,
-    float extrapolation_value, float *crops_ptr, cudaStream_t stream)
-{
-    const int total_count = num_boxes * crop_height * crop_width * depth;
-    const int thread_per_block = 1024;
-    const int block_count = (total_count + thread_per_block - 1) / thread_per_block;
-    cudaError_t err;
-
-    if (total_count > 0)
-    {
-        CropAndResizeKernel<<<block_count, thread_per_block, 0, stream>>>(
-            total_count, image_ptr, boxes_ptr,
-            box_ind_ptr, num_boxes, batch, image_height, image_width,
-            crop_height, crop_width, depth, extrapolation_value, crops_ptr);
-
-        err = cudaGetLastError();
-        if (cudaSuccess != err)
-        {
-            fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
-            exit(-1);
-        }
-    }
-}
-
-
-void CropAndResizeBackpropImageLaucher(
-    const float *grads_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int crop_height, int crop_width, int depth,
-    float *grads_image_ptr, cudaStream_t stream)
-{
-    const int total_count = num_boxes * crop_height * crop_width * depth;
-    const int thread_per_block = 1024;
-    const int block_count = (total_count + thread_per_block - 1) / thread_per_block;
-    cudaError_t err;
-
-    if (total_count > 0)
-    {
-        CropAndResizeBackpropImageKernel<<<block_count, thread_per_block, 0, stream>>>(
-            total_count, grads_ptr, boxes_ptr,
-            box_ind_ptr, num_boxes, batch, image_height, image_width,
-            crop_height, crop_width, depth, grads_image_ptr);
-
-        err = cudaGetLastError();
-        if (cudaSuccess != err)
-        {
-            fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
-            exit(-1);
-        }
-    }
-}
\ No newline at end of file
diff --git a/cuda_functions/roi_align_2D/roi_align/src/cuda/crop_and_resize_kernel.cu b/cuda_functions/roi_align_2D/roi_align/src/cuda/crop_and_resize_kernel.cu
deleted file mode 100644
index 0702551..0000000
--- a/cuda_functions/roi_align_2D/roi_align/src/cuda/crop_and_resize_kernel.cu
+++ /dev/null
@@ -1,250 +0,0 @@
-#include <math.h>
-#include <stdio.h>
-#include "crop_and_resize_kernel.h"
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                            \
-for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
-     i += blockDim.x * gridDim.x)
-
-
-__global__
-void CropAndResizeKernel(
-    const int nthreads, const float *image_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int crop_height, int crop_width, int depth,
-    float extrapolation_value, float *crops_ptr)
-{
-    CUDA_1D_KERNEL_LOOP(out_idx, nthreads)
-    {
-        // NHWC: out_idx = d + depth * (w + crop_width * (h + crop_height * b))
-        // NCHW: out_idx = w + crop_width * (h + crop_height * (d + depth * b))
-        int idx = out_idx;
-        //printf("start %i \n", idx);
-        const int x = idx % crop_width;
-        idx /= crop_width;
-        const int y = idx % crop_height;
-        idx /= crop_height;
-        const int d = idx % depth;
-        const int b = idx / depth;
-
-        const float y1 = boxes_ptr[b * 4];
-        const float x1 = boxes_ptr[b * 4 + 1];
-        const float y2 = boxes_ptr[b * 4 + 2];
-        const float x2 = boxes_ptr[b * 4 + 3];
-
-        const int b_in = box_ind_ptr[b];
-        if (b_in < 0 || b_in >= batch)
-        {
-            continue;
-        }
-
-         const float height_scale =
-            (crop_height > 1) ? (y2 - y1) * (image_height) / (crop_height)
-                                : 0;
-        const float width_scale =
-            (crop_width > 1) ? (x2 - x1) * (image_width) / (crop_width) : 0;
-
-
-        float tmp_in_y = (crop_height > 1)
-                                ? y1 * (image_height ) + y * height_scale + height_scale/2 - 0.5
-                                : 0.5 * (y1 + y2) * (image_height);
-
-        if (tmp_in_y > image_height - 1)
- 	{
-  		 tmp_in_y = image_height - 1;
-       }
-        if (tmp_in_y < 0)
-        {
-            tmp_in_y = 0;
-        }
-	const float in_y = tmp_in_y;
-
-        float tmp_in_x = (crop_width > 1)
-                                ? x1 * (image_width ) + x * width_scale + width_scale/2 - 0.5
-                                : 0.5 * (x1 + x2) * (image_width );
-
-        if (tmp_in_x > image_width - 1)
- 	{
-	 tmp_in_x = image_width - 1;
-	}
-        if (tmp_in_x < 0)
-        {
-            tmp_in_x= 0;
-        }
-	const float in_x = tmp_in_x;
-
-        //printf("height_scale %f \n", height_scale);
-        //printf("width_scale %f \n", width_scale);
-        //printf("in_x %f \n", in_x);
-        //printf("in_y %f \n", in_y);
-
-        const int top_y_index = floorf(in_y);
-        const int bottom_y_index = ceilf(in_y);
-        const float y_lerp = in_y - top_y_index;
-
-        const int left_x_index = floorf(in_x);
-        const int right_x_index = ceilf(in_x);
-        const float x_lerp = in_x - left_x_index;
-
-        const float *pimage = image_ptr + (b_in * depth + d) * image_height * image_width;
-        const float top_left = pimage[top_y_index * image_width + left_x_index];
-        const float top_right = pimage[top_y_index * image_width + right_x_index];
-        const float bottom_left = pimage[bottom_y_index * image_width + left_x_index];
-        const float bottom_right = pimage[bottom_y_index * image_width + right_x_index];
-
-        const float top = top_left + (top_right - top_left) * x_lerp;
-        const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
-        crops_ptr[out_idx] = top + (bottom - top) * y_lerp;
-    }
-}
-
-__global__
-void CropAndResizeBackpropImageKernel(
-    const int nthreads, const float *grads_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int crop_height, int crop_width, int depth,
-    float *grads_image_ptr)
-{
-    CUDA_1D_KERNEL_LOOP(out_idx, nthreads)
-    {
-        // NHWC: out_idx = d + depth * (w + crop_width * (h + crop_height * b))
-        // NCHW: out_idx = w + crop_width * (h + crop_height * (d + depth * b))
-        int idx = out_idx;
-        const int x = idx % crop_width;
-        idx /= crop_width;
-        const int y = idx % crop_height;
-        idx /= crop_height;
-        const int d = idx % depth;
-        const int b = idx / depth;
-
-        const float y1 = boxes_ptr[b * 4];
-        const float x1 = boxes_ptr[b * 4 + 1];
-        const float y2 = boxes_ptr[b * 4 + 2];
-        const float x2 = boxes_ptr[b * 4 + 3];
-
-        const int b_in = box_ind_ptr[b];
-        if (b_in < 0 || b_in >= batch)
-        {
-            continue;
-        }
-
-        const float height_scale =
-            (crop_height > 1) ? (y2 - y1) * (image_height ) / (crop_height )
-                                : 0;
-        const float width_scale =
-            (crop_width > 1) ? (x2 - x1) * (image_width ) / (crop_width ) : 0;
-
-        float tmp_in_y = (crop_height > 1)
-                                ? y1 * (image_height ) + y * height_scale + height_scale/2 - 0.5
-                                : 0.5 * (y1 + y2) * (image_height);
-
-        if (tmp_in_y > image_height - 1)
- 	{
- 		 tmp_in_y = image_height - 1;
-       }
-        if (tmp_in_y < 0)
-        {
-            tmp_in_y = 0;
-        }
-	const float in_y = tmp_in_y;
-
-        float tmp_in_x = (crop_width > 1)
-                                ? x1 * (image_width ) + x * width_scale + width_scale/2 - 0.5
-                                : 0.5 * (x1 + x2) * (image_width );
-
-        if (tmp_in_x > image_width - 1)
- 	{
-	 tmp_in_x = image_width - 1;
-	}
-        if (tmp_in_x < 0)
-        {
-            tmp_in_x= 0;
-        }
-	const float in_x = tmp_in_x;
-
-        const int top_y_index = floorf(in_y);
-        const int bottom_y_index = ceilf(in_y);
-        const float y_lerp = in_y - top_y_index;
-
-        const int left_x_index = floorf(in_x);
-        const int right_x_index = ceilf(in_x);
-        const float x_lerp = in_x - left_x_index;
-
-        float *pimage = grads_image_ptr + (b_in * depth + d) * image_height * image_width;
-        const float dtop = (1 - y_lerp) * grads_ptr[out_idx];
-        atomicAdd(
-            pimage + top_y_index * image_width + left_x_index,
-            (1 - x_lerp) * dtop
-        );
-        atomicAdd(
-            pimage + top_y_index * image_width + right_x_index,
-            x_lerp * dtop
-        );
-
-        const float dbottom = y_lerp * grads_ptr[out_idx];
-        atomicAdd(
-            pimage + bottom_y_index * image_width + left_x_index,
-            (1 - x_lerp) * dbottom
-        );
-        atomicAdd(
-            pimage + bottom_y_index * image_width + right_x_index,
-            x_lerp * dbottom
-        );
-    }
-}
-
-
-void CropAndResizeLaucher(
-    const float *image_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int crop_height, int crop_width, int depth,
-    float extrapolation_value, float *crops_ptr, cudaStream_t stream)
-{
-    const int total_count = num_boxes * crop_height * crop_width * depth;
-    const int thread_per_block = 1024;
-    const int block_count = (total_count + thread_per_block - 1) / thread_per_block;
-    cudaError_t err;
-
-    if (total_count > 0)
-    {
-        CropAndResizeKernel<<<block_count, thread_per_block, 0, stream>>>(
-            total_count, image_ptr, boxes_ptr,
-            box_ind_ptr, num_boxes, batch, image_height, image_width,
-            crop_height, crop_width, depth, extrapolation_value, crops_ptr);
-
-        err = cudaGetLastError();
-        if (cudaSuccess != err)
-        {
-            fprintf(stderr, "cudaCheckError in Roi Align () failed : %s\n", cudaGetErrorString(err));
-            exit(-1);
-        }
-    }
-}
-
-
-void CropAndResizeBackpropImageLaucher(
-    const float *grads_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int crop_height, int crop_width, int depth,
-    float *grads_image_ptr, cudaStream_t stream)
-{
-    const int total_count = num_boxes * crop_height * crop_width * depth;
-    const int thread_per_block = 1024;
-    const int block_count = (total_count + thread_per_block - 1) / thread_per_block;
-    cudaError_t err;
-
-    if (total_count > 0)
-    {
-        CropAndResizeBackpropImageKernel<<<block_count, thread_per_block, 0, stream>>>(
-            total_count, grads_ptr, boxes_ptr,
-            box_ind_ptr, num_boxes, batch, image_height, image_width,
-            crop_height, crop_width, depth, grads_image_ptr);
-
-        err = cudaGetLastError();
-        if (cudaSuccess != err)
-        {
-            fprintf(stderr, "cudaCheckError() failed in Roi Align : %s\n", cudaGetErrorString(err));
-            exit(-1);
-        }
-    }
-}
\ No newline at end of file
diff --git a/cuda_functions/roi_align_2D/roi_align/src/cuda/crop_and_resize_kernel.cu.o b/cuda_functions/roi_align_2D/roi_align/src/cuda/crop_and_resize_kernel.cu.o
deleted file mode 100644
index 2f1a1b9..0000000
Binary files a/cuda_functions/roi_align_2D/roi_align/src/cuda/crop_and_resize_kernel.cu.o and /dev/null differ
diff --git a/cuda_functions/roi_align_2D/roi_align/src/cuda/crop_and_resize_kernel.h b/cuda_functions/roi_align_2D/roi_align/src/cuda/crop_and_resize_kernel.h
deleted file mode 100644
index 893aee1..0000000
--- a/cuda_functions/roi_align_2D/roi_align/src/cuda/crop_and_resize_kernel.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _CropAndResize_Kernel
-#define _CropAndResize_Kernel
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void CropAndResizeLaucher(
-    const float *image_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int crop_height, int crop_width, int depth,
-    float extrapolation_value, float *crops_ptr, cudaStream_t stream);
-
-void CropAndResizeBackpropImageLaucher(
-    const float *grads_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int crop_height, int crop_width, int depth,
-    float *grads_image_ptr, cudaStream_t stream);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
\ No newline at end of file
diff --git a/cuda_functions/roi_align_2D/roi_align/src/cuda/fix.cu b/cuda_functions/roi_align_2D/roi_align/src/cuda/fix.cu
deleted file mode 100644
index 6eea4a8..0000000
--- a/cuda_functions/roi_align_2D/roi_align/src/cuda/fix.cu
+++ /dev/null
@@ -1,243 +0,0 @@
-#include <math.h>
-#include <stdio.h>
-#include "crop_and_resize_kernel.h"
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                            \
-for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
-     i += blockDim.x * gridDim.x)
-
-
-__global__
-void CropAndResizeKernel(
-    const int nthreads, const float *image_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int crop_height, int crop_width, int depth,
-    float extrapolation_value, float *crops_ptr)
-{
-    CUDA_1D_KERNEL_LOOP(out_idx, nthreads)
-    {
-        // NHWC: out_idx = d + depth * (w + crop_width * (h + crop_height * b))
-        // NCHW: out_idx = w + crop_width * (h + crop_height * (d + depth * b))
-        int idx = out_idx;
-        const int x = idx % crop_width;
-        idx /= crop_width;
-        const int y = idx % crop_height;
-        idx /= crop_height;
-        const int d = idx % depth;
-        const int b = idx / depth;
-
-        const float y1 = boxes_ptr[b * 4];
-        const float x1 = boxes_ptr[b * 4 + 1];
-        const float y2 = boxes_ptr[b * 4 + 2];
-        const float x2 = boxes_ptr[b * 4 + 3];
-
- //       printf("INIT CUDA SCRIPT %f \n", idx);
-
-        const int b_in = box_ind_ptr[b];
-        if (b_in < 0 || b_in >= batch)
-        {
-            continue;
-        }
-
-        const float height_scale =
-            (crop_height > 1) ? (y2 - y1) * (image_height ) / (crop_height )
-                                : 0;
-        const float width_scale =
-            (crop_width > 1) ? (x2 - x1) * (image_width) / (crop_width ) : 0;
-
-        const float in_y = (crop_height > 1)
-                                ? y1 * (image_height ) + y * height_scale + height_scale/2 - 0.5
-                                : 0.5 * (y1 + y2) * (image_height );
-        if (in_y < 0 || in_y > image_height )
-        {
-            crops_ptr[out_idx] = extrapolation_value;
-            continue;
-        }
-
-        const float in_x = (crop_width > 1)
-                                ? x1 * (image_width ) + x * width_scale + width_scale/2 - 0.5
-                                : 0.5 * (x1 + x2) * (image_width );
-        if (in_x < 0 || in_x > image_width )
-        {
-            crops_ptr[out_idx] = extrapolation_value;
-            continue;
-        }
-
-        const int top_y_index = floorf(in_y);
-        const int bottom_y_index = ceilf(in_y);
-        const float y_lerp = in_y - top_y_index;
-
-        const int left_x_index = floorf(in_x);
-        const int right_x_index = ceilf(in_x);
-        const float x_lerp = in_x - left_x_index;
-
-        const float *pimage = image_ptr + (b_in * depth + d) * image_height * image_width;
-        const float top_left = pimage[top_y_index * image_width + left_x_index];
-        const float top_right = pimage[top_y_index * image_width + right_x_index];
-        const float bottom_left = pimage[bottom_y_index * image_width + left_x_index];
-        const float bottom_right = pimage[bottom_y_index * image_width + right_x_index];
-       // if (top_left == 0){
-         //   const float top = top_right}
-       // elif (top_right == 0){
-         //   const float top = top_left}
-       // else{
-            const float top = top_left + (top_right - top_left) * x_lerp;
-            //}
-
-        //if (bottom_left == 0){
-       //     const float bottom = bottom_right}
-       // elif (bottom_right == 0){
-        //    const float bottom = bottom_left}
-       // else{
-        const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
-        //}
-
-        //if (top == 0){
-         //    crops_ptr[out_idx] = bottom }
-       // elif (bottom == 0){
-        //    crops_ptr[out_idx] = top
-            //}
-       // else{
-        crops_ptr[out_idx] = top + (bottom - top) * y_lerp;
-        //}
-    }
-}
-
-__global__
-void CropAndResizeBackpropImageKernel(
-    const int nthreads, const float *grads_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int crop_height, int crop_width, int depth,
-    float *grads_image_ptr)
-{
-    CUDA_1D_KERNEL_LOOP(out_idx, nthreads)
-    {
-        // NHWC: out_idx = d + depth * (w + crop_width * (h + crop_height * b))
-        // NCHW: out_idx = w + crop_width * (h + crop_height * (d + depth * b))
-        int idx = out_idx;
-        const int x = idx % crop_width;
-        idx /= crop_width;
-        const int y = idx % crop_height;
-        idx /= crop_height;
-        const int d = idx % depth;
-        const int b = idx / depth;
-
-        const float y1 = boxes_ptr[b * 4];
-        const float x1 = boxes_ptr[b * 4 + 1];
-        const float y2 = boxes_ptr[b * 4 + 2];
-        const float x2 = boxes_ptr[b * 4 + 3];
-
-        const int b_in = box_ind_ptr[b];
-        if (b_in < 0 || b_in >= batch)
-        {
-            continue;
-        }
-
-        const float height_scale =
-            (crop_height > 1) ? (y2 - y1) * (image_height ) / (crop_height )
-                                : 0;
-        const float width_scale =
-            (crop_width > 1) ? (x2 - x1) * (image_width ) / (crop_width ) : 0;
-
-        const float in_y = (crop_height > 1)
-                                ? y1 * (image_height ) + y * height_scale + height_scale/2 - 0.5
-                                : 0.5 * (y1 + y2) * (image_height );
-        if (in_y < 0 || in_y > image_height )
-        {
-            continue;
-        }
-
-        const float in_x = (crop_width > 1)
-                                ? x1 * (image_width ) + x * width_scale + width_scale/2 - 0.5
-                                : 0.5 * (x1 + x2) * (image_width );
-        if (in_x < 0 || in_x > image_width )
-        {
-            continue;
-        }
-
-        const int top_y_index = floorf(in_y);
-        const int bottom_y_index = ceilf(in_y);
-        const float y_lerp = in_y - top_y_index;
-
-        const int left_x_index = floorf(in_x);
-        const int right_x_index = ceilf(in_x);
-        const float x_lerp = in_x - left_x_index;
-
-        float *pimage = grads_image_ptr + (b_in * depth + d) * image_height * image_width;
-        const float dtop = (1 - y_lerp) * grads_ptr[out_idx];
-        atomicAdd(
-            pimage + top_y_index * image_width + left_x_index,
-            (1 - x_lerp) * dtop
-        );
-        atomicAdd(
-            pimage + top_y_index * image_width + right_x_index,
-            x_lerp * dtop
-        );
-
-        const float dbottom = y_lerp * grads_ptr[out_idx];
-        atomicAdd(
-            pimage + bottom_y_index * image_width + left_x_index,
-            (1 - x_lerp) * dbottom
-        );
-        atomicAdd(
-            pimage + bottom_y_index * image_width + right_x_index,
-            x_lerp * dbottom
-        );
-    }
-}
-
-
-void CropAndResizeLaucher(
-    const float *image_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int crop_height, int crop_width, int depth,
-    float extrapolation_value, float *crops_ptr, cudaStream_t stream)
-{
-    const int total_count = num_boxes * crop_height * crop_width * depth;
-    const int thread_per_block = 1024;
-    const int block_count = (total_count + thread_per_block - 1) / thread_per_block;
-    cudaError_t err;
-
-    if (total_count > 0)
-    {
-        CropAndResizeKernel<<<block_count, thread_per_block, 0, stream>>>(
-            total_count, image_ptr, boxes_ptr,
-            box_ind_ptr, num_boxes, batch, image_height, image_width,
-            crop_height, crop_width, depth, extrapolation_value, crops_ptr);
-
-        err = cudaGetLastError();
-        if (cudaSuccess != err)
-        {
-            fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
-            exit(-1);
-        }
-    }
-}
-
-
-void CropAndResizeBackpropImageLaucher(
-    const float *grads_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int crop_height, int crop_width, int depth,
-    float *grads_image_ptr, cudaStream_t stream)
-{
-    const int total_count = num_boxes * crop_height * crop_width * depth;
-    const int thread_per_block = 1024;
-    const int block_count = (total_count + thread_per_block - 1) / thread_per_block;
-    cudaError_t err;
-
-    if (total_count > 0)
-    {
-        CropAndResizeBackpropImageKernel<<<block_count, thread_per_block, 0, stream>>>(
-            total_count, grads_ptr, boxes_ptr,
-            box_ind_ptr, num_boxes, batch, image_height, image_width,
-            crop_height, crop_width, depth, grads_image_ptr);
-
-        err = cudaGetLastError();
-        if (cudaSuccess != err)
-        {
-            fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
-            exit(-1);
-        }
-    }
-}
\ No newline at end of file
diff --git a/cuda_functions/roi_align_3D/__init__.py b/cuda_functions/roi_align_3D/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/cuda_functions/roi_align_3D/roi_align/__init__.py b/cuda_functions/roi_align_3D/roi_align/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/cuda_functions/roi_align_3D/roi_align/_ext/__init__.py b/cuda_functions/roi_align_3D/roi_align/_ext/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/cuda_functions/roi_align_3D/roi_align/_ext/crop_and_resize/._crop_and_resize.so.swp b/cuda_functions/roi_align_3D/roi_align/_ext/crop_and_resize/._crop_and_resize.so.swp
deleted file mode 100644
index 3db0ea4..0000000
Binary files a/cuda_functions/roi_align_3D/roi_align/_ext/crop_and_resize/._crop_and_resize.so.swp and /dev/null differ
diff --git a/cuda_functions/roi_align_3D/roi_align/_ext/crop_and_resize/__init__.py b/cuda_functions/roi_align_3D/roi_align/_ext/crop_and_resize/__init__.py
deleted file mode 100644
index 4486c09..0000000
--- a/cuda_functions/roi_align_3D/roi_align/_ext/crop_and_resize/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-
-from torch.utils.ffi import _wrap_function
-from ._crop_and_resize import lib as _lib, ffi as _ffi
-
-__all__ = []
-def _import_symbols(locals):
-    for symbol in dir(_lib):
-        fn = getattr(_lib, symbol)
-        if callable(fn):
-            locals[symbol] = _wrap_function(fn, _ffi)
-        else:
-            locals[symbol] = fn
-        __all__.append(symbol)
-
-_import_symbols(locals())
diff --git a/cuda_functions/roi_align_3D/roi_align/_ext/crop_and_resize/_crop_and_resize.so b/cuda_functions/roi_align_3D/roi_align/_ext/crop_and_resize/_crop_and_resize.so
deleted file mode 100755
index 81dc147..0000000
Binary files a/cuda_functions/roi_align_3D/roi_align/_ext/crop_and_resize/_crop_and_resize.so and /dev/null differ
diff --git a/cuda_functions/roi_align_3D/roi_align/build.py b/cuda_functions/roi_align_3D/roi_align/build.py
deleted file mode 100755
index 3798d82..0000000
--- a/cuda_functions/roi_align_3D/roi_align/build.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import os
-import torch
-from torch.utils.ffi import create_extension
-
-
-sources = ['src/crop_and_resize.c']
-headers = ['src/crop_and_resize.h']
-defines = []
-with_cuda = False
-
-extra_objects = []
-if torch.cuda.is_available():
-    print('Including CUDA code.')
-    sources += ['src/crop_and_resize_gpu.c']
-    headers += ['src/crop_and_resize_gpu.h']
-    defines += [('WITH_CUDA', None)]
-    extra_objects += ['src/cuda/crop_and_resize_kernel.cu.o']
-    with_cuda = True
-
-extra_compile_args = ['-fopenmp', '-std=c99']
-
-this_file = os.path.dirname(os.path.realpath(__file__))
-print(this_file)
-sources = [os.path.join(this_file, fname) for fname in sources]
-headers = [os.path.join(this_file, fname) for fname in headers]
-extra_objects = [os.path.join(this_file, fname) for fname in extra_objects]
-
-ffi = create_extension(
-    '_ext.crop_and_resize',
-    headers=headers,
-    sources=sources,
-    define_macros=defines,
-    relative_to=__file__,
-    with_cuda=with_cuda,
-    extra_objects=extra_objects,
-    extra_compile_args=extra_compile_args
-)
-
-if __name__ == '__main__':
-    ffi.build()
diff --git a/cuda_functions/roi_align_3D/roi_align/crop_and_resize.py b/cuda_functions/roi_align_3D/roi_align/crop_and_resize.py
deleted file mode 100755
index cff4e90..0000000
--- a/cuda_functions/roi_align_3D/roi_align/crop_and_resize.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.autograd import Function
-
-from ._ext import crop_and_resize as _backend
-
-
-class CropAndResizeFunction(Function):
-
-    def __init__(self, crop_height, crop_width, crop_zdepth, extrapolation_value=0):
-        self.crop_height = crop_height
-        self.crop_width = crop_width
-        self.crop_zdepth = crop_zdepth
-        self.extrapolation_value = extrapolation_value
-
-    def forward(self, image, boxes, box_ind):
-        crops = torch.zeros_like(image)
-
-        if image.is_cuda:
-            _backend.crop_and_resize_gpu_forward(
-                image, boxes, box_ind,
-                self.extrapolation_value, self.crop_height, self.crop_width, self.crop_zdepth, crops)
-        else:
-            _backend.crop_and_resize_forward(
-                image, boxes, box_ind,
-                self.extrapolation_value, self.crop_height, self.crop_width, self.crop_zdepth, crops)
-
-        # save for backward
-        self.im_size = image.size()
-        self.save_for_backward(boxes, box_ind)
-
-        return crops
-
-    def backward(self, grad_outputs):
-        boxes, box_ind = self.saved_tensors
-
-        grad_outputs = grad_outputs.contiguous()
-        grad_image = torch.zeros_like(grad_outputs).resize_(*self.im_size)
-
-        if grad_outputs.is_cuda:
-            _backend.crop_and_resize_gpu_backward(
-                grad_outputs, boxes, box_ind, grad_image
-            )
-        else:
-            _backend.crop_and_resize_backward(
-                grad_outputs, boxes, box_ind, grad_image
-            )
-
-        return grad_image, None, None
-
-
-class CropAndResize(nn.Module):
-    """
-    Crop and resize ported from tensorflow
-    See more details on https://www.tensorflow.org/api_docs/python/tf/image/crop_and_resize
-    """
-
-    def __init__(self, crop_height, crop_width, crop_zdepth, extrapolation_value=0):
-        super(CropAndResize, self).__init__()
-
-        self.crop_height = crop_height
-        self.crop_width = crop_width
-        self.crop_zdepth = crop_zdepth
-        self.extrapolation_value = extrapolation_value
-
-    def forward(self, image, boxes, box_ind):
-        return CropAndResizeFunction(self.crop_height, self.crop_width, self.crop_zdepth, self.extrapolation_value)(image, boxes, box_ind)
diff --git a/cuda_functions/roi_align_3D/roi_align/roi_align.py b/cuda_functions/roi_align_3D/roi_align/roi_align.py
deleted file mode 100644
index 6931539..0000000
--- a/cuda_functions/roi_align_3D/roi_align/roi_align.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import torch
-from torch import nn
-
-from .crop_and_resize import CropAndResizeFunction, CropAndResize
-
-
-class RoIAlign(nn.Module):
-
-    def __init__(self, crop_height, crop_width, extrapolation_value=0, transform_fpcoor=True):
-        super(RoIAlign, self).__init__()
-
-        self.crop_height = crop_height
-        self.crop_width = crop_width
-        self.extrapolation_value = extrapolation_value
-        self.transform_fpcoor = transform_fpcoor
-
-    def forward(self, featuremap, boxes, box_ind):
-        """
-        RoIAlign based on crop_and_resize.
-        See more details on https://github.com/ppwwyyxx/tensorpack/blob/6d5ba6a970710eaaa14b89d24aace179eb8ee1af/examples/FasterRCNN/model.py#L301
-        :param featuremap: NxCxHxW
-        :param boxes: Mx4 float box with (x1, y1, x2, y2) **without normalization**
-        :param box_ind: M
-        :return: MxCxoHxoW
-        """
-        x1, y1, x2, y2 = torch.split(boxes, 1, dim=1)
-        image_height, image_width = featuremap.size()[2:4]
-
-        if self.transform_fpcoor:
-            spacing_w = (x2 - x1) / float(self.crop_width)
-            spacing_h = (y2 - y1) / float(self.crop_height)
-
-            nx0 = (x1 + spacing_w / 2 - 0.5) / float(image_width - 1)
-            ny0 = (y1 + spacing_h / 2 - 0.5) / float(image_height - 1)
-            nw = spacing_w * float(self.crop_width - 1) / float(image_width - 1)
-            nh = spacing_h * float(self.crop_height - 1) / float(image_height - 1)
-
-            boxes = torch.cat((ny0, nx0, ny0 + nh, nx0 + nw), 1)
-        else:
-            x1 = x1 / float(image_width - 1)
-            x2 = x2 / float(image_width - 1)
-            y1 = y1 / float(image_height - 1)
-            y2 = y2 / float(image_height - 1)
-            boxes = torch.cat((y1, x1, y2, x2), 1)
-
-        boxes = boxes.detach().contiguous()
-        box_ind = box_ind.detach()
-        return CropAndResizeFunction(self.crop_height, self.crop_width, self.extrapolation_value)(featuremap, boxes, box_ind)
diff --git a/cuda_functions/roi_align_3D/roi_align/src/crop_and_resize.c b/cuda_functions/roi_align_3D/roi_align/src/crop_and_resize.c
deleted file mode 100644
index e1fce67..0000000
--- a/cuda_functions/roi_align_3D/roi_align/src/crop_and_resize.c
+++ /dev/null
@@ -1,252 +0,0 @@
-#include <TH/TH.h>
-#include <stdio.h>
-#include <math.h>
-
-
-void CropAndResizePerBox(
-    const float * image_data, 
-    const int batch_size,
-    const int depth,
-    const int image_height,
-    const int image_width,
-
-    const float * boxes_data, 
-    const int * box_index_data,
-    const int start_box, 
-    const int limit_box,
-
-    float * corps_data,
-    const int crop_height,
-    const int crop_width,
-    const float extrapolation_value
-) {
-    const int image_channel_elements = image_height * image_width;
-    const int image_elements = depth * image_channel_elements;
-
-    const int channel_elements = crop_height * crop_width;
-    const int crop_elements = depth * channel_elements;
-
-    int b;
-    #pragma omp parallel for
-    for (b = start_box; b < limit_box; ++b) {
-        const float * box = boxes_data + b * 4;
-        const float y1 = box[0];
-        const float x1 = box[1];
-        const float y2 = box[2];
-        const float x2 = box[3];
-
-        const int b_in = box_index_data[b];
-        if (b_in < 0 || b_in >= batch_size) {
-            printf("Error: batch_index %d out of range [0, %d)\n", b_in, batch_size);
-            exit(-1);
-        }
-
-        const float height_scale =
-            (crop_height > 1)
-                ? (y2 - y1) * (image_height - 1) / (crop_height - 1)
-                : 0;
-        const float width_scale =
-            (crop_width > 1) ? (x2 - x1) * (image_width - 1) / (crop_width - 1)
-                             : 0;
-
-        for (int y = 0; y < crop_height; ++y)
-        {
-            const float in_y = (crop_height > 1)
-                                   ? y1 * (image_height - 1) + y * height_scale
-                                   : 0.5 * (y1 + y2) * (image_height - 1);
-
-            if (in_y < 0 || in_y > image_height - 1)
-            {
-                for (int x = 0; x < crop_width; ++x)
-                {
-                    for (int d = 0; d < depth; ++d)
-                    {
-                        // crops(b, y, x, d) = extrapolation_value;
-                        corps_data[crop_elements * b + channel_elements * d + y * crop_width + x] = extrapolation_value;
-                    }
-                }
-                continue;
-            }
-            
-            const int top_y_index = floorf(in_y);
-            const int bottom_y_index = ceilf(in_y);
-            const float y_lerp = in_y - top_y_index;
-
-            for (int x = 0; x < crop_width; ++x)
-            {
-                const float in_x = (crop_width > 1)
-                                       ? x1 * (image_width - 1) + x * width_scale
-                                       : 0.5 * (x1 + x2) * (image_width - 1);
-                if (in_x < 0 || in_x > image_width - 1)
-                {
-                    for (int d = 0; d < depth; ++d)
-                    {
-                        corps_data[crop_elements * b + channel_elements * d + y * crop_width + x] = extrapolation_value;
-                    }
-                    continue;
-                }
-            
-                const int left_x_index = floorf(in_x);
-                const int right_x_index = ceilf(in_x);
-                const float x_lerp = in_x - left_x_index;
-
-                for (int d = 0; d < depth; ++d)
-                {   
-                    const float *pimage = image_data + b_in * image_elements + d * image_channel_elements;
-
-                    const float top_left = pimage[top_y_index * image_width + left_x_index];
-                    const float top_right = pimage[top_y_index * image_width + right_x_index];
-                    const float bottom_left = pimage[bottom_y_index * image_width + left_x_index];
-                    const float bottom_right = pimage[bottom_y_index * image_width + right_x_index];
-                    
-                    const float top = top_left + (top_right - top_left) * x_lerp;
-                    const float bottom =
-                        bottom_left + (bottom_right - bottom_left) * x_lerp;
-                        
-                    corps_data[crop_elements * b + channel_elements * d + y * crop_width + x] = top + (bottom - top) * y_lerp;
-                }
-            }   // end for x
-        }   // end for y
-    }   // end for b
-
-}
-
-
-void crop_and_resize_forward(
-    THFloatTensor * image,
-    THFloatTensor * boxes,      // [y1, x1, y2, x2]
-    THIntTensor * box_index,    // range in [0, batch_size)
-    const float extrapolation_value,
-    const int crop_height,
-    const int crop_width,
-    THFloatTensor * crops
-) {
-    const int batch_size = image->size[0];
-    const int depth = image->size[1];
-    const int image_height = image->size[2];
-    const int image_width = image->size[3];
-
-    const int num_boxes = boxes->size[0];
-
-    // init output space
-    THFloatTensor_resize4d(crops, num_boxes, depth, crop_height, crop_width);
-    THFloatTensor_zero(crops);
-
-    // crop_and_resize for each box
-    CropAndResizePerBox(
-        THFloatTensor_data(image),
-        batch_size,
-        depth,
-        image_height,
-        image_width,
-
-        THFloatTensor_data(boxes),
-        THIntTensor_data(box_index),
-        0,
-        num_boxes,
-
-        THFloatTensor_data(crops),
-        crop_height,
-        crop_width,
-        extrapolation_value
-    );
-
-}
-
-
-void crop_and_resize_backward(
-    THFloatTensor * grads,
-    THFloatTensor * boxes,      // [y1, x1, y2, x2]
-    THIntTensor * box_index,    // range in [0, batch_size)
-    THFloatTensor * grads_image // resize to [bsize, c, hc, wc]
-)
-{   
-    // shape
-    const int batch_size = grads_image->size[0];
-    const int depth = grads_image->size[1];
-    const int image_height = grads_image->size[2];
-    const int image_width = grads_image->size[3];
-
-    const int num_boxes = grads->size[0];
-    const int crop_height = grads->size[2];
-    const int crop_width = grads->size[3];
-
-    // n_elements
-    const int image_channel_elements = image_height * image_width;
-    const int image_elements = depth * image_channel_elements;
-
-    const int channel_elements = crop_height * crop_width;
-    const int crop_elements = depth * channel_elements;
-
-    // init output space
-    THFloatTensor_zero(grads_image);
-
-    // data pointer
-    const float * grads_data = THFloatTensor_data(grads);
-    const float * boxes_data = THFloatTensor_data(boxes);
-    const int * box_index_data = THIntTensor_data(box_index);
-    float * grads_image_data = THFloatTensor_data(grads_image);
-
-    for (int b = 0; b < num_boxes; ++b) {
-        const float * box = boxes_data + b * 4;
-        const float y1 = box[0];
-        const float x1 = box[1];
-        const float y2 = box[2];
-        const float x2 = box[3];
-
-        const int b_in = box_index_data[b];
-        if (b_in < 0 || b_in >= batch_size) {
-            printf("Error: batch_index %d out of range [0, %d)\n", b_in, batch_size);
-            exit(-1);
-        }
-
-        const float height_scale =
-            (crop_height > 1) ? (y2 - y1) * (image_height - 1) / (crop_height - 1)
-                              : 0;
-        const float width_scale =
-            (crop_width > 1) ? (x2 - x1) * (image_width - 1) / (crop_width - 1)
-                             : 0;
-
-        for (int y = 0; y < crop_height; ++y)
-        {
-            const float in_y = (crop_height > 1)
-                                   ? y1 * (image_height - 1) + y * height_scale
-                                   : 0.5 * (y1 + y2) * (image_height - 1);
-            if (in_y < 0 || in_y > image_height - 1)
-            {
-                continue;
-            }
-            const int top_y_index = floorf(in_y);
-            const int bottom_y_index = ceilf(in_y);
-            const float y_lerp = in_y - top_y_index;
-
-            for (int x = 0; x < crop_width; ++x)
-            {
-                const float in_x = (crop_width > 1)
-                                       ? x1 * (image_width - 1) + x * width_scale
-                                       : 0.5 * (x1 + x2) * (image_width - 1);
-                if (in_x < 0 || in_x > image_width - 1)
-                {
-                    continue;
-                }
-                const int left_x_index = floorf(in_x);
-                const int right_x_index = ceilf(in_x);
-                const float x_lerp = in_x - left_x_index;
-
-                for (int d = 0; d < depth; ++d)
-                {   
-                    float *pimage = grads_image_data + b_in * image_elements + d * image_channel_elements;
-                    const float grad_val = grads_data[crop_elements * b + channel_elements * d + y * crop_width + x];
-
-                    const float dtop = (1 - y_lerp) * grad_val;
-                    pimage[top_y_index * image_width + left_x_index] += (1 - x_lerp) * dtop;
-                    pimage[top_y_index * image_width + right_x_index] += x_lerp * dtop;
-
-                    const float dbottom = y_lerp * grad_val;
-                    pimage[bottom_y_index * image_width + left_x_index] += (1 - x_lerp) * dbottom;
-                    pimage[bottom_y_index * image_width + right_x_index] += x_lerp * dbottom;
-                }   // end d
-            }   // end x
-        }   // end y
-    }   // end b
-}
\ No newline at end of file
diff --git a/cuda_functions/roi_align_3D/roi_align/src/crop_and_resize.h b/cuda_functions/roi_align_3D/roi_align/src/crop_and_resize.h
deleted file mode 100644
index d494865..0000000
--- a/cuda_functions/roi_align_3D/roi_align/src/crop_and_resize.h
+++ /dev/null
@@ -1,16 +0,0 @@
-void crop_and_resize_forward(
-    THFloatTensor * image,
-    THFloatTensor * boxes,      // [y1, x1, y2, x2]
-    THIntTensor * box_index,    // range in [0, batch_size)
-    const float extrapolation_value,
-    const int crop_height,
-    const int crop_width,
-    THFloatTensor * crops
-);
-
-void crop_and_resize_backward(
-    THFloatTensor * grads,
-    THFloatTensor * boxes,      // [y1, x1, y2, x2]
-    THIntTensor * box_index,    // range in [0, batch_size)
-    THFloatTensor * grads_image // resize to [bsize, c, hc, wc]
-);
\ No newline at end of file
diff --git a/cuda_functions/roi_align_3D/roi_align/src/crop_and_resize_gpu.c b/cuda_functions/roi_align_3D/roi_align/src/crop_and_resize_gpu.c
deleted file mode 100644
index 8e07b3d..0000000
--- a/cuda_functions/roi_align_3D/roi_align/src/crop_and_resize_gpu.c
+++ /dev/null
@@ -1,73 +0,0 @@
-#include <THC/THC.h>
-#include "cuda/crop_and_resize_kernel.h"
-
-extern THCState *state;
-
-
-void crop_and_resize_gpu_forward(
-    THCudaTensor * image,
-    THCudaTensor * boxes,           // [y1, x1, y2, x2]
-    THCudaIntTensor * box_index,    // range in [0, batch_size)
-    const float extrapolation_value,
-    const int crop_height,
-    const int crop_width,
-    const int crop_zdepth,
-    THCudaTensor * crops
-) {
-    const int batch_size = THCudaTensor_size(state, image, 0);
-    const int depth = THCudaTensor_size(state, image, 1);
-    const int image_height = THCudaTensor_size(state, image, 2);
-    const int image_width = THCudaTensor_size(state, image, 3);
-    const int image_zdepth = THCudaTensor_size(state, image, 4);
-
-    const int num_boxes = THCudaTensor_size(state, boxes, 0);
-
-    // init output space
-    THCudaTensor_resize5d(state, crops, num_boxes, depth, crop_height, crop_width, crop_zdepth);
-    THCudaTensor_zero(state, crops);
-
-    cudaStream_t stream = THCState_getCurrentStream(state);
-    CropAndResizeLaucher(
-        THCudaTensor_data(state, image),
-        THCudaTensor_data(state, boxes),
-        THCudaIntTensor_data(state, box_index),
-        num_boxes, batch_size, image_height, image_width, image_zdepth,
-        crop_height, crop_width, crop_zdepth, depth, extrapolation_value,
-        THCudaTensor_data(state, crops),
-        stream
-    );
-}
-
-
-void crop_and_resize_gpu_backward(
-    THCudaTensor * grads,
-    THCudaTensor * boxes,      // [y1, x1, y2, x2]
-    THCudaIntTensor * box_index,    // range in [0, batch_size)
-    THCudaTensor * grads_image // resize to [bsize, c, hc, wc]
-) {
-    // shape
-    const int batch_size = THCudaTensor_size(state, grads_image, 0);
-    const int depth = THCudaTensor_size(state, grads_image, 1);
-    const int image_height = THCudaTensor_size(state, grads_image, 2);
-    const int image_width = THCudaTensor_size(state, grads_image, 3);
-    const int image_zdepth = THCudaTensor_size(state, grads_image, 4);
-
-    const int num_boxes = THCudaTensor_size(state, grads, 0);
-    const int crop_height = THCudaTensor_size(state, grads, 2);
-    const int crop_width = THCudaTensor_size(state, grads, 3);
-    const int crop_zdepth = THCudaTensor_size(state, grads, 4);
-
-    // init output space
-    THCudaTensor_zero(state, grads_image);
-
-    cudaStream_t stream = THCState_getCurrentStream(state);
-    CropAndResizeBackpropImageLaucher(
-        THCudaTensor_data(state, grads),
-        THCudaTensor_data(state, boxes),
-        THCudaIntTensor_data(state, box_index),
-        num_boxes, batch_size, image_height, image_width, image_zdepth,
-        crop_height, crop_width, crop_zdepth, depth,
-        THCudaTensor_data(state, grads_image),
-        stream
-    );
-}
\ No newline at end of file
diff --git a/cuda_functions/roi_align_3D/roi_align/src/crop_and_resize_gpu.h b/cuda_functions/roi_align_3D/roi_align/src/crop_and_resize_gpu.h
deleted file mode 100644
index dd2eb5a..0000000
--- a/cuda_functions/roi_align_3D/roi_align/src/crop_and_resize_gpu.h
+++ /dev/null
@@ -1,17 +0,0 @@
-void crop_and_resize_gpu_forward(
-    THCudaTensor * image,
-    THCudaTensor * boxes,           // [y1, x1, y2, x2]
-    THCudaIntTensor * box_index,    // range in [0, batch_size)
-    const float extrapolation_value,
-    const int crop_height,
-    const int crop_width,
-    const int crop_zdepth,
-    THCudaTensor * crops
-);
-
-void crop_and_resize_gpu_backward(
-    THCudaTensor * grads,
-    THCudaTensor * boxes,      // [y1, x1, y2, x2]
-    THCudaIntTensor * box_index,    // range in [0, batch_size)
-    THCudaTensor * grads_image // resize to [bsize, c, hc, wc]
-);
\ No newline at end of file
diff --git a/cuda_functions/roi_align_3D/roi_align/src/cuda/crop_and_resize_kernel.cu b/cuda_functions/roi_align_3D/roi_align/src/cuda/crop_and_resize_kernel.cu
deleted file mode 100644
index e381dab..0000000
--- a/cuda_functions/roi_align_3D/roi_align/src/cuda/crop_and_resize_kernel.cu
+++ /dev/null
@@ -1,361 +0,0 @@
-#include <math.h>
-#include <stdio.h>
-#include "crop_and_resize_kernel.h"
-#include <stdio.h>
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                            \
-for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
-     i += blockDim.x * gridDim.x)
-
-
-__global__
-void CropAndResizeKernel(
-    const int nthreads, const float *image_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int image_zdepth, int crop_height, int crop_width, int crop_zdepth, int depth,
-    float extrapolation_value, float *crops_ptr)
-{
-    CUDA_1D_KERNEL_LOOP(out_idx, nthreads) // nthreads = total_count!
-    {
-        // NHWC: out_idx = d + depth * (w + crop_width * (h + crop_height * b)) position in out grid!!!
-        // NCHW: out_idx = w + crop_width * (h + crop_height * (d + depth * b))   NCYX yes seems like xy is exchanged!
-        // NCHWZ: out_idx = z + crop_zdepth * (w + crop_width * (h + crop_height * (d + depth * b))) z == last.
-
-        int idx = out_idx;
-
-        const int z = idx % crop_zdepth;
-        idx /= crop_zdepth;
-        const int x = idx % crop_width;
-        idx /= crop_width;
-        const int y = idx % crop_height;
-        idx /= crop_height;
-
-        const int d = idx % depth;
-        const int b = idx / depth; // batch
-
-        const float y1 = boxes_ptr[b * 6]; // b = batch -> 0 // normalized coords!!
-        const float x1 = boxes_ptr[b * 6 + 1];
-        const float y2 = boxes_ptr[b * 6 + 2];
-        const float x2 = boxes_ptr[b * 6 + 3];
-        const float z1 = boxes_ptr[b * 6 + 4];
-        const float z2 = boxes_ptr[b * 6 + 5];
-
-        const int b_in = box_ind_ptr[b]; // == 0 in my case.
-        if (b_in < 0 || b_in >= batch)
-        {
-            continue;
-        }
-
-        // e.g. (0.4-0.3)*100 = 10 / 7 = 1.3 ratio proposal_size / crops_size. one cell in crops has size 1.3 in_pixel.
-
-        const float height_scale =
-            (crop_height > 1) ? (y2 - y1)  * (image_height ) / (crop_height ) : 0;
-        const float width_scale =
-            (crop_width > 1) ? (x2 - x1) * (image_width ) / (crop_width ) : 0;
-
-        const float zdepth_scale =
-            (crop_zdepth > 1) ? (z2 - z1) * (image_zdepth ) / (crop_zdepth ) : 0;
-
-
-        // e.g.  0.3*100 + 5 * 1.3 . Which floating coordinate is going into cell?
-        // e.g. y: 30 (lower bound prop) + 7.5 (current crop position * scale)
-
-
-        float tmp_in_y = (crop_height > 1)
-                                ? y1 * (image_height ) + y * height_scale + height_scale/2 - 0.5
-                                : 0.5 * (y1 + y2) * (image_height);
-
-        if (tmp_in_y > image_height - 1)
-        {
-         tmp_in_y = image_height - 1;
-        }
-        if (tmp_in_y < 0)
-        {
-         tmp_in_y = 0;
-        }
-        const float in_y = tmp_in_y;
-
-
-        float tmp_in_x = (crop_width > 1)
-                                ? x1 * (image_width ) + x * width_scale + width_scale/2 - 0.5
-                                : 0.5 * (x1 + x2) * (image_width );
-
-        if (tmp_in_x > image_width - 1)
-        {
-         tmp_in_x = image_width - 1;
-        }
-        if (tmp_in_x < 0)
-        {
-         tmp_in_x= 0;
-        }
-	    const float in_x = tmp_in_x;
-
-
-        float tmp_in_z = (crop_zdepth > 1)
-                            ? z1 * (image_zdepth ) + z * zdepth_scale + zdepth_scale/2 - 0.5
-                            : 0.5 * (z1 + z2) * (image_zdepth);
-
-        if (tmp_in_z > image_zdepth - 1)
-        {
-         tmp_in_z = image_zdepth - 1;
-        }
-        if (tmp_in_z < 0)
-        {
-         tmp_in_z= 0;
-        }
-        const float in_z = tmp_in_z;
-
-        // this is just rounding of the floating coord of grid cell. The distances to nearest grid points are
-        // memorized (lerp) to be used for bilinear interpolation later.
-        const int top_y_index = floorf(in_y);
-        const int bottom_y_index = ceilf(in_y);
-        const float y_lerp = in_y - top_y_index;
-
-        const int left_x_index = floorf(in_x);
-        const int right_x_index = ceilf(in_x);
-        const float x_lerp = in_x - left_x_index; //
-
-        const int front_z_index = floorf(in_z);
-        const int back_z_index = ceilf(in_z);
-        const float z_lerp = in_z - front_z_index;
-
-
-        // address of image + going to the right feature map.
-        const float *pimage = image_ptr + (b_in * depth + d) * image_height * image_width * image_zdepth;
-
-        // 1D address of corner points of in_coords to grid cell.
-        // NCHWZ: out_idx = z + crop_zdepth * (w + crop_width * (h + crop_height * (d + depth * b))) z == last.
-        const float top_left_front = pimage[front_z_index + image_zdepth * (left_x_index + image_width * top_y_index)];
-        const float top_right_front = pimage[front_z_index + image_zdepth * (right_x_index + image_width * top_y_index)];
-        const float bottom_left_front = pimage[front_z_index + image_zdepth * (left_x_index + image_width * bottom_y_index)];
-        const float bottom_right_front = pimage[front_z_index + image_zdepth * (right_x_index + image_width * bottom_y_index)];
-        const float top_left_back = pimage[back_z_index + image_zdepth * (left_x_index + image_width * top_y_index)];
-        const float top_right_back = pimage[back_z_index + image_zdepth * (right_x_index + image_width * top_y_index)];
-        const float bottom_left_back = pimage[back_z_index + image_zdepth * (left_x_index + image_width * bottom_y_index)];
-        const float bottom_right_back = pimage[back_z_index + image_zdepth * (right_x_index + image_width * bottom_y_index)];
-
-        // Bilinear Interpolation!! These are pixel values now! lerp is the interpolation distance!
-        // No Maxpool, only one point is sampled!
-        const float top_front = top_left_front + (top_right_front - top_left_front) * x_lerp;
-        const float bottom_front = bottom_left_front + (bottom_right_front - bottom_left_front) * x_lerp;
-        const float top_back = top_left_back + (top_right_back - top_left_back) * x_lerp;
-        const float bottom_back = bottom_left_back + (bottom_right_back - bottom_left_back) * x_lerp;
-
-        const float front = top_front + (bottom_front - top_front) * y_lerp;
-        const float back = top_back + (bottom_back - top_back) * y_lerp;
-
-        crops_ptr[out_idx] = front + (back - front) * z_lerp; // assign interpolated value to Grid cell!
-
-
-    }
-}
-
-__global__
-void CropAndResizeBackpropImageKernel(
-    const int nthreads, const float *grads_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int image_zdepth, int crop_height, int crop_width, int crop_zdepth, int depth,
-    float *grads_image_ptr)
-{
-    CUDA_1D_KERNEL_LOOP(out_idx, nthreads)
-    {
-        // NHWC: out_idx = d + depth * (w + crop_width * (h + crop_height * b))
-        // NCHW: out_idx = w + crop_width * (h + crop_height * (d + depth * b))
-        // NCHWZ: out_idx = z + crop_zdepth * (w + crop_width * (h + crop_height * (d + depth * b))) z == last.
-        int idx = out_idx;
-
-        const int z = idx % crop_zdepth;
-        idx /= crop_zdepth;
-        const int x = idx % crop_width;
-        idx /= crop_width;
-        const int y = idx % crop_height;
-        idx /= crop_height;
-        const int d = idx % depth;
-        const int b = idx / depth;
-
-        const float y1 = boxes_ptr[b * 6]; // b = batch -> 0 // normalized coords!!
-        const float x1 = boxes_ptr[b * 6 + 1];
-        const float y2 = boxes_ptr[b * 6 + 2];
-        const float x2 = boxes_ptr[b * 6 + 3];
-        const float z1 = boxes_ptr[b * 6 + 4];
-        const float z2 = boxes_ptr[b * 6 + 5];
-
-
-        const int b_in = box_ind_ptr[b];
-        if (b_in < 0 || b_in >= batch)
-        {
-            continue;
-        }
-
-        const float height_scale =
-            (crop_height > 1) ? (y2 - y1) * (image_height ) / (crop_height )
-                                : 0;
-        const float width_scale =
-            (crop_width > 1) ? (x2 - x1) * (image_width ) / (crop_width ) : 0;
-
-        const float zdepth_scale =
-            (crop_zdepth > 1) ? (z2 - z1) * (image_zdepth ) / (crop_zdepth ) : 0;
-
-
-        float tmp_in_y = (crop_height > 1)
-                                ? y1 * (image_height ) + y * height_scale + height_scale/2 - 0.5
-                                : 0.5 * (y1 + y2) * (image_height);
-        if (tmp_in_y > image_height - 1)
-        {
-         tmp_in_y = image_height - 1;
-        }
-        if (tmp_in_y < 0)
-        {
-         tmp_in_y = 0;
-        }
-        const float in_y = tmp_in_y;
-
-
-        float tmp_in_x = (crop_width > 1)
-                                ? x1 * (image_width ) + x * width_scale + width_scale/2 - 0.5
-                                : 0.5 * (x1 + x2) * (image_width );
-        if (tmp_in_x > image_width - 1)
-        {
-         tmp_in_x = image_width - 1;
-        }
-        if (tmp_in_x < 0)
-        {
-         tmp_in_x= 0;
-        }
-	    const float in_x = tmp_in_x;
-
-
-        float tmp_in_z = (crop_zdepth > 1)
-                            ? z1 * (image_zdepth ) + z * zdepth_scale + zdepth_scale/2 - 0.5
-                            : 0.5 * (z1 + z2) * (image_zdepth);
-        if (tmp_in_z > image_zdepth - 1)
-        {
-         tmp_in_z = image_zdepth - 1;
-        }
-        if (tmp_in_z < 0)
-        {
-         tmp_in_z= 0;
-        }
-        const float in_z = tmp_in_z;
-
-        const int top_y_index = floorf(in_y);
-        const int bottom_y_index = ceilf(in_y);
-        const float y_lerp = in_y - top_y_index;
-
-        const int left_x_index = floorf(in_x);
-        const int right_x_index = ceilf(in_x);
-        const float x_lerp = in_x - left_x_index;
-
-        const int front_z_index = floorf(in_z);
-        const int back_z_index = ceilf(in_z);
-        const float z_lerp = in_z - front_z_index;
-
-        float *pimage = grads_image_ptr + (b_in * depth + d) * image_height * image_width * image_zdepth;
-
-        // top left front
-        atomicAdd(
-            pimage + front_z_index + image_zdepth * (left_x_index + image_width * top_y_index),
-            (1 - x_lerp) * (1 - z_lerp) * (1 - y_lerp) * grads_ptr[out_idx]   // THIS IS BACKWARD INTERPOL.
-        );
-
-        // top left back
-        atomicAdd(
-            pimage + back_z_index + image_zdepth * (left_x_index + image_width * top_y_index),
-            (1 - x_lerp) * (z_lerp) * (1 - y_lerp) * grads_ptr[out_idx]   // THIS IS BACKWARD INTERPOL.
-        );
-
-        // top right front
-        atomicAdd(
-            pimage + front_z_index + image_zdepth * (right_x_index + image_width * top_y_index),
-            (x_lerp) * (1 - z_lerp) * (1 - y_lerp) * grads_ptr[out_idx]   // THIS IS backward INTERPOL.
-        );
-
-        // top right back
-        atomicAdd(
-            pimage + back_z_index + image_zdepth * (right_x_index + image_width * top_y_index),
-            (x_lerp) * (z_lerp) * (1 - y_lerp) * grads_ptr[out_idx]   // THIS IS backward INTERPOL.
-        );
-
-        // bottom left front
-        atomicAdd(
-            pimage + front_z_index + image_zdepth * (left_x_index + image_width * bottom_y_index),
-            (1 - x_lerp) * (1 - z_lerp) * (y_lerp) * grads_ptr[out_idx]   // THIS IS backward INTERPOL.
-        );
-
-        // bottom left back
-        atomicAdd(
-            pimage + back_z_index + image_zdepth * (left_x_index + image_width * bottom_y_index),
-            (1 - x_lerp) * (z_lerp) * (y_lerp) * grads_ptr[out_idx]   // THIS IS backward INTERPOL.
-        );
-
-        // bottom right front
-        atomicAdd(
-            pimage + front_z_index + image_zdepth * (right_x_index + image_width * bottom_y_index),
-            (x_lerp) * (1 - z_lerp) * (y_lerp) * grads_ptr[out_idx]   // THIS IS backward INTERPOL.
-        );
-
-        // bottom right back
-        atomicAdd(
-            pimage + back_z_index + image_zdepth * (right_x_index + image_width * bottom_y_index),
-            (x_lerp) * (z_lerp) * (y_lerp) * grads_ptr[out_idx]   // THIS IS backward INTERPOL.
-        );
-
-    }
-}
-
-
-
-void CropAndResizeLaucher(
-    const float *image_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int image_zdepth, int crop_height, int crop_width, int crop_zdepth, int depth,
-    float extrapolation_value, float *crops_ptr, cudaStream_t stream)
-{   
-    const int total_count = num_boxes * crop_height * crop_width * crop_zdepth * depth;
-    const int thread_per_block = 1024;
-    const int block_count = (total_count + thread_per_block - 1) / thread_per_block;
-    cudaError_t err;
-
-    if (total_count > 0)
-    {
-        CropAndResizeKernel<<<block_count, thread_per_block, 0, stream>>>(
-            total_count, image_ptr, boxes_ptr,
-            box_ind_ptr, num_boxes, batch, image_height, image_width, image_zdepth,
-            crop_height, crop_width, crop_zdepth, depth, extrapolation_value, crops_ptr);
-
-        err = cudaGetLastError();
-        if (cudaSuccess != err)
-        {
-            fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
-            exit(-1);
-        }
-    }
-}
-
-
-void CropAndResizeBackpropImageLaucher(
-    const float *grads_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int image_zdepth, int crop_height, int crop_width, int crop_zdepth, int depth,
-    float *grads_image_ptr, cudaStream_t stream)
-{   
-    const int total_count = num_boxes * crop_height * crop_width * crop_zdepth * depth;
-    const int thread_per_block = 1024;
-    const int block_count = (total_count + thread_per_block - 1) / thread_per_block;
-    cudaError_t err;
-
-    if (total_count > 0)
-    {
-        CropAndResizeBackpropImageKernel<<<block_count, thread_per_block, 0, stream>>>(
-            total_count, grads_ptr, boxes_ptr,
-            box_ind_ptr, num_boxes, batch, image_height, image_width, image_zdepth,
-            crop_height, crop_width, crop_zdepth, depth, grads_image_ptr);
-
-        err = cudaGetLastError();
-        if (cudaSuccess != err)
-        {
-            fprintf(stderr, "cudaCheckError() failed in Roi Align : %s\n", cudaGetErrorString(err));
-            exit(-1);
-        }
-    }
-}
\ No newline at end of file
diff --git a/cuda_functions/roi_align_3D/roi_align/src/cuda/crop_and_resize_kernel.cu.o b/cuda_functions/roi_align_3D/roi_align/src/cuda/crop_and_resize_kernel.cu.o
deleted file mode 100644
index d488598..0000000
Binary files a/cuda_functions/roi_align_3D/roi_align/src/cuda/crop_and_resize_kernel.cu.o and /dev/null differ
diff --git a/cuda_functions/roi_align_3D/roi_align/src/cuda/crop_and_resize_kernel.h b/cuda_functions/roi_align_3D/roi_align/src/cuda/crop_and_resize_kernel.h
deleted file mode 100644
index 9244582..0000000
--- a/cuda_functions/roi_align_3D/roi_align/src/cuda/crop_and_resize_kernel.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _CropAndResize_Kernel
-#define _CropAndResize_Kernel
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void CropAndResizeLaucher(
-    const float *image_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int image_zdepth, int crop_height, int crop_width, int crop_zdepth, int depth,
-    float extrapolation_value, float *crops_ptr, cudaStream_t stream);
-
-void CropAndResizeBackpropImageLaucher(
-    const float *grads_ptr, const float *boxes_ptr,
-    const int *box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int image_zdepth, int crop_height, int crop_width, int crop_zdepth, int depth,
-    float *grads_image_ptr, cudaStream_t stream);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
\ No newline at end of file
diff --git a/exec.py b/exec.py
index c2da02b..f02d8ab 100644
--- a/exec.py
+++ b/exec.py
@@ -1,348 +1,350 @@
 #!/usr/bin/env python
 # Copyright 2019 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
 """ execution script. this where all routines come together and the only script you need to call.
     refer to parse args below to see options for execution.
 """
 
 import plotting as plg
 
 import os
 import warnings
 import argparse
 import time
 
 import torch
 
 import utils.exp_utils as utils
 from evaluator import Evaluator
 from predictor import Predictor
 
 
 for msg in ["Attempting to set identical bottom==top results",
             "This figure includes Axes that are not compatible with tight_layout",
             "Data has no positive values, and therefore cannot be log-scaled.",
             ".*invalid value encountered in true_divide.*"]:
     warnings.filterwarnings("ignore", msg)
 
 
 def train(cf, logger):
     """
     performs the training routine for a given fold. saves plots and selected parameters to the experiment dir
     specified in the configs.
     
     """
     logger.info('performing training in {}D over fold {} on experiment {} with model {}'.format(
         cf.dim, cf.fold, cf.exp_dir, cf.model))
     logger.time("train_val")
 
     # -------------- inits and settings -----------------
     net = model.net(cf, logger).cuda()
     if cf.optimizer == "ADAM":
         optimizer = torch.optim.Adam(net.parameters(), lr=cf.learning_rate[0], weight_decay=cf.weight_decay)
     elif cf.optimizer == "SGD":
         optimizer = torch.optim.SGD(net.parameters(), lr=cf.learning_rate[0], weight_decay=cf.weight_decay, momentum=0.3)
     if cf.dynamic_lr_scheduling:
         scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode=cf.scheduling_mode, factor=cf.lr_decay_factor,
                                                                     patience=cf.scheduling_patience)
     model_selector = utils.ModelSelector(cf, logger)
 
     starting_epoch = 1
     if cf.resume_from_checkpoint:
         starting_epoch = utils.load_checkpoint(cf.resume_from_checkpoint, net, optimizer)
         logger.info('resumed from checkpoint {} at epoch {}'.format(cf.resume_from_checkpoint, starting_epoch))
 
     # prepare monitoring
     monitor_metrics = utils.prepare_monitoring(cf)
 
     logger.info('loading dataset and initializing batch generators...')
     batch_gen = data_loader.get_train_generators(cf, logger)
 
     # -------------- training -----------------
     for epoch in range(starting_epoch, cf.num_epochs + 1):
 
         logger.info('starting training epoch {}/{}'.format(epoch, cf.num_epochs))
         logger.time("train_epoch")
 
         net.train()
 
         train_results_list = []
         train_evaluator = Evaluator(cf, logger, mode='train')
 
         for i in range(cf.num_train_batches):
             logger.time("train_batch_loadfw")
             batch = next(batch_gen['train'])
             batch_gen['train'].generator.stats['roi_counts'] += batch['roi_counts']
             batch_gen['train'].generator.stats['empty_samples_count'] += batch['empty_samples_count']
 
             logger.time("train_batch_loadfw")
             logger.time("train_batch_netfw")
             results_dict = net.train_forward(batch)
             logger.time("train_batch_netfw")
             logger.time("train_batch_bw")
             optimizer.zero_grad()
             results_dict['torch_loss'].backward()
             if cf.clip_norm:
                 torch.nn.utils.clip_grad_norm_(net.parameters(), cf.clip_norm, norm_type=2) #gradient clipping
             optimizer.step()
             train_results_list.append(({k:v for k,v in results_dict.items() if k != "seg_preds"}, batch["pid"])) #slim res dict
             if not cf.server_env:
                 print("\rFinished training batch " +
                       "{}/{} in {:.1f}s ({:.2f}/{:.2f} forw load/net, {:.2f} backw).".format(i+1, cf.num_train_batches,
                                                                                              logger.get_time("train_batch_loadfw")+
                                                                                              logger.get_time("train_batch_netfw")
                                                                                              +logger.time("train_batch_bw"),
                                                                                              logger.get_time("train_batch_loadfw",reset=True),
                                                                                              logger.get_time("train_batch_netfw", reset=True),
                                                                                              logger.get_time("train_batch_bw", reset=True)), end="", flush=True)
         print()
 
         #--------------- train eval ----------------
         if (epoch-1)%cf.plot_frequency==0:
             # view an example batch
+            logger.time("train_plot")
             plg.view_batch(cf, batch, results_dict, has_colorchannels=cf.has_colorchannels, show_gt_labels=True,
-                               out_file=os.path.join(cf.plot_dir, 'batch_example_train_{}.png'.format(cf.fold)))
+                           out_file=os.path.join(cf.plot_dir, 'batch_example_train_{}.png'.format(cf.fold)))
+            logger.info("generated train-example plot in {:.2f}s".format(logger.get_time("train_plot", reset=True)))
 
 
         logger.time("evals")
         _, monitor_metrics['train'] = train_evaluator.evaluate_predictions(train_results_list, monitor_metrics['train'])
         #np_loss, torch_loss = train_loss_running_mean / cf.num_train_batches, monitor_metrics['train']["loss"][-1]
         #assert np_loss/torch_loss-1<0.005, "{} vs {}".format(np_loss, torch_loss)
         logger.time("evals")
         logger.time("train_epoch", toggle=False)
         del train_results_list
         #----------- validation ------------
         logger.info('starting validation in mode {}.'.format(cf.val_mode))
         logger.time("val_epoch")
         with torch.no_grad():
             net.eval()
             val_results_list = []
             val_evaluator = Evaluator(cf, logger, mode=cf.val_mode)
             val_predictor = Predictor(cf, net, logger, mode='val')
 
             for i in range(batch_gen['n_val']):
                 logger.time("val_batch")
                 batch = next(batch_gen[cf.val_mode])
                 if cf.val_mode == 'val_patient':
                     results_dict = val_predictor.predict_patient(batch)
                 elif cf.val_mode == 'val_sampling':
                     results_dict = net.train_forward(batch, is_validation=True)
                 val_results_list.append([results_dict, batch["pid"]])
                 if not cf.server_env:
                     print("\rFinished validation {} {}/{} in {:.1f}s.".format('patient' if cf.val_mode=='val_patient' else 'batch',
                                                                               i + 1, batch_gen['n_val'],
                                                                               logger.time("val_batch")), end="", flush=True)
             print()
 
             #------------ val eval -------------
             logger.time("val_plot")
             if (epoch - 1) % cf.plot_frequency == 0:
                 plg.view_batch(cf, batch, results_dict, has_colorchannels=cf.has_colorchannels, show_gt_labels=True,
                                out_file=os.path.join(cf.plot_dir, 'batch_example_val_{}.png'.format(cf.fold)))
             logger.time("val_plot")
 
             logger.time("evals")
             _, monitor_metrics['val'] = val_evaluator.evaluate_predictions(val_results_list, monitor_metrics['val'])
 
             model_selector.run_model_selection(net, optimizer, monitor_metrics, epoch)
             del val_results_list
             #----------- monitoring -------------
             monitor_metrics.update({"lr": 
                 {str(g) : group['lr'] for (g, group) in enumerate(optimizer.param_groups)}})
             logger.metrics2tboard(monitor_metrics, global_step=epoch)
             logger.time("evals")
 
             logger.info('finished epoch {}/{}, took {:.2f}s. train total: {:.2f}s, average: {:.2f}s. val total: {:.2f}s, average: {:.2f}s.'.format(
                 epoch, cf.num_epochs, logger.get_time("train_epoch")+logger.time("val_epoch"), logger.get_time("train_epoch"),
                 logger.get_time("train_epoch", reset=True)/cf.num_train_batches, logger.get_time("val_epoch"),
                 logger.get_time("val_epoch", reset=True)/batch_gen["n_val"]))
             logger.info("time for evals: {:.2f}s, val plot {:.2f}s".format(logger.get_time("evals", reset=True), logger.get_time("val_plot", reset=True)))
 
         #-------------- scheduling -----------------
         if not cf.dynamic_lr_scheduling:
             for param_group in optimizer.param_groups:
                 param_group['lr'] = cf.learning_rate[epoch-1]
         else:
             scheduler.step(monitor_metrics["val"][cf.scheduling_criterion][-1])
 
     logger.time("train_val")
     logger.info("Training and validating over {} epochs took {}".format(cf.num_epochs, logger.get_time("train_val", format="hms", reset=True)))
     batch_gen['train'].generator.print_stats(logger, plot=True)
 
 def test(cf, logger, max_fold=None):
     """performs testing for a given fold (or held out set). saves stats in evaluator.
     """
     logger.time("test_fold")
     logger.info('starting testing model of fold {} in exp {}'.format(cf.fold, cf.exp_dir))
     net = model.net(cf, logger).cuda()
     batch_gen = data_loader.get_test_generator(cf, logger)
 
     test_predictor = Predictor(cf, net, logger, mode='test')
     test_results_list = test_predictor.predict_test_set(batch_gen, return_results = not hasattr(
         cf, "eval_test_separately") or not cf.eval_test_separately)
 
     if test_results_list is not None:
         test_evaluator = Evaluator(cf, logger, mode='test')
         test_evaluator.evaluate_predictions(test_results_list)
         test_evaluator.score_test_df(max_fold=max_fold)
 
     mins, secs = divmod(logger.get_time("test_fold"), 60)
     h, mins = divmod(mins, 60)
     t = "{:d}h:{:02d}m:{:02d}s".format(int(h), int(mins), int(secs))
 
     logger.info('Testing of fold {} took {}.'.format(cf.fold, t))
 
 
 if __name__ == '__main__':
     stime = time.time()
 
     parser = argparse.ArgumentParser()
     parser.add_argument('-m', '--mode', type=str,  default='train_test', help='one out of: create_exp, analysis, train, train_test, or test')
     parser.add_argument('-f', '--folds', nargs='+', type=int, default=None, help='None runs over all folds in CV. otherwise specify list of folds.')
-    parser.add_argument('--exp_dir', type=str, default='/home/gregor/Documents/medicaldetectiontoolkit/datasets/prostate/experiments/dev',
+    parser.add_argument('--exp_dir', type=str, default='/home/gregor/Documents/regrcnn/datasets/toy/experiments/dev',
                         help='path to experiment dir. will be created if non existent.')
     parser.add_argument('--server_env', default=False, action='store_true', help='change IO settings to deploy models on a cluster.')
     parser.add_argument('--data_dest', type=str, default=None, help="path to final data folder if different from config")
     parser.add_argument('--use_stored_settings', default=False, action='store_true',
                         help='load configs from existing exp_dir instead of source dir. always done for testing, '
                              'but can be set to true to do the same for training. useful in job scheduler environment, '
                              'where source code might change before the job actually runs.')
     parser.add_argument('--resume_from_checkpoint', type=str, default=None,
                         help='path to checkpoint. if resuming from checkpoint, the desired fold still needs to be parsed via --folds.')
-    parser.add_argument('--dataset_name', type=str, default='prostate', help="path to the dataset-specific code in source_dir/datasets")
+    parser.add_argument('--dataset_name', type=str, default='toy', help="path to the dataset-specific code in source_dir/datasets")
     parser.add_argument('-d', '--dev', default=False, action='store_true', help="development mode: shorten everything")
 
     args = parser.parse_args()
     args.dataset_name = os.path.join("datasets", args.dataset_name) if not "datasets" in args.dataset_name else args.dataset_name
     folds = args.folds
     resume_from_checkpoint = None if args.resume_from_checkpoint in ['None', 'none'] else args.resume_from_checkpoint
 
     if args.mode == 'create_exp':
         cf = utils.prep_exp(args.dataset_name, args.exp_dir, args.server_env, use_stored_settings=False)
         logger = utils.get_logger(cf.exp_dir, cf.server_env)
         logger.info('created experiment directory at {}'.format(args.exp_dir))
 
     elif args.mode == 'train' or args.mode == 'train_test':
         cf = utils.prep_exp(args.dataset_name, args.exp_dir, args.server_env, args.use_stored_settings)
         if args.dev:
             folds = [0,1]
             cf.batch_size, cf.num_epochs, cf.min_save_thresh, cf.save_n_models = 3 if cf.dim==2 else 1, 1, 0, 1
-            cf.num_train_batches, cf.num_val_batches, cf.max_val_patients = 7, 1, 1
+            cf.num_train_batches, cf.num_val_batches, cf.max_val_patients = 5, 1, 1
             cf.test_n_epochs =  cf.save_n_models
             cf.max_test_patients = 1
             torch.backends.cudnn.benchmark = cf.dim==3
         else:
             torch.backends.cudnn.benchmark = cf.cuda_benchmark
         if args.data_dest is not None:
             cf.data_dest = args.data_dest
             
         logger = utils.get_logger(cf.exp_dir, cf.server_env)
         data_loader = utils.import_module('data_loader', os.path.join(args.dataset_name, 'data_loader.py'))
         model = utils.import_module('model', cf.model_path)
         logger.info("loaded model from {}".format(cf.model_path))
         if folds is None:
             folds = range(cf.n_cv_splits)
 
         for fold in folds:
             """k-fold cross-validation: the dataset is split into k equally-sized folds, one used for validation,
             one for testing, the rest for training. This loop iterates k-times over the dataset, cyclically moving the
             splits. k==folds, fold in [0,folds) says which split is used for testing.
             """
             cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(fold))
             cf.fold, logger.fold = fold, fold
             cf.resume_from_checkpoint = resume_from_checkpoint
             if not os.path.exists(cf.fold_dir):
                 os.mkdir(cf.fold_dir)
             train(cf, logger)
             cf.resume_from_checkpoint = None
             if args.mode == 'train_test':
                 test(cf, logger)
 
     elif args.mode == 'test':
         cf = utils.prep_exp(args.dataset_name, args.exp_dir, args.server_env, use_stored_settings=True, is_training=False)
         if args.data_dest is not None:
             cf.data_dest = args.data_dest
         logger = utils.get_logger(cf.exp_dir, cf.server_env)
         data_loader = utils.import_module('data_loader', os.path.join(args.dataset_name, 'data_loader.py'))
         model = utils.import_module('model', cf.model_path)
         logger.info("loaded model from {}".format(cf.model_path))
 
         fold_dirs = sorted([os.path.join(cf.exp_dir, f) for f in os.listdir(cf.exp_dir) if
                      os.path.isdir(os.path.join(cf.exp_dir, f)) and f.startswith("fold")])
         if folds is None:
             folds = range(cf.n_cv_splits)
         if args.dev:
             folds = folds[:2]
             cf.batch_size, cf.max_test_patients, cf.test_n_epochs = 1 if cf.dim==2 else 1, 2, 2
         else:
             torch.backends.cudnn.benchmark = cf.cuda_benchmark
         for fold in folds:
             cf.fold = fold
             cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(cf.fold))
             if cf.fold_dir in fold_dirs:
                 test(cf, logger, max_fold=max([int(f[-1]) for f in fold_dirs]))
             else:
                 logger.info("Skipping fold {} since no model parameters found.".format(fold))
     # load raw predictions saved by predictor during testing, run aggregation algorithms and evaluation.
     elif args.mode == 'analysis':
         """ analyse already saved predictions.
         """
         cf = utils.prep_exp(args.dataset_name, args.exp_dir, args.server_env, use_stored_settings=True, is_training=False)
         logger = utils.get_logger(cf.exp_dir, cf.server_env)
 
         if cf.held_out_test_set and not cf.eval_test_fold_wise:
             predictor = Predictor(cf, net=None, logger=logger, mode='analysis')
             results_list = predictor.load_saved_predictions()
             logger.info('starting evaluation...')
             cf.fold = 0
             evaluator = Evaluator(cf, logger, mode='test')
             evaluator.evaluate_predictions(results_list)
             evaluator.score_test_df(max_fold=0)
         else:
             fold_dirs = sorted([os.path.join(cf.exp_dir, f) for f in os.listdir(cf.exp_dir) if
                          os.path.isdir(os.path.join(cf.exp_dir, f)) and f.startswith("fold")])
             if args.dev:
                 fold_dirs = fold_dirs[:1]
             if folds is None:
                 folds = range(cf.n_cv_splits)
             for fold in folds:
                 cf.fold = fold
                 cf.fold_dir = os.path.join(cf.exp_dir, 'fold_{}'.format(cf.fold))
 
                 if cf.fold_dir in fold_dirs:
                     predictor = Predictor(cf, net=None, logger=logger, mode='analysis')
                     results_list = predictor.load_saved_predictions()
                     # results_list[x][1] is pid, results_list[x][0] is list of len samples-per-patient, each entry hlds
                     # list of boxes per that sample, i.e., len(results_list[x][y][0]) would be nr of boxes in sample y of patient x
                     logger.info('starting evaluation...')
                     evaluator = Evaluator(cf, logger, mode='test')
                     evaluator.evaluate_predictions(results_list)
                     max_fold = max([int(f[-1]) for f in fold_dirs])
                     evaluator.score_test_df(max_fold=max_fold)
                 else:
                     logger.info("Skipping fold {} since no model parameters found.".format(fold))
     else:
         raise ValueError('mode "{}" specified in args is not implemented.'.format(args.mode))
         
     mins, secs = divmod((time.time() - stime), 60)
     h, mins = divmod(mins, 60)
     t = "{:d}h:{:02d}m:{:02d}s".format(int(h), int(mins), int(secs))
     logger.info("{} total runtime: {}".format(os.path.split(__file__)[1], t))
     del logger
     torch.cuda.empty_cache()
 
 
 
diff --git a/setup.py b/setup.py
index 0a64bb7..c659911 100644
--- a/setup.py
+++ b/setup.py
@@ -1,33 +1,56 @@
 #!/usr/bin/env python
 # Copyright 2019 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 
-from distutils.core import setup
-from setuptools import find_packages
+from setuptools import setup, find_packages
 
-req_file = "requirements.txt"
+import os, site
 
-def parse_requirements(filename):
+def parse_requirements(filename, exclude=()):  # requirement lines minus blanks, "#" comments and excluded package names
-    lineiter = (line.strip() for line in open(filename))
-    return [line for line in lineiter if line and not line.startswith("#")]
+    with open(filename) as f:  # close the handle deterministically instead of leaking it to the GC
+        return [line for line in map(str.strip, f) if line and not line.startswith("#") and line.split("==")[0] not in exclude]
+
+def install_custom_ext(setup_path):  # run "<setup_path> install" with the interpreter that is executing this script
+    import subprocess, sys
+    subprocess.check_call([sys.executable, setup_path, "install"])  # raises CalledProcessError on failure (os.system only returned a status)
+
+def clean():
+    """Custom clean command to tidy up the project root."""
+    os.system('rm -vrf ./build ./dist ./*.pyc ./*.tgz ./*.egg-info')
+
+req_file = "requirements.txt"
+custom_exts = ["nms-extension", "RoIAlign-extension-2D", "RoIAlign-extension-3D"]
 
-install_reqs = parse_requirements(req_file)
+install_reqs = parse_requirements(req_file, exclude=custom_exts)
 
-setup(name='model',
-      version='latest',
+setup(name='RegRCNN',
+      version='0.0.2',
       packages=find_packages(exclude=['test', 'test.*']),
       install_requires=install_reqs,
-      dependency_links=[],
-      )
\ No newline at end of file
+      dependency_links=[]
+      )
+
+# recognise newly installed packages in sys.path
+site.main()
+
+custom_exts =  ["custom_extensions/nms", "custom_extensions/roi_align"]
+for path in custom_exts:
+    setup_path = os.path.join(path, "setup.py")
+    try:
+        install_custom_ext(setup_path)
+    except Exception as e:
+        print("FAILED to install custom extension {} due to Error:\n{}".format(path, e))
+
+clean()
\ No newline at end of file
diff --git a/utils/dataloader_utils.py b/utils/dataloader_utils.py
index c838ee6..0d0ca4c 100644
--- a/utils/dataloader_utils.py
+++ b/utils/dataloader_utils.py
@@ -1,655 +1,652 @@
 #!/usr/bin/env python
 # Copyright 2019 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 import plotting as plg
 
 import os
 from multiprocessing import Pool
 import pickle
 import warnings
 
 import numpy as np
 import pandas as pd
 from batchgenerators.transforms.abstract_transforms import AbstractTransform
 from scipy.ndimage.measurements import label as lb
 from torch.utils.data import Dataset as torchDataset
 from batchgenerators.dataloading.data_loader import SlimDataLoaderBase
 
 import utils.exp_utils as utils
 import data_manager as dmanager
 
 
 for msg in ["This figure includes Axes that are not compatible with tight_layout",
             "Data has no positive values, and therefore cannot be log-scaled."]:
     warnings.filterwarnings("ignore", msg)
 
 
 class AttributeDict(dict):
     __getattr__ = dict.__getitem__
     __setattr__ = dict.__setitem__
 
 ##################################
 #  data loading, organisation  #
 ##################################
 
 
 class fold_generator:
     """
     generates splits of indices for a given length of a dataset to perform n-fold cross-validation.
     splits each fold into 3 subsets for training, validation and testing.
     This form of cross validation uses an inner loop test set, which is useful if test scores shall be reported on a
     statistically reliable amount of patients, despite limited size of a dataset.
     If hold out test set is provided and hence no inner loop test set needed, just add test_idxs to the training data in the dataloader.
     This creates straight-forward train-val splits.
     :returns names list: list of len n_splits. each element is a list of len 3 for train_ix, val_ix, test_ix.
     """
     def __init__(self, seed, n_splits, len_data):
         """
         :param seed: Random seed for splits.
         :param n_splits: number of splits, e.g. 5 splits for 5-fold cross-validation
         :param len_data: number of elements in the dataset.
         """
         self.tr_ix = []
         self.val_ix = []
         self.te_ix = []
         self.slicer = None
         self.missing = 0
         self.fold = 0
         self.len_data = len_data
         self.n_splits = n_splits
         self.myseed = seed
         self.boost_val = 0
 
     def init_indices(self):
 
         t = list(np.arange(self.l))
         # round up to next splittable data amount.
         split_length = int(np.ceil(len(t) / float(self.n_splits)))
         self.slicer = split_length
         self.mod = len(t) % self.n_splits
         if self.mod > 0:
             # missing is the number of folds, in which the new splits are reduced to account for missing data.
             self.missing = self.n_splits - self.mod
 
         self.te_ix = t[:self.slicer]
         self.tr_ix = t[self.slicer:]
         self.val_ix = self.tr_ix[:self.slicer]
         self.tr_ix = self.tr_ix[self.slicer:]
 
     def new_fold(self):
 
         slicer = self.slicer
         if self.fold < self.missing :
             slicer = self.slicer - 1
 
         temp = self.te_ix
 
         # catch exception mod == 1: test set collects 1+ data since walk through both roudned up splits.
         # account for by reducing last fold split by 1.
         if self.fold == self.n_splits-2 and self.mod ==1:
             temp += self.val_ix[-1:]
             self.val_ix = self.val_ix[:-1]
 
         self.te_ix = self.val_ix
         self.val_ix = self.tr_ix[:slicer]
         self.tr_ix = self.tr_ix[slicer:] + temp
 
 
     def get_fold_names(self):
         names_list = []
         rgen = np.random.RandomState(self.myseed)
         cv_names = np.arange(self.len_data)
 
         rgen.shuffle(cv_names)
         self.l = len(cv_names)
         self.init_indices()
 
         for split in range(self.n_splits):
             train_names, val_names, test_names = cv_names[self.tr_ix], cv_names[self.val_ix], cv_names[self.te_ix]
             names_list.append([train_names, val_names, test_names, self.fold])
             self.new_fold()
             self.fold += 1
 
         return names_list
 
 
 
 class FoldGenerator():
     r"""takes a set of elements (identifiers) and randomly splits them into the specified amt of subsets.
     """
 
     def __init__(self, identifiers, seed, n_splits=5):
         self.ids = np.array(identifiers)
         self.n_splits = n_splits
         self.seed = seed
 
     def generate_splits(self, n_splits=None):
         if n_splits is None:
             n_splits = self.n_splits
 
         rgen = np.random.RandomState(self.seed)
         rgen.shuffle(self.ids)
         self.splits = list(np.array_split(self.ids, n_splits, axis=0))  # already returns list, but to be sure
         return self.splits
 
 
 class Dataset(torchDataset):
     r"""Parent Class for actual Dataset classes to inherit from!
     """
     def __init__(self, cf, data_sourcedir=None):
         super(Dataset, self).__init__()
         self.cf = cf
 
         self.data_sourcedir = cf.data_sourcedir if data_sourcedir is None else data_sourcedir
         self.data_dir = cf.data_dir if hasattr(cf, 'data_dir') else self.data_sourcedir
 
         self.data_dest = cf.data_dest if hasattr(cf, "data_dest") else self.data_sourcedir
 
         self.data = {}
         self.set_ids = []
 
     def copy_data(self, cf, file_subset, keep_packed=False, del_after_unpack=False):
         if os.path.normpath(self.data_sourcedir) != os.path.normpath(self.data_dest):
             self.data_sourcedir = os.path.join(self.data_sourcedir, '')
             args = AttributeDict({
                     "source" :  self.data_sourcedir,
                     "destination" : self.data_dest,
                     "recursive" : True,
                     "cp_only_npz" : False,
                     "keep_packed" : keep_packed,
                     "del_after_unpack" : del_after_unpack,
                     "threads" : 16 if self.cf.server_env else os.cpu_count()
                     })
             dmanager.copy(args, file_subset=file_subset)
             self.data_dir = self.data_dest
 
 
 
     def __len__(self):
         return len(self.data)
     def __getitem__(self, id):
         """Return a sample of the dataset, i.e.,the dict of the id
         """
         return self.data[id]
     def __iter__(self):
         return self.data.__iter__()
 
     def init_FoldGenerator(self, seed, n_splits):
         self.fg = FoldGenerator(self.set_ids, seed=seed, n_splits=n_splits)
 
     def generate_splits(self, check_file):
         if not os.path.exists(check_file):
             self.fg.generate_splits()
             with open(check_file, 'wb') as handle:
                 pickle.dump(self.fg.splits, handle)
         else:
             with open(check_file, 'rb') as handle:
                 self.fg.splits = pickle.load(handle)
 
     def calc_statistics(self, subsets=None, plot_dir=None, overall_stats=True):
 
         if self.df is None:
             self.df = pd.DataFrame()
             balance_t = self.cf.balance_target if hasattr(self.cf, "balance_target") else "class_targets"
             self.df._metadata.append(balance_t)
             if balance_t=="class_targets":
                 mapper = lambda cl_id: self.cf.class_id2label[cl_id]
                 labels = self.cf.class_id2label.values()
             elif balance_t=="rg_bin_targets":
                 mapper = lambda rg_bin: self.cf.bin_id2label[rg_bin]
                 labels = self.cf.bin_id2label.values()
             # elif balance_t=="regression_targets":
             #     # todo this wont work
             #     mapper = lambda rg_val: AttributeDict({"name":rg_val}) #self.cf.bin_id2label[self.cf.rg_val_to_bin_id(rg_val)]
             #     labels = self.cf.bin_id2label.values()
             elif balance_t=="lesion_gleasons":
                 mapper = lambda gs: self.cf.gs2label[gs]
                 labels = self.cf.gs2label.values()
             else:
                 mapper = lambda x: AttributeDict({"name":x})
                 labels = None
             for pid, subj_data in self.data.items():
                 unique_ts, counts = np.unique(subj_data[balance_t], return_counts=True)
                 self.df = self.df.append(pd.DataFrame({"pid": [pid],
                                                        **{mapper(unique_ts[i]).name: [counts[i]] for i in
                                                           range(len(unique_ts))}}), ignore_index=True, sort=True)
             self.df = self.df.fillna(0)
 
         if overall_stats:
             df = self.df.drop("pid", axis=1)
             df = df.reindex(sorted(df.columns), axis=1).astype('uint32')
             print("Overall dataset roi counts per target kind:"); print(df.sum())
         if subsets is not None:
             self.df["subset"] = np.nan
             self.df["display_order"] = np.nan
             for ix, (subset, pids) in enumerate(subsets.items()):
                 self.df.loc[self.df.pid.isin(pids), "subset"] = subset
                 self.df.loc[self.df.pid.isin(pids), "display_order"] = ix
             df = self.df.groupby("subset").agg("sum").drop("pid", axis=1, errors='ignore').astype('int64')
             df = df.sort_values(by=['display_order']).drop('display_order', axis=1)
             df = df.reindex(sorted(df.columns), axis=1)
 
             print("Fold {} dataset roi counts per target kind:".format(self.cf.fold)); print(df)
         if plot_dir is not None:
             os.makedirs(plot_dir, exist_ok=True)
             if subsets is not None:
                 plg.plot_fold_stats(self.cf, df, labels, os.path.join(plot_dir, "data_stats_fold_" + str(self.cf.fold))+".pdf")
             if overall_stats:
                 plg.plot_data_stats(self.cf, df, labels, os.path.join(plot_dir, 'data_stats_overall.pdf'))
 
         return df, labels
 
 
 def get_class_balanced_patients(all_pids, class_targets, batch_size, num_classes, random_ratio=0):
     '''
     Greedily pick batch_size patient ids so that the per-class RoI counts of the batch move towards equilibrium
     (on basis of total RoI counts). For highly imbalanced datasets, this might be a too strong requirement.
     :param all_pids: sequence of eligible patient ids; every id is used as a key into class_targets.
     :param class_targets: dic holding {patient_specifier : ROI class targets}, list position of ROI target corresponds to respective seg label - 1
         (assumes each value supports element-wise "== class_id", e.g. an ndarray -- TODO confirm against caller).
     :param batch_size: number of patient ids to return.
     :param num_classes: number of foreground classes; class ids 1..num_classes are counted.
     :param random_ratio: for the first int(batch_size * random_ratio) picks, the first candidate holding the
         currently rarest class is accepted without the additional balance check.
     :return: np.ndarray of shape (batch_size,) holding the picked patient ids.
     '''
     # assert len(all_pids)>=batch_size, "not enough eligible pids {} to form a single batch of size {}".format(len(all_pids), batch_size)
     # running RoI count per class over the patients picked so far
     class_counts = {k: 0 for k in range(1,num_classes+1)}
     not_picked = np.array(all_pids)
     batch_patients = np.empty((batch_size,), dtype=not_picked.dtype)
     # nothing has been counted yet, so the class to boost first is drawn at random
     rarest_class = np.random.randint(1,num_classes+1)
 
     for ix in range(batch_size):
         if len(not_picked) == 0:
             warnings.warn("Dataset too small to generate batch with unique samples; => recycling.")
             not_picked = np.array(all_pids)
 
         np.random.shuffle(not_picked) #this could actually go outside(above) the loop.
         # fallback pick if no candidate holds the rarest class at all
         pick = not_picked[0]
         for cand in not_picked:
             if np.count_nonzero(class_targets[cand] == rarest_class) > 0:
                 # candidate holds the rarest class: remember it (the last such candidate wins if none breaks the loop)
                 pick = cand
                 cand_rarest_class = np.argmin([np.count_nonzero(class_targets[cand] == cl) for cl in
                                                range(1,num_classes+1)])+1
                 # if current batch already bigger than the batch random ratio, then
                 # check that weakest class in this patient is not the weakest in current batch (since needs to be boosted)
                 # also that at least one roi of this patient belongs to weakest class. If True, keep patient, else keep looking.
                 if (cand_rarest_class != rarest_class and np.count_nonzero(class_targets[cand] == rarest_class) > 0) \
                         or ix < int(batch_size * random_ratio):
                     break
 
         for c in range(1,num_classes+1):
             class_counts[c] += np.count_nonzero(class_targets[pick] == c)
         if not ix < int(batch_size * random_ratio) and class_counts[rarest_class] == 0:  # means searched thru whole set without finding rarest class
             print("Class {} not represented in current dataset.".format(rarest_class))
         # re-determine the class to boost from the updated batch counts
         rarest_class = np.argmin(([class_counts[c] for c in range(1,num_classes+1)]))+1
         batch_patients[ix] = pick
         not_picked = not_picked[not_picked != pick]  # removes pick
 
     return batch_patients
 
 
 class BatchGenerator(SlimDataLoaderBase):
     """
     create the training/validation batch generator. Randomly sample batch_size patients
     from the data set, (draw a random slice if 2D), pad-crop them to equal sizes and merge to an array.
     :param data: data dictionary as provided by 'load_dataset'
     :param img_modalities: list of strings ['adc', 'b1500'] from config
     :param batch_size: number of patients to sample for the batch
     :param pre_crop_size: equal size for merging the patients to a single array (before the final random-crop in data aug.)
     :return dictionary containing the batch data / seg / pids as lists; the augmenter will later concatenate them into an array.
     """
 
     def __init__(self, cf, data, n_batches=None):
         """Store the config, set up sampling statistics and collect the per-patient balance targets.

         :param cf: config object; reads batch_size, plot_dir, num_classes and (optionally) balance_target.
         :param data: data dictionary {pid: patient data}; forwarded to the base class.
         :param n_batches: forwarded to the base class.
         """
         super().__init__(data, cf.batch_size, n_batches)
         self.cf = cf
         self.plot_dir = os.path.join(self.cf.plot_dir, 'train_generator')
 
         pids = list(self._data.keys())
         self.dataset_length = len(self._data)
         self.dataset_pids = pids
         # intentionally the very same list object as dataset_pids
         self.eligible_pids = pids
 
         roi_counts = np.zeros((self.cf.num_classes,), dtype='uint32')
         self.stats = {"roi_counts": roi_counts, "empty_samples_count": 0}
 
         # WARNING: "balance targets are only implemented for 1-d targets (or 1-component vectors)"
         self.balance_target = getattr(cf, "balance_target", "class_targets")
         self.targets = {pid: subj_data[self.balance_target] for pid, subj_data in self._data.items()}
 
     def balance_target_distribution(self, plot=False):
         """
         Compute a sampling probability per patient from the inverse frequency of the balance targets.
         Reads self.targets (dic holding {patient_specifier : patient-wise ROI targets}); stores the result in
         self.p_probs and the sorted unique foreground targets in self.unique_ts.
         :param plot: if True, plot the distribution to self.plot_dir via plg.plot_batchgen_distribution.
         :return: probability distribution over all pids (same order as self.targets). draw without replace from this.
         """
         # get unique foreground targets per patient, assign -1 to an "empty" patient (has no foreground)
         patient_ts = [np.unique(lst) if len([t for t in lst if np.any(t>0)])>0 else [-1] for lst in self.targets.values()]
         #bg_mask = np.array([np.all(lst == [-1]) for lst in patient_ts])
         # count in how many patients each target value occurs (empty patients excluded)
         unique_ts, t_counts = np.unique([t for lst in patient_ts for t in lst if t!=-1], return_counts=True)
         # inverse-frequency weight per target, normalized to sum to 1
         t_probs = t_counts.sum() / t_counts
         t_probs /= t_probs.sum()
         t_probs = {t : t_probs[ix] for ix, t in enumerate(unique_ts)}
         # empty patients get zero probability
         t_probs[-1] = 0.
         # fail if balance target is not a number (i.e., a vector)
         # a patient's weight is the weight of its rarest (highest-weighted) target
         self.p_probs = np.array([ max([t_probs[t] for t in lst]) for lst in patient_ts ])
         #normalize
         # NOTE(review): if every patient is empty the sum is 0 and this division yields nan -- confirm callers
         self.p_probs /= self.p_probs.sum()
         # rescale probs of empty samples
         # if not 0 == self.p_probs[bg_mask].shape[0]:
         #     #rescale_f = (1 - self.cf.empty_samples_ratio) / self.p_probs[~bg_mask].sum()
         #     rescale_f = 1 / self.p_probs[~bg_mask].sum()
         #     self.p_probs *= rescale_f
         #     self.p_probs[bg_mask] = 0. #self.cf.empty_samples_ratio/self.p_probs[bg_mask].shape[0]
 
         self.unique_ts = unique_ts
 
         if plot:
             os.makedirs(self.plot_dir, exist_ok=True)
             plg.plot_batchgen_distribution(self.cf, self.dataset_pids, self.p_probs, self.balance_target,
                                            out_file=os.path.join(self.plot_dir,
                                                                  "train_gen_distr_"+str(self.cf.fold)+".png"))
         return self.p_probs
 
 
     def generate_train_batch(self):
         """Placeholder to be overridden by child classes; the base implementation returns None.

         Everything done in an override is per batch. Print statements in there get confusing due to
         multithreading.
         """
         return None
 
     def print_stats(self, logger=None, file=None, plot_file=None, plot=True):
         """Report per-target roi counts and the share of empty samples from self.stats; optionally plot them.

         Reads self.unique_ts, which is set by balance_target_distribution.
         :param logger: forwarded to utils.CombinedPrinter.
         :param file: forwarded to utils.CombinedPrinter.
         :param plot_file: output path of the plot; if None, a default path inside self.plot_dir is used.
         :param plot: if True, call plg.plot_batchgen_stats.
         """
         print_f = utils.CombinedPrinter(logger, file)
 
-        print_f('\nFinal Training Stats\n')
-        print_f('*********************\n')
+        print_f('\n***Final Training Stats***')
         total_count = np.sum(self.stats['roi_counts'])
         for tix, count in enumerate(self.stats['roi_counts']):
             #name = self.cf.class_dict[tix] if self.balance_target=="class_targets" else str(self.unique_ts[tix])
             name=str(self.unique_ts[tix])
-            print_f('{}: {} rois seen ({:.1f}%).\n'.format(name, count, count / total_count * 100))
+            print_f('{}: {} rois seen ({:.1f}%).'.format(name, count, count / total_count * 100))
         # expected number of samples drawn over the whole training, used as denominator for the empty-sample share
         total_samples = self.cf.num_epochs*self.cf.num_train_batches*self.cf.batch_size
         print_f('empty samples seen: {} ({:.1f}%).\n'.format(self.stats['empty_samples_count'],
                                                          self.stats['empty_samples_count']/total_samples*100))
         if plot:
             if plot_file is None:
                 plot_file = os.path.join(self.plot_dir, "train_gen_stats_{}.png".format(self.cf.fold))
                 os.makedirs(self.plot_dir, exist_ok=True)
             plg.plot_batchgen_stats(self.cf, self.stats, self.balance_target, self.unique_ts, plot_file)
 
 class PatientBatchIterator(SlimDataLoaderBase):
     """
     Val/test generator: steps through the dataset and returns dictionaries per patient.
     2D is a special case of 3D patching with patch_size[2] == 1 (slices).
     Creates whole patient batch and targets, and - if necessary - patchwise batch and targets.
     Patient targets are appended in any case for evaluation.
     For patching, all patches are shifted into the batch dimension; batch_tiling_forward takes care of exceeding
     batch dimensions.
 
     This iterator/these batches are not intended to go through MTaugmenter afterwards.
     """
 
     def __init__(self, cf, data):
         """Register the config and the patient ids of ``data``; the base class is given batch size 0."""
         super().__init__(data, 0)
         self.cf = cf
 
         self.dataset_pids = list(self._data.keys())
         self.dataset_length = len(self._data)
 
     def generate_train_batch(self, pid=None):
         """Placeholder to be overridden by child classes; the base implementation returns None."""
         return None
 
 ###################################
 #  transforms, image manipulation #
 ###################################
 
 def get_patch_crop_coords(img, patch_size, min_overlap=30):
     """
     Compute crop coordinates that tile an image with (overlapping) patches.
     :param img: array with axes (y, x, (z)); only img.shape is read.
     :param patch_size: list of len 2 (2D) or 3 (3D).
     :param min_overlap: minimum required overlap of patches.
         If too small, some areas are poorly represented only at edges of single patches.
     :return: int ndarray of shape (n_patches, 2*dim); rows are [ymin, ymax, xmin, xmax(, zmin, zmax)].
     """
     per_axis_spans = []
     for axis, extent in enumerate(img.shape):
         size = patch_size[axis]
         count = int(np.ceil(extent / size))
 
         # a single patch covers this axis: use the full image extent
         if count == 1:
             per_axis_spans.append([(0, extent)])
             continue
 
         # outermost patches touch the image borders, the remaining centers are spread evenly in between
         spacing = (extent - size) / (count - 1)
         if (size - spacing) < min_overlap:
             count += 1
             spacing = (extent - size) / (count - 1)
 
         centers = np.round([(size / 2 + (spacing * k)) for k in range(count)])
         per_axis_spans.append([(c - size / 2, c + size / 2) for c in centers])
 
     n_axes = len(per_axis_spans)
     if n_axes == 3 and patch_size[2] > 1:
         z_spans = per_axis_spans[2]
     elif n_axes == 3 and patch_size[2] == 1:
         # slice-wise patching: one patch per z-slice
         z_spans = [(z, z + 1) for z in range(img.shape[2])]
     else:
         z_spans = None
 
     rows = []
     for ymin, ymax in per_axis_spans[0]:
         for xmin, xmax in per_axis_spans[1]:
             if z_spans is None:
                 rows.append([ymin, ymax, xmin, xmax])
             else:
                 rows.extend([ymin, ymax, xmin, xmax, zmin, zmax] for zmin, zmax in z_spans)
     return np.array(rows).astype(int)
 
-
-
 def pad_nd_image(image, new_shape=None, mode="edge", kwargs=None, return_slicer=False, shape_must_be_divisible_by=None):
     """
     Pad the trailing axes of an nd image symmetrically up to a minimum shape (original by Fabian Isensee).
 
     :param image: nd array.
     :param new_shape: minimum target shape of the last len(new_shape) axes. Axes that are already at least as large
     are left untouched (never cropped). Examples:
     image.shape = (10, 1, 512, 512); new_shape = (768, 768) -> result: (10, 1, 768, 768)
     image.shape = (10, 1, 512, 512); new_shape = (364, 768) -> result: (10, 1, 512, 768)
     If None, shape_must_be_divisible_by must be a list/tuple/ndarray and determines which axes are padded.
     :param mode: see np.pad for documentation
     :param kwargs: see np.pad for documentation
     :param return_slicer: if True, additionally return a list of slices that crops the result back to the
     original shape
     :param shape_must_be_divisible_by: number or one entry per padded axis. After applying new_shape, each padded
     axis is enlarged to the next multiple of this value (so the result may be larger than new_shape).
     :return: padded array, or (padded array, slicer) if return_slicer is True.
     """
     pad_kwargs = {} if kwargs is None else kwargs
 
     if new_shape is None:
         assert shape_must_be_divisible_by is not None
         assert isinstance(shape_must_be_divisible_by, (list, tuple, np.ndarray))
         new_shape = image.shape[-len(shape_must_be_divisible_by):]
         old_shape = new_shape
     else:
         old_shape = np.array(image.shape[-len(new_shape):])
 
     n_untouched_axes = len(image.shape) - len(new_shape)
 
     # interpret new_shape as a minimum shape: never go below the current extent
     new_shape = np.array([max(new_shape[ax], old_shape[ax]) for ax in range(len(new_shape))])
 
     if shape_must_be_divisible_by is not None:
         if isinstance(shape_must_be_divisible_by, (list, tuple, np.ndarray)):
             assert len(shape_must_be_divisible_by) == len(new_shape)
         else:
             shape_must_be_divisible_by = [shape_must_be_divisible_by] * len(new_shape)
 
         # round every extent up to the next multiple of its divisor (extents that already divide stay as they are)
         new_shape = np.array([
             new_shape[ax] + (shape_must_be_divisible_by[ax] - new_shape[ax] % shape_must_be_divisible_by[ax])
             % shape_must_be_divisible_by[ax]
             for ax in range(len(new_shape))
         ])
 
     difference = new_shape - old_shape
     pad_before = difference // 2
     # an odd difference puts the extra element at the end of the axis
     pad_after = difference // 2 + difference % 2
     pad_list = [[0, 0]] * n_untouched_axes + [list(pair) for pair in zip(pad_before, pad_after)]
     padded = np.pad(image, pad_list, mode, **pad_kwargs)
     if return_slicer:
         bounds = np.array(pad_list)
         # turn (pad_before, pad_after) into (start, stop) indices of the original content
         bounds[:, 1] = np.array(padded.shape) - bounds[:, 1]
         return padded, [slice(*b) for b in bounds]
     return padded
 
 def convert_seg_to_bounding_box_coordinates(data_dict, dim, roi_item_keys, get_rois_from_seg=False,
                                                 class_specific_seg=False):
     '''adapted from batchgenerators
 
     Derive bounding boxes, per-roi masks and per-roi items from the segmentation of every batch element.
     data_dict is modified in place (keys 'bb_target', 'roi_masks', 'seg' and all roi_item_keys are written) and
     also returned.
 
     :param data_dict: seg: segmentation with labels indicating roi_count (get_rois_from_seg=False) or classes (get_rois_from_seg=True),
         class_targets: list where list index corresponds to roi id (roi_count)
     :param dim: if 3, the box gets two additional coordinates taken from axis 3 of the roi mask.
     :param roi_item_keys: keys of the roi-wise items in data_dict to process
     :param get_rois_from_seg: if True, the number of rois is obtained from lb(seg) and class_targets of the batch
         element is replicated once per roi.
     :param class_specific_seg: if True, the output seg holds the class target of each roi, otherwise foreground
         is set to 1.
     :return: data_dict; box coords are (y1,x1,y2,x2 (,z1,z2)), i.e. min-1 / max+1 of the roi's indices.
     '''
 
     bb_target = []
     roi_masks = []
     roi_items = {name:[] for name in roi_item_keys}
     out_seg = np.copy(data_dict['seg'])
     for b in range(data_dict['seg'].shape[0]):
 
         p_coords_list = [] #p for patient?
         p_roi_masks_list = []
         p_roi_items_lists = {name:[] for name in roi_item_keys}
 
         if np.sum(data_dict['seg'][b] != 0) > 0:
             if get_rois_from_seg:
                 # NOTE(review): `clusters` is never used below; rois are still taken from the raw seg labels --
                 # confirm whether the labelled clusters were meant to be used instead.
                 clusters, n_cands = lb(data_dict['seg'][b])
                 data_dict['class_targets'][b] = [data_dict['class_targets'][b]] * n_cands
             else:
                 n_cands = int(np.max(data_dict['seg'][b]))
 
             # one binary mask per roi id 1..n_cands
             rois = np.array(
                 [(data_dict['seg'][b] == ii) * 1 for ii in range(1, n_cands + 1)], dtype='uint8')  # separate clusters
 
             for rix, r in enumerate(rois):
                 if np.sum(r != 0) > 0:  # check if the roi survived slicing (3D->2D) and data augmentation (cropping etc.)
                     seg_ixs = np.argwhere(r != 0)
                     # axis 0 of the mask is skipped (presumably the channel axis); box is enlarged by 1 on each side
                     coord_list = [np.min(seg_ixs[:, 1]) - 1, np.min(seg_ixs[:, 2]) - 1, np.max(seg_ixs[:, 1]) + 1,
                                   np.max(seg_ixs[:, 2]) + 1]
                     if dim == 3:
                         coord_list.extend([np.min(seg_ixs[:, 3]) - 1, np.max(seg_ixs[:, 3]) + 1])
 
                     p_coords_list.append(coord_list)
                     p_roi_masks_list.append(r)
                     # add background class = 0. rix is a patient wide index of lesions. since 'class_targets' is
                     # also patient wide, this assignment is not dependent on patch occurrences.
                     for name in roi_item_keys:
                         # if name == "class_targets":
                         #     # add background class = 0. rix is a patient-wide index of lesions. since 'class_targets' is
                         #     # also patient wide, this assignment is not dependent on patch occurrences.
                         #     p_roi_items_lists[name].append(data_dict[name][b][rix]+1)
                         # else:
                         p_roi_items_lists[name].append(data_dict[name][b][rix])
 
                     # class targets are expected to be foreground (>= 1) for every surviving roi
                     assert data_dict["class_targets"][b][rix]>=1, "convertsegtobbox produced bg roi w cl targ {} and unique roi seg {}".format(data_dict["class_targets"][b][rix], np.unique(r))
 
 
                 if class_specific_seg:
                     out_seg[b][data_dict['seg'][b] == rix + 1] = data_dict['class_targets'][b][rix] #+ 1
 
             if not class_specific_seg:
                 # binary foreground/background segmentation
                 out_seg[b][data_dict['seg'][b] > 0] = 1
 
             bb_target.append(np.array(p_coords_list))
             roi_masks.append(np.array(p_roi_masks_list))
             for name in roi_item_keys:
                 roi_items[name].append(np.array(p_roi_items_lists[name]))
 
 
         else:
             # batch element without any foreground: empty boxes/items and a single all-zero mask
             bb_target.append([])
             roi_masks.append(np.zeros_like(data_dict['seg'][b], dtype='uint8')[None])
             for name in roi_item_keys:
                 roi_items[name].append(np.array([]))
 
     if get_rois_from_seg:
         data_dict.pop('class_targets', None)
 
     # NOTE(review): per-element arrays may differ in length; recent numpy versions refuse to build such ragged
     # arrays without dtype=object -- verify against the numpy version in use.
     data_dict['bb_target'] = np.array(bb_target)
     data_dict['roi_masks'] = np.array(roi_masks)
     data_dict['seg'] = out_seg
     for name in roi_item_keys:
         data_dict[name] = np.array(roi_items[name])
 
 
     return data_dict
 
 class ConvertSegToBoundingBoxCoordinates(AbstractTransform):
     """Transform wrapper around convert_seg_to_bounding_box_coordinates: turns segmentation masks into
     bounding box coordinates.
     """
 
     def __init__(self, dim, roi_item_keys, get_rois_from_seg=False, class_specific_seg=False):
         """Store the settings that are forwarded to convert_seg_to_bounding_box_coordinates on every call."""
         self.dim = dim
         self.roi_item_keys = roi_item_keys
         self.get_rois_from_seg = get_rois_from_seg
         self.class_specific_seg = class_specific_seg
 
     def __call__(self, **data_dict):
         """Apply the conversion to the batch given as keyword arguments and return the updated dict."""
         return convert_seg_to_bounding_box_coordinates(
             data_dict,
             dim=self.dim,
             roi_item_keys=self.roi_item_keys,
             get_rois_from_seg=self.get_rois_from_seg,
             class_specific_seg=self.class_specific_seg,
         )
 
 
 
 
 
 #############################
 #  data packing / unpacking # not used, data_manager.py used instead
 #############################
 
 def get_case_identifiers(folder):
     """Return the names of all entries in folder that end with "npz", with their last four characters removed."""
     identifiers = []
     for entry in os.listdir(folder):
         if entry.endswith("npz"):
             identifiers.append(entry[:-4])
     return identifiers
 
 
 def convert_to_npy(npz_file):
     """Extract the 'data' array of an .npz archive into a sibling .npy file, unless that file already exists.
 
     :param npz_file: path to the archive; the last three characters are replaced by "npy" for the output path.
     """
     npy_file = npz_file[:-3] + "npy"
     if os.path.isfile(npy_file):
         return
     # np.load on an .npz archive returns a file-backed NpzFile; close it deterministically instead of leaking
     # the handle (matters when many files are converted by pool workers).
     with np.load(npz_file) as archive:
         data = archive['data']
     np.save(npy_file, data)
 
 
 def unpack_dataset(folder, threads=8):
     """Convert every .npz case in folder to .npy using a pool of ``threads`` worker processes."""
     npz_paths = []
     for case in get_case_identifiers(folder):
         npz_paths.append(os.path.join(folder, case + ".npz"))
     workers = Pool(threads)
     workers.map(convert_to_npy, npz_paths)
     workers.close()
     workers.join()
 
 
 def delete_npy(folder):
     """Remove the .npy counterpart of every .npz case in folder, where such a file exists."""
     existing = []
     for case in get_case_identifiers(folder):
         candidate = os.path.join(folder, case + ".npy")
         if os.path.isfile(candidate):
             existing.append(candidate)
     for npy_path in existing:
         os.remove(npy_path)
\ No newline at end of file
diff --git a/utils/exp_utils.py b/utils/exp_utils.py
index 26a3485..68de0d3 100644
--- a/utils/exp_utils.py
+++ b/utils/exp_utils.py
@@ -1,630 +1,629 @@
 #!/usr/bin/env python
 # Copyright 2019 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 #import plotting as plg
 
 import sys
 import os
 import subprocess
 import threading
 import pickle
 import importlib.util
 import psutil
-from functools import partial
 import time
 
 import logging
-from tensorboardX import SummaryWriter
+from torch.utils.tensorboard import SummaryWriter
 
 from collections import OrderedDict
 import numpy as np
 import pandas as pd
 import torch
 
 
 def import_module(name, path):
     """
     Load a python source file as a module at runtime (python 3 way of dynamic importing).
     :param name: name given to module instance.
     :param path: path to module.
     :return: the executed module instance.
     """
     module_spec = importlib.util.spec_from_file_location(name, path)
     loaded = importlib.util.module_from_spec(module_spec)
     # executing the spec's loader runs the module's top-level code inside `loaded`
     module_spec.loader.exec_module(loaded)
     return loaded
 
 def save_obj(obj, name):
     """Pickle ``obj`` to the file ``name + '.pkl'`` using the highest available pickle protocol."""
     target = name + '.pkl'
     with open(target, 'wb') as handle:
         pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
 
 def load_obj(file_path):
     """Unpickle and return the object stored at ``file_path``."""
     # NOTE: unpickling executes arbitrary code embedded in the file; only use on trusted files.
     with open(file_path, 'rb') as source:
         obj = pickle.load(source)
     return obj
 
 def IO_safe(func, *args, _tries=5, _raise=True, **kwargs):
     """ Wrapper calling function func with arguments args and keyword arguments kwargs to catch input/output errors
         on cluster. func is re-executed after an OSError until it succeeds or _tries attempts were made.
     :param func: function to execute (intended to be read/write operation to a problematic cluster drive, but can be
         any function).
     :param args: positional args of func.
     :param kwargs: kw args of func.
     :param _tries: how many attempts to make executing func.
     :param _raise: if True, the OSError of the last failed attempt is re-raised; if False, the error is only
         printed and None is returned after the last failed attempt.
     :return: return value of func, or None if every attempt failed and _raise is False.
     """
     for _try in range(_tries):
         try:
             return func(*args, **kwargs)
         except OSError as e:  # to catch cluster issues with network drives
             is_last_try = _try == _tries - 1
             if _raise and is_last_try:
                 # retries exhausted: propagate the original error including its traceback
                 raise
             print("After attempting execution {} time{}, following error occurred:\n{}".format(_try+1,"" if _try==0 else "s", e))
     return None
 
 
 def query_nvidia_gpu(device_id, d_keyword=None, no_units=False):
     """
     Query ``nvidia-smi -q`` for one device and parse its output into a nested dict.
     :param device_id: passed to nvidia-smi via ``-i``.
     :param d_keyword: -d, --display argument (keyword(s) for selective display), all are selected if None
     :param no_units: if True, only the first whitespace-separated token of every value is kept.
     :return: dict of gpu-info items: {section header: {key: value}}, everything lower-cased; header spaces are
         replaced by underscores. Lines that do not split into exactly one "key: value" pair are skipped.
     """
     cmd = ['nvidia-smi', '-i', str(device_id), '-q']
     if d_keyword is not None:
         cmd += ['-d', d_keyword]
     outp = subprocess.check_output(cmd).strip().decode('utf-8').split("\n")
     outp = [x for x in outp if len(x)>0]
     # lines without any ":" are section headers; the appended len(outp) closes the last section
     headers = [ix for ix, item in enumerate(outp) if len(item.split(":"))==1] + [len(outp)]
 
     out_dict = {}
     for start, stop in zip(headers[:-1], headers[1:]):
         head = outp[start].strip().replace(" ", "_").lower()
         section = out_dict[head] = {}
         for line in outp[start:stop]:
             try:
                 key, val = [x.strip().lower() for x in line.split(":")]
                 if no_units:
                     val = val.split()[0]
             except (ValueError, IndexError):
                 # ValueError: not exactly one ":" (header line, time stamps, ...); IndexError: empty value
                 continue
             section[key] = val
 
     return out_dict
 
 class CombinedPrinter(object):
     """Print function that fans a string out to several sinks.
 
     Sinks are logger.info and/or file.write if given; the builtin print is used if neither is given.
     """
     def __init__(self, logger=None, file=None):
         sinks = []
         if logger is not None:
             sinks.append(logger.info)
         if file is not None:
             sinks.append(file.write)
         # fall back to plain printing when no sink was supplied
         self.out = sinks if sinks else [print]
 
     def __call__(self, string):
         for sink in self.out:
             sink(string)
 
 class Nvidia_GPU_Logger(object):
     """Samples GPU utilization via the ``nvidia-settings`` command line tool, optionally in a background thread."""
     def __init__(self):
         # optional cap on the number of samples taken by loop(); None means sample without limit
         self.count = None
         # most recent sample taken by loop()
         self.current_vals = None
 
     def get_vals(self):
         """Query nvidia-settings once.
 
         :return: dict with keys gpu_mem_alloc, gpu_graphics_util (int), gpu_mem_util and time.
         """
         cmd = ['nvidia-settings', '-t', '-q', 'GPUUtilization']
         gpu_util = subprocess.check_output(cmd).strip().decode('utf-8').split(",")
         gpu_util = dict([f.strip().split("=") for f in gpu_util])
         cmd[-1] = 'UsedDedicatedGPUMemory'
         gpu_used_mem = subprocess.check_output(cmd).strip().decode('utf-8')
         current_vals = {"gpu_mem_alloc": gpu_used_mem, "gpu_graphics_util": int(gpu_util['graphics']),
                              "gpu_mem_util": gpu_util['memory'], "time": time.time()}
         return current_vals
 
     def loop(self):
         """Sample every self.interval seconds into self.log; stops after self.count samples if count is set."""
         i = 0
         while True:
             # keep the sample: get_vals() only returns it, it does not store anything on the instance
             self.current_vals = self.get_vals()
             self.log["time"].append(time.time())
             self.log["gpu_util"].append(self.current_vals["gpu_graphics_util"])
             if self.count is not None:
                 i += 1
                 if i == self.count:
                     return
             time.sleep(self.interval)
 
     def start(self, interval=1.):
         """Reset the log and, unless interval is None, start sampling in a daemon thread."""
         self.interval = interval
         self.start_time = time.time()
         self.log = {"time": [], "gpu_util": []}
         if self.interval is not None:
             thread = threading.Thread(target=self.loop)
             thread.daemon = True
             thread.start()
 
 class CombinedLogger(object):
     """Combine console and tensorboard logger and record system metrics.
     """
     def __init__(self, name, log_dir, server_env=True, fold="", sysmetrics_interval=2):
         """Create the console logger and tensorboard writer; start system-metrics recording if not on a server.
 
         :param name: name of the python logger.
         :param log_dir: directory handed to the tensorboard SummaryWriter.
         :param server_env: if True, no system metrics are recorded.
         :param fold: stored as self.fold.
         :param sysmetrics_interval: forwarded to sysmetrics_start.
         """
         self.pylogger = logging.getLogger(name)
         self.tboard = SummaryWriter(log_dir=log_dir)
         self.times = {}
         self.fold = fold
         if server_env:
             return
 
         # monitor system metrics (cpu, mem, ...)
         base_columns = ["global_step", "rel_time", r"CPU (%)", "mem_used (GB)", r"mem_used (%)",
                         r"swap_used (GB)", r"gpu_utilization (%)"]
         self.sysmetrics = pd.DataFrame(columns=base_columns, dtype="float16")
         # two additional columns per visible cuda device: allocated, then cached memory
         for device in range(torch.cuda.device_count()):
             device_name = torch.cuda.get_device_name(device)
             self.sysmetrics["mem_allocd (GB) by torch on {:10s}".format(device_name)] = np.nan
             self.sysmetrics["mem_cached (GB) by torch on {:10s}".format(device_name)] = np.nan
         self.sysmetrics_start(sysmetrics_interval)
 
     def __getattr__(self, attr):
         """delegate all undefined method requests to objects of
         this class in order pylogger, tboard (first find first serve).
         E.g., combinedlogger.add_scalars(...) should trigger self.tboard.add_scalars(...)
         """
         for obj in [self.pylogger, self.tboard]:
             if attr in dir(obj):
                 return getattr(obj, attr)
         raise AttributeError("CombinedLogger has no attribute {}".format(attr))
 
 
     def time(self, name, toggle=None):
         """record time-spans as with a stopwatch.
         :param name:
         :param toggle: True^=On: start time recording, False^=Off: halt rec. if None determine from current status.
         :return: either start-time or last recorded interval
         """
         if toggle is None:
             if name in self.times.keys():
                 toggle = not self.times[name]["toggle"]
             else:
                 toggle = True
 
         if toggle:
             if not name in self.times.keys():
                 self.times[name] = {"total": 0, "last":0}
             elif self.times[name]["toggle"] == toggle:
                 print("restarting running stopwatch")
             self.times[name]["last"] = time.time()
             self.times[name]["toggle"] = toggle
             return time.time()
         else:
             if toggle == self.times[name]["toggle"]:
                 self.info("WARNING: tried to stop stopped stop watch: {}.".format(name))
             self.times[name]["last"] = time.time()-self.times[name]["last"]
             self.times[name]["total"] += self.times[name]["last"]
             self.times[name]["toggle"] = toggle
             return self.times[name]["last"]
 
     def get_time(self, name=None, kind="total", format=None, reset=False):
         """
         :param name:
         :param kind: 'total' or 'last'
         :param format: None for float, "hms"/"ms" for (hours), mins, secs as string
         :param reset: reset time after retrieving
         :return:
         """
         if name is None:
             times = self.times
             if reset:
                 self.reset_time()
             return times
 
         else:
             time = self.times[name][kind]
             if format == "hms":
                 m, s = divmod(time, 60)
                 h, m = divmod(m, 60)
                 time = "{:d}h:{:02d}m:{:02d}s".format(int(h), int(m), int(s))
             elif format == "ms":
                 m, s = divmod(time, 60)
                 time = "{:02d}m:{:02d}s".format(int(m), int(s))
             if reset:
                 self.reset_time(name)
             return time
 
     def reset_time(self, name=None):
         if name is None:
             self.times = {}
         else:
             del self.times[name]
 
 
     def sysmetrics_update(self, global_step=None):
         """Append one row of current system metrics to self.sysmetrics.
         :param global_step: value for the global_step column; defaults to a locale date/time string.
         :return: the appended row as dict {column name: value}.
         """
         if global_step is None:
             global_step = time.strftime("%x_%X")
         mem = psutil.virtual_memory()
         mem_used = (mem.total-mem.available)
         gpu_vals = self.gpu_logger.get_vals()
         rel_time = time.time()-self.sysmetrics_start_time
         # torch renamed memory_cached to memory_reserved; prefer the new name where it exists
         mem_reserved = getattr(torch.cuda, "memory_reserved", None)
         if mem_reserved is None:
             mem_reserved = torch.cuda.memory_cached
         # the per-device columns are created in __init__ as (allocd, cached) pairs device by device, so the values
         # have to be interleaved in the same order; otherwise they end up in the wrong columns with >1 GPU.
         per_device = []
         for d in range(torch.cuda.device_count()):
             per_device.append(torch.cuda.memory_allocated(d)/1024**3)
             per_device.append(mem_reserved(d)/1024**3)
         row_ix = len(self.sysmetrics)
         self.sysmetrics.loc[row_ix] = [global_step, rel_time,
                             psutil.cpu_percent(), mem_used/1024**3, mem_used/mem.total*100,
                             psutil.swap_memory().used/1024**3, int(gpu_vals['gpu_graphics_util']),
                             *per_device
                             ]
         return self.sysmetrics.loc[row_ix].to_dict()
 
     def sysmetrics2tboard(self, metrics=None, global_step=None, suptitle=None):
         tag = "per_time"
         if metrics is None:
             metrics = self.sysmetrics_update(global_step=global_step)
             tag = "per_epoch"
 
         if suptitle is not None:
             suptitle = str(suptitle)
         elif self.fold!="":
             suptitle = "Fold_"+str(self.fold)
         if suptitle is not None:
             self.tboard.add_scalars(suptitle+"/System_Metrics/"+tag, {k:v for (k,v) in metrics.items() if (k!="global_step"
                                                         and k!="rel_time")}, global_step)
 
     def sysmetrics_loop(self):
         """Endlessly sample system metrics and push them to tensorboard every self.sysmetrics_interval seconds.
         Intended as target of the daemon thread started in sysmetrics_start; never returns.
         """
         try:
             os.nice(-19)
         except (OSError, AttributeError):
             # OSError: not permitted to raise the priority; AttributeError: os.nice not available on this platform
             print("System-metrics logging has no superior process priority.")
         while True:
             metrics = self.sysmetrics_update()
             self.sysmetrics2tboard(metrics, global_step=metrics["rel_time"])
             #print("thread alive", self.thread.is_alive())
             time.sleep(self.sysmetrics_interval)
             
     def sysmetrics_start(self, interval):
         """Start background logging of system metrics.

         :param interval: pause in seconds between two recordings. If None, nothing is
             started and no attribute is set.
         """
         if interval is None:
             return
         self.sysmetrics_interval = interval
         self.gpu_logger = Nvidia_GPU_Logger()
         self.sysmetrics_start_time = time.time()
         # daemon thread: does not keep the interpreter alive on exit
         worker = threading.Thread(target=self.sysmetrics_loop, daemon=True)
         self.thread = worker
         worker.start()
 
     def sysmetrics_save(self, out_file):
         """Pickle the recorded system-metrics table (self.sysmetrics).

         :param out_file: destination handed to the table's to_pickle method.
         """
         table = self.sysmetrics
         table.to_pickle(out_file)
 
 
     def metrics2tboard(self, metrics, global_step=None, suptitle=None):
         """Write the most recent value of every monitored metric to tensorboard.

         Per split ('train', 'val') the last entry of each metric is sorted into one of four
         scalar groups by its tag (case-insensitive, first match wins): tags containing
         'bin_stats', then 'uncertainty', then 'loss', everything else counts as monitor
         metric. NaN values are left out. Finally metrics["lr"] is written as learning rate.

         :param metrics: {'train': dataframe, 'val':df}, df as produced in
             evaluator.py.evaluate_predictions, plus an 'lr' entry.
         :param global_step: step passed to add_scalars. Defaults to the length of the first
             train metric minus one.
         :param suptitle: group prefix. Defaults to "Fold_<self.fold>".
         """
         if global_step is None:
             train_tags = list(metrics['train'].keys())
             global_step = len(metrics['train'][train_tags[0]])-1
         suptitle = "Fold_"+str(self.fold) if suptitle is None else str(suptitle)

         for split in ['train', 'val']:
             bin_stats, uncertainties, losses, monitor = {}, {}, {}, {}
             for tag, history in metrics[split].items():
                 latest = history[-1] #maybe remove list wrapping, recording in evaluator?
                 lowered = tag.lower()
                 if np.isnan(latest):
                     continue
                 if 'bin_stats' in lowered:
                     # binary statistics are keyed by the last path component only
                     bin_stats["{}".format(tag.split("/")[-1])] = latest
                 elif 'uncertainty' in lowered:
                     uncertainties["{}".format(tag)] = latest
                 elif 'loss' in lowered:
                     losses["{}".format(tag)] = latest
                 else:
                     monitor["{}".format(tag)] = latest

             groups = (
                 ("/Binary_Statistics/{}", bin_stats),
                 ("/Uncertainties/{}", uncertainties),
                 ("/Losses/{}", losses),
                 ("/Monitor_Metrics/{}", monitor),
             )
             for pattern, series in groups:
                 self.tboard.add_scalars(suptitle+pattern.format(split), series, global_step)
         self.tboard.add_scalars(suptitle+"/Learning_Rate", metrics["lr"], global_step)
         return
       
     def batchImgs2tboard(self, batch, results_dict, cmap, boxtype2color, img_bg=False, global_step=None):
         """Currently disabled: always raises NotImplementedError.

         The code after the raise is unreachable and kept for reference. It would add two
         image grids to tensorboard: ground-truth vs. predicted segmentation, and boxes drawn
         onto a background image.

         :param batch: dict read at keys "seg" and "data".
         :param results_dict: dict read at keys 'seg_preds' and "boxes".
         :param cmap: passed to plg.to_rgb.
         :param boxtype2color: passed to plg.draw_boxes_into_batch.
         :param img_bg: if True, boxes are drawn onto the image data instead of the gt segmentation.
         :param global_step: step passed to add_image.
         :raises NotImplementedError: always.
         """
         raise NotImplementedError("not up-to-date, problem with importing plotting-file, torchvision dependency.")
         # NOTE: unreachable from here on (see raise above).
         if len(batch["seg"].shape)==5: #3D imgs
             # show one randomly chosen slice along the last axis
             slice_ix = np.random.randint(batch["seg"].shape[-1])
             seg_gt = plg.to_rgb(batch['seg'][:,0,:,:,slice_ix], cmap)
             seg_pred = plg.to_rgb(results_dict['seg_preds'][:,0,:,:,slice_ix], cmap)
             
             mod_img = plg.mod_to_rgb(batch["data"][:,0,:,:,slice_ix]) if img_bg else None
             
         elif len(batch["seg"].shape)==4:
             seg_gt = plg.to_rgb(batch['seg'][:,0,:,:], cmap)
             seg_pred = plg.to_rgb(results_dict['seg_preds'][:,0,:,:], cmap)
             mod_img = plg.mod_to_rgb(batch["data"][:,0]) if img_bg else None
         else:
             raise Exception("batch content has wrong format: {}".format(batch["seg"].shape))
         
         #from here on only works in 2D
         seg_gt = np.transpose(seg_gt, axes=(0,3,1,2)) #previous shp: b,x,y,c
         seg_pred = np.transpose(seg_pred, axes=(0,3,1,2))
         
         
         # stack gt and predictions along the first axis before arranging them in a grid
         seg = np.concatenate((seg_gt, seg_pred), axis=0)
         # todo replace torchvision (tv) dependency
         seg = tv.utils.make_grid(torch.from_numpy(seg), nrow=2)
         self.tboard.add_image("Batch seg, 1st col: gt, 2nd: pred.", seg, global_step=global_step)      
         
         # background for the box plot: image data if requested, else the gt segmentation
         if img_bg:
             bg_img  = np.transpose(mod_img, axes=(0,3,1,2))
         else:
             bg_img = seg_gt
         box_imgs = plg.draw_boxes_into_batch(bg_img, results_dict["boxes"], boxtype2color)
         box_imgs = tv.utils.make_grid(torch.from_numpy(box_imgs), nrow=4)
         self.tboard.add_image("Batch bboxes", box_imgs, global_step=global_step)
         
         return
 
     def __del__(self): # otherwise might produce multiple prints e.g. in ipython console
         """Close all handlers of self.pylogger and self.tboard, then detach and drop the pylogger."""
         py_log = self.pylogger
         for handler in py_log.handlers:
             handler.close()
         self.tboard.close()
         # empty the handler list so a logger object that lives on does not emit again
         py_log.handlers = []
         del self.pylogger
 
 def get_logger(exp_dir, server_env=False, sysmetrics_interval=2):
     """Create the combined logger of an experiment.

     Tensorboard output goes to <exp_dir>/logs/tboard, text output to <exp_dir>/logs/exec.log
     and additionally to a console handler.

     :param exp_dir: experiment directory.
     :param server_env: if True, a plain logging.StreamHandler is used for console output,
         otherwise a ColorHandler. Also passed on to CombinedLogger.
     :param sysmetrics_interval: passed on to CombinedLogger.
     :return: the configured CombinedLogger.
     """
     log_dir = os.path.join(exp_dir, "logs")
     tboard_dir = os.path.join(log_dir, "tboard")
     log_file = os.path.join(log_dir, 'exec.log')

     logger = CombinedLogger('medical_detection',  tboard_dir, server_env=server_env,
                             sysmetrics_interval=sysmetrics_interval)
     logger.setLevel(logging.DEBUG)

     logger.addHandler(logging.FileHandler(log_file))
     console_handler = logging.StreamHandler() if server_env else ColorHandler()
     logger.addHandler(console_handler)
     # keep records away from ancestor loggers
     logger.pylogger.propagate = False
     print('Logging to {}'.format(log_file))

     return logger
 
 def prep_exp(dataset_path, exp_path, server_env, use_stored_settings=True, is_training=True):
     """
     I/O handling, creation of the experiment folder structure. Also creates a snapshot of configs/model scripts and
     copies them to the exp_dir. This way the exp_dir contains all info needed to conduct an experiment, independent
     of changes in the actual source code. Thus, training/inference of this experiment can be started at any time.
     Provides robust structure for cloud deployment.
     :param dataset_path: path to source code for specific data set. (e.g. medicaldetectiontoolkit/lidc_exp)
     :param exp_path: path to experiment directory.
     :param server_env: boolean flag. pass to configs script for cloud deployment.
     :param use_stored_settings: boolean flag. When starting training: If True, starts training from snapshot in existing
         experiment directory, else creates experiment directory on the fly using configs/model scripts from source code.
     :param is_training: boolean flag. distinguishes train vs. inference mode.
     :return: configs object.
     :raises Exception: if training from stored settings is requested but model.py and/or backbone.py are missing
         in the experiment directory.
     """
     stored_configs = os.path.join(exp_path, 'configs.py')
     stored_model = os.path.join(exp_path, 'model.py')
     stored_backbone = os.path.join(exp_path, 'backbone.py')

     if is_training and not use_stored_settings:
         # this case overwrites settings files in exp dir, i.e., default_configs, configs, backbone, model
         os.makedirs(exp_path, exist_ok=True)
         # run training with source code info and copy snapshot of model to exp_dir for later testing
         # (overwrite scripts if exp_dir already exists.)
         # Commands are passed as argument lists (no shell), so paths containing spaces or shell
         # metacharacters are handled correctly. Return codes are ignored as before.
         source_configs = os.path.join(dataset_path, 'configs.py')
         subprocess.call(['cp', 'default_configs.py', os.path.join(exp_path, 'default_configs.py')])
         subprocess.call(['cp', source_configs, stored_configs])
         cf_file = import_module('cf_file', source_configs)
         cf = cf_file.Configs(server_env)
         subprocess.call(['cp', cf.model_path, stored_model])
         subprocess.call(['cp', cf.backbone_path, stored_backbone])
         fold_ids_file = os.path.join(exp_path, "fold_ids.pickle")
         if os.path.isfile(fold_ids_file):
             subprocess.call(['rm', fold_ids_file])
     else:
         # training from snapshot or testing: use configs, model and backbone stored in exp dir.
         cf_file = import_module('cf', stored_configs)
         cf = cf_file.Configs(server_env)
         # when training from a snapshot, previously saved model and backbone need to be found in exp dir.
         if is_training and (not os.path.isfile(stored_model) or not os.path.isfile(stored_backbone)):
             raise Exception("Selected use_stored_settings option but no model and/or backbone source files exist in exp dir.")
         cf.model_path = stored_model
         cf.backbone_path = stored_backbone

     cf.exp_dir = exp_path
     cf.test_dir = os.path.join(cf.exp_dir, 'test')
     cf.plot_dir = os.path.join(cf.exp_dir, 'plots')
     os.makedirs(cf.test_dir, exist_ok=True)
     os.makedirs(cf.plot_dir, exist_ok=True)
     # normalise first so that a trailing slash does not yield an empty experiment name
     cf.experiment_name = os.path.basename(os.path.normpath(exp_path))
     cf.dataset_name = dataset_path
     cf.server_env = server_env
     cf.created_fold_id_pickle = False

     return cf
 
 class ModelSelector:
     '''
     saves a checkpoint after each epoch as 'last_state' (can be loaded to continue interrupted training).
     saves the top-k (k=cf.save_n_models) ranked epochs. In inference, predictions of multiple epochs can be ensembled
     to improve performance.
     '''

     def __init__(self, cf, logger):
         """
         :param cf: configs object; read here and in run_model_selection (save_n_models,
             model_selection_criteria, min_save_thresh, server_env, fold_dir).
         :param logger: object providing an info(msg) method.
         """
         self.cf = cf
         self.saved_epochs = [-1] * cf.save_n_models
         self.logger = logger

     def _save(self, save_fn, *args):
         """Call save_fn(*args); in the server environment the call is routed through IO_safe."""
         if self.cf.server_env:
             IO_safe(save_fn, *args)
         else:
             save_fn(*args)

     def run_model_selection(self, net, optimizer, monitor_metrics, epoch):
         """rank epoch via weighted mean from self.cf.model_selection_criteria: {criterion : weight}.
         If the current epoch is among the top-k epochs, its parameters and the current top-k ranking are saved
         to cf.fold_dir and parameter files of epochs that dropped out of the top-k are removed. The full training
         state is always saved as 'last_state.pth'.
         :param net: model providing state_dict().
         :param optimizer: optimizer providing state_dict().
         :param monitor_metrics: dict; monitor_metrics['val'][criterion] is the per-epoch score list whose first
             entry is a dummy. None and NaN scores count as 0.
         :param epoch: current epoch (epochs start at 1).
         :raises ValueError: if cf.model_selection_criteria is empty.
         """
         crita = self.cf.model_selection_criteria #shorter alias
         if not crita:
             raise ValueError("cf.model_selection_criteria must not be empty.")
         fold_dir = self.cf.fold_dir
         top_k = self.cf.save_n_models

         non_nan_scores = {}
         for criterion in crita.keys():
             #exclude first entry bc its dummy None entry
             non_nan_scores[criterion] = [0 if (ii is None or np.isnan(ii)) else ii for ii in monitor_metrics['val'][criterion]][1:]
             n_epochs = len(non_nan_scores[criterion])
         epochs_scores = []
         for e_ix in range(n_epochs):
             epochs_scores.append(np.sum([weight * non_nan_scores[criterion][e_ix] for
                                          criterion,weight in crita.items()])/len(crita.keys()))

         # ranking of epochs according to model_selection_criterion
         epoch_ranking = np.argsort(epochs_scores)[::-1] + 1 #epochs start at 1

         # if set in configs, epochs < min_save_thresh are discarded from saving process.
         epoch_ranking = epoch_ranking[epoch_ranking >= self.cf.min_save_thresh]

         # check if current epoch is among the top-k epochs.
         if epoch in epoch_ranking[:top_k]:
             self._save(torch.save, net.state_dict(), os.path.join(fold_dir, '{}_best_params.pth'.format(epoch)))
             # save epoch_ranking to keep info for inference.
             self._save(np.save, os.path.join(fold_dir, 'epoch_ranking'), epoch_ranking[:top_k])
             self.logger.info(
                 "saving current epoch {} at rank {}".format(epoch, np.argwhere(epoch_ranking == epoch)))
             # delete params of the epoch that just fell out of the top-k epochs.
             for se in [int(ii.split('_')[0]) for ii in os.listdir(fold_dir) if 'best_params' in ii]:
                 if se in epoch_ranking[top_k:]:
                     # argument list instead of a shell string: works for fold dirs containing spaces etc.
                     subprocess.call(['rm', os.path.join(fold_dir, '{}_best_params.pth'.format(se))])
                     self.logger.info('deleting epoch {} at rank {}'.format(se, np.argwhere(epoch_ranking == se)))

         state = {
             'epoch': epoch,
             'state_dict': net.state_dict(),
             'optimizer': optimizer.state_dict(),
         }

         self._save(torch.save, state, os.path.join(fold_dir, 'last_state.pth'))
 
 
 def load_checkpoint(checkpoint_path, net, optimizer):
     """Restore model and optimizer from a checkpoint file.

     :param checkpoint_path: file holding a dict with keys 'state_dict', 'optimizer' and 'epoch'.
     :param net: model whose load_state_dict receives the 'state_dict' entry.
     :param optimizer: optimizer whose load_state_dict receives the 'optimizer' entry.
     :return: the checkpoint's 'epoch' entry.
     """
     state = torch.load(checkpoint_path)
     for target, key in ((net, 'state_dict'), (optimizer, 'optimizer')):
         target.load_state_dict(state[key])
     return state['epoch']
 
 
 def prepare_monitoring(cf):
     """
     creates dictionaries, where train/val metrics are stored.

     For every monitored class (and, if cf.eval_bins_separately is set, every bin) and every metric in
     cf.metrics an entry '<class>_<metric>' is created in both the 'train' and the 'val' dict.
     :param cf: configs object (report_score_level, class_dict, metrics, ...).
     :return: {'train': OrderedDict, 'val': OrderedDict}, each value being the list [np.nan].
     """
     with_bins = hasattr(cf, "eval_bins_separately") and cf.eval_bins_separately

     prefixes = []
     if 'rois' in cf.report_score_level:
         prefixes += list(cf.class_dict.values())
         if with_bins:
             prefixes += list(cf.bin_dict.values())
     if 'patient' in cf.report_score_level:
         prefixes.append('patient_'+cf.class_dict[cf.patient_class_of_interest])
         if with_bins:
             prefixes.append('patient_' + cf.bin_dict[cf.patient_bin_of_interest])

     tags = [cl + '_' + m for cl in prefixes for m in cf.metrics]

     # first entry of each list accounts for epoch starting at 1.
     metrics = {}
     metrics['train'] = OrderedDict((tag, [np.nan]) for tag in tags)
     metrics['val'] = OrderedDict((tag, [np.nan]) for tag in tags)

     return metrics
 
 
 class _AnsiColorizer(object):
     """
     A colorizer is an object that loosely wraps around a stream, allowing
     callers to write text to the stream in a particular color.

     Colorizer classes must implement C{supported()} and C{write(text, color)}.
     """
     # color label -> ANSI SGR foreground code
     _colors = dict(black=30, red=31, green=32, yellow=33,
                    blue=34, magenta=35, cyan=36, white=37, default=39)

     def __init__(self, stream):
         """
         @param stream: object with a C{write(str)} method that receives the colored text.
         """
         self.stream = stream

     @classmethod
     def supported(cls, stream=sys.stdout):
         """
         A class method that returns True if the current platform supports
         coloring terminal output using this method. Returns False otherwise,
         including when curses is unavailable or querying the terminal fails.
         """
         if not stream.isatty():
             return False  # auto color only on TTYs
         try:
             import curses
         except ImportError:
             return False
         try:
             try:
                 return curses.tigetnum("colors") > 2
             except curses.error:
                 # terminal not initialised yet: set it up and ask again
                 curses.setupterm()
                 return curses.tigetnum("colors") > 2
         except Exception:
             # guess false in case of error
             return False

     def write(self, text, color):
         """
         Write the given text to the stream in the given color.

         @param text: Text to be written to the stream.

         @param color: A string label for a color. e.g. 'red', 'white'.
             An unknown label raises KeyError.
         """
         color = self._colors[color]
         self.stream.write('\x1b[%sm%s\x1b[0m' % (color, text))
 
 class ColorHandler(logging.StreamHandler):
     """Stream handler that writes every record in a color chosen by the record's level."""

     def __init__(self, stream=sys.stdout):
         """
         :param stream: output stream; it is wrapped in an _AnsiColorizer.
         """
         super(ColorHandler, self).__init__(_AnsiColorizer(stream))

     def emit(self, record):
         """Write the record's message plus a newline to the stream.

         Color per level: DEBUG green, INFO default, WARNING and ERROR red, any other level blue.
         :param record: the logging.LogRecord to output.
         """
         msg_colors = {
             logging.DEBUG: "green",
             logging.INFO: "default",
             logging.WARNING: "red",
             logging.ERROR: "red"
         }
         color = msg_colors.get(record.levelno, "blue")
         # getMessage() merges record.args into the message and stringifies non-str messages;
         # using record.msg directly would print raw templates and fail on non-str objects.
         self.stream.write(record.getMessage() + "\n", color)