Merge pull request #10 from lbin/pytorch_1.7

Pytorch 1.7
lbin · Nov 23, 2020 · 711fe28 · 711fe28
2 parents bc8d039 + 41c9ebb
commit 711fe28
Show file tree

Hide file tree

Showing 13 changed files with 2,176 additions and 356 deletions.
diff --git a/dcn_v2.py b/dcn_v2.py
diff --git a/setup.py b/setup.py
@@ -1,19 +1,15 @@
 #!/usr/bin/env python
 
-import os
 import glob
+import os
 
 import torch
-
-from torch.utils.cpp_extension import CUDA_HOME
-from torch.utils.cpp_extension import CppExtension
-from torch.utils.cpp_extension import CUDAExtension
-
-from setuptools import find_packages
-from setuptools import setup
+from setuptools import find_packages, setup
+from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
 
 requirements = ["torch", "torchvision"]
 
+
 def get_extensions():
     this_dir = os.path.dirname(os.path.abspath(__file__))
     extensions_dir = os.path.join(this_dir, "src")
@@ -22,6 +18,7 @@ def get_extensions():
     source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
     source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
 
+    os.environ["CC"] = "g++"
     sources = main_file + source_cpu
     extension = CppExtension
     extra_compile_args = {"cxx": []}
@@ -38,7 +35,8 @@ def get_extensions():
             "-D__CUDA_NO_HALF2_OPERATORS__",
         ]
     else:
-        raise NotImplementedError('Cuda is not availabel')
+        # raise NotImplementedError('Cuda is not available')
+        pass
 
     sources = [os.path.join(extensions_dir, s) for s in sources]
     include_dirs = [extensions_dir]
@@ -53,14 +51,20 @@ def get_extensions():
     ]
     return ext_modules
 
+
 setup(
     name="DCNv2",
     version="0.1",
     author="charlesshang",
     url="https://github.com/charlesshang/DCNv2",
     description="deformable convolutional networks",
-    packages=find_packages(exclude=("configs", "tests",)),
+    packages=find_packages(
+        exclude=(
+            "configs",
+            "tests",
+        )
+    ),
     # install_requires=requirements,
     ext_modules=get_extensions(),
     cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
-)
+)
diff --git a/src/cpu/dcn_v2_cpu.cpp b/src/cpu/dcn_v2_cpu.cpp
@@ -1,74 +1,229 @@
 #include <vector>
+#include "cpu/dcn_v2_im2col_cpu.h"
+#include <iostream>
 
 #include <ATen/ATen.h>
-#include <ATen/cuda/CUDAContext.h>
+//#include <ATen/cuda/CUDAContext.h>
 
+#include <TH/TH.h>
+//#include <THC/THCAtomics.cuh>
+//#include <THC/THCDeviceUtils.cuh>
+
+//extern THCState *state;
+
+// author: Charles Shang
+// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
+
+// modified from the CUDA version for CPU use by Daniel K. Suhendro
+
+// edit by: James Bockman and Matthew Howe
+// modified for torch implementation to remove use of deprecated torch access to Blas
 
 at::Tensor
 dcn_v2_cpu_forward(const at::Tensor &input,
-                   const at::Tensor &weight,
-                   const at::Tensor &bias,
-                   const at::Tensor &offset,
-                   const at::Tensor &mask,
-                   const int kernel_h,
-                   const int kernel_w,
-                   const int stride_h,
-                   const int stride_w,
-                   const int pad_h,
-                   const int pad_w,
-                   const int dilation_h,
-                   const int dilation_w,
-                   const int deformable_group)
-{
-    AT_ERROR("Not implement on cpu");
-}
-
-std::vector<at::Tensor>
-dcn_v2_cpu_backward(const at::Tensor &input,
                     const at::Tensor &weight,
                     const at::Tensor &bias,
                     const at::Tensor &offset,
                     const at::Tensor &mask,
-                    const at::Tensor &grad_output,
-                    int kernel_h, int kernel_w,
-                    int stride_h, int stride_w,
-                    int pad_h, int pad_w,
-                    int dilation_h, int dilation_w,
-                    int deformable_group)
+                    const int kernel_h,
+                    const int kernel_w,
+                    const int stride_h,
+                    const int stride_w,
+                    const int pad_h,
+                    const int pad_w,
+                    const int dilation_h,
+                    const int dilation_w,
+                    const int deformable_group)
 {
-    AT_ERROR("Not implement on cpu");
-}
+    // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask));
+    /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
+    AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
+    AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor");
+    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
+    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/
 
-std::tuple<at::Tensor, at::Tensor>
-dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input,
-                                 const at::Tensor &bbox,
-                                 const at::Tensor &trans,
-                                 const int no_trans,
-                                 const float spatial_scale,
-                                 const int output_dim,
-                                 const int group_size,
-                                 const int pooled_size,
-                                 const int part_size,
-                                 const int sample_per_part,
-                                 const float trans_std)
-{
-    AT_ERROR("Not implement on cpu");
+    const int batch = input.size(0);
+    const int channels = input.size(1);
+    const int height = input.size(2);
+    const int width = input.size(3);
+
+    const int channels_out = weight.size(0);
+    const int channels_kernel = weight.size(1);
+    const int kernel_h_ = weight.size(2);
+    const int kernel_w_ = weight.size(3);
+
+    // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h);
+    // printf("Channels: %d %d\n", channels, channels_kernel);
+    // printf("Channels: %d %d\n", channels_out, channels_kernel);
+
+    AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
+               "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_);
+
+    AT_ASSERTM(channels == channels_kernel,
+               "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel);
+
+    const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+    const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+    // auto ones = at::ones({height_out, width_out}, input.options());
+    auto ones = at::ones({bias.sizes()[0], height_out, width_out}, input.options());
+    auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options());
+    auto output = at::zeros({batch, channels_out, height_out, width_out}, input.options());
+
+    using scalar_t = float;
+    for (int b = 0; b < batch; b++)
+    {
+        auto input_n = input.select(0, b);
+        auto offset_n = offset.select(0, b);
+        auto mask_n = mask.select(0, b);
+        auto output_n = output.select(0, b);
+        // std::cout << "output_n: " << output_n << "output.select(0,b): " << output.select(0,b) << "\n"; 
+
+        // Do Bias first:
+        // M,N,K are dims of matrix A and B
+        // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+        // (N x 1) (1 x M)
+
+        // torch implementation
+        auto ones_T = at::transpose(ones.contiguous(), 2, 0);
+        ones_T = at::mul(ones_T, bias.contiguous());
+        ones_T = at::transpose(ones_T, 2, 0);
+        output_n = at::add(output_n, ones_T);
+
+        modulated_deformable_im2col_cpu(input_n.data_ptr<scalar_t>(),
+                                         offset_n.data_ptr<scalar_t>(),
+                                         mask_n.data_ptr<scalar_t>(),
+                                         1, channels, height, width,
+                                         height_out, width_out, kernel_h, kernel_w,
+                                         pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+                                         deformable_group,
+                                         columns.data_ptr<scalar_t>());
+
+        //(k * m)  x  (m * n)
+        // Y = WC
+
+        // torch implementation
+        auto weight_flat = weight.view({channels_out, channels * kernel_h * kernel_w});
+        auto product = at::matmul(weight_flat, columns);
+        output.select(0, b) = at::add(output_n, product.view({channels_out, height_out, width_out}));
+    }
+    return output;
 }
 
-std::tuple<at::Tensor, at::Tensor>
-dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad,
-                                  const at::Tensor &input,
-                                  const at::Tensor &bbox,
-                                  const at::Tensor &trans,
-                                  const at::Tensor &top_count,
-                                  const int no_trans,
-                                  const float spatial_scale,
-                                  const int output_dim,
-                                  const int group_size,
-                                  const int pooled_size,
-                                  const int part_size,
-                                  const int sample_per_part,
-                                  const float trans_std)
+std::vector<at::Tensor> dcn_v2_cpu_backward(const at::Tensor &input,
+                                             const at::Tensor &weight,
+                                             const at::Tensor &bias,
+                                             const at::Tensor &offset,
+                                             const at::Tensor &mask,
+                                             const at::Tensor &grad_output,
+                                             int kernel_h, int kernel_w,
+                                             int stride_h, int stride_w,
+                                             int pad_h, int pad_w,
+                                             int dilation_h, int dilation_w,
+                                             int deformable_group)
 {
-    AT_ERROR("Not implement on cpu");
-}
+
+    THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous");
+    THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous");
+
+    /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
+    AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
+    AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor");
+    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
+    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/
+
+    const int batch = input.size(0);
+    const int channels = input.size(1);
+    const int height = input.size(2);
+    const int width = input.size(3);
+
+    const int channels_out = weight.size(0);
+    const int channels_kernel = weight.size(1);
+    const int kernel_h_ = weight.size(2);
+    const int kernel_w_ = weight.size(3);
+
+    AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
+               "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_);
+
+    AT_ASSERTM(channels == channels_kernel,
+               "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel);
+
+    const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+    const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+    auto ones = at::ones({height_out, width_out}, input.options());
+    auto columns = at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options());
+    auto output = at::empty({batch, channels_out, height_out, width_out}, input.options());
+
+    auto grad_input = at::zeros_like(input);
+    auto grad_weight = at::zeros_like(weight);
+    auto grad_bias = at::zeros_like(bias);
+    auto grad_offset = at::zeros_like(offset);
+    auto grad_mask = at::zeros_like(mask);
+
+    using scalar_t = float;
+
+    for (int b = 0; b < batch; b++)
+    {
+        auto input_n = input.select(0, b);
+        auto offset_n = offset.select(0, b);
+        auto mask_n = mask.select(0, b);
+        auto grad_output_n = grad_output.select(0, b);
+        auto grad_input_n = grad_input.select(0, b);
+        auto grad_offset_n = grad_offset.select(0, b);
+        auto grad_mask_n = grad_mask.select(0, b);
+
+
+
+        // Torch implementation
+        auto weight_flat = weight.view({channels_out, channels*kernel_h*kernel_w});
+        weight_flat = at::transpose(weight_flat, 1, 0);
+        auto grad_output_n_flat = grad_output_n.view({channels_out, height_out*width_out});
+        columns = at::matmul(weight_flat, grad_output_n_flat);
+
+        // gradient w.r.t. input coordinate data
+        modulated_deformable_col2im_coord_cpu(columns.data_ptr<scalar_t>(),
+                                               input_n.data_ptr<scalar_t>(),
+                                               offset_n.data_ptr<scalar_t>(),
+                                               mask_n.data_ptr<scalar_t>(),
+                                               1, channels, height, width,
+                                               height_out, width_out, kernel_h, kernel_w,
+                                               pad_h, pad_w, stride_h, stride_w,
+                                               dilation_h, dilation_w, deformable_group,
+                                               grad_offset_n.data_ptr<scalar_t>(),
+                                               grad_mask_n.data_ptr<scalar_t>());
+        // gradient w.r.t. input data
+        modulated_deformable_col2im_cpu(columns.data_ptr<scalar_t>(),
+                                         offset_n.data_ptr<scalar_t>(),
+                                         mask_n.data_ptr<scalar_t>(),
+                                         1, channels, height, width,
+                                         height_out, width_out, kernel_h, kernel_w,
+                                         pad_h, pad_w, stride_h, stride_w,
+                                         dilation_h, dilation_w, deformable_group,
+                                         grad_input_n.data_ptr<scalar_t>());
+
+        // gradient w.r.t. weight, dWeight should accumulate across the batch and group
+        modulated_deformable_im2col_cpu(input_n.data_ptr<scalar_t>(),
+                                         offset_n.data_ptr<scalar_t>(),
+                                         mask_n.data_ptr<scalar_t>(),
+                                         1, channels, height, width,
+                                         height_out, width_out, kernel_h, kernel_w,
+                                         pad_h, pad_w, stride_h, stride_w,
+                                         dilation_h, dilation_w, deformable_group,
+                                         columns.data_ptr<scalar_t>());
+
+        // Torch implementation
+        auto product = at::matmul(grad_output_n_flat, at::transpose(columns, 1, 0));
+        grad_weight = at::add(grad_weight, product.view({channels_out, channels, kernel_h, kernel_w}));
+
+
+        // Torch implementation
+        auto ones_flat = ones.view({height_out*width_out});
+        product = at::matmul(grad_output_n_flat, ones_flat);
+        grad_bias = at::add(grad_bias, product);
+    }
+
+    return {
+        grad_input, grad_offset, grad_mask, grad_weight, grad_bias
+    };
+}