forked from CharlesShang/DCNv2
-
Notifications
You must be signed in to change notification settings - Fork 84
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #10 from lbin/pytorch_1.7
Pytorch 1.7
- Loading branch information
Showing
13 changed files
with
2,176 additions
and
356 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,74 +1,229 @@ | ||
#include <vector> | ||
#include "cpu/dcn_v2_im2col_cpu.h" | ||
#include <iostream> | ||
|
||
#include <ATen/ATen.h> | ||
#include <ATen/cuda/CUDAContext.h> | ||
//#include <ATen/cuda/CUDAContext.h> | ||
|
||
#include <TH/TH.h> | ||
//#include <THC/THCAtomics.cuh> | ||
//#include <THC/THCDeviceUtils.cuh> | ||
|
||
//extern THCState *state; | ||
|
||
// author: Charles Shang | ||
// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu | ||
|
||
// modified from the CUDA version for CPU use by Daniel K. Suhendro | ||
|
||
// edit by: James Bockman and Matthew Howe | ||
// modified to a torch (ATen) implementation, removing the use of torch's deprecated BLAS access | ||
|
||
/**
 * @brief CPU forward pass of the modulated deformable convolution (DCNv2).
 *
 * For every sample in the batch the input is unfolded into a column matrix by
 * modulated_deformable_im2col_cpu, multiplied by the flattened weight, and the
 * bias is added per output channel.
 *
 * @param input   4-D tensor read as (batch, channels, height, width).
 * @param weight  4-D tensor read as (channels_out, channels, kernel_h, kernel_w).
 * @param bias    Per-output-channel bias; broadcast over the spatial dimensions.
 * @param offset  Sampling offsets, indexed by batch along dim 0.
 *                NOTE(review): presumably (batch, 2 * deformable_group * kernel_h * kernel_w,
 *                height_out, width_out) - confirm against the im2col kernel.
 * @param mask    Modulation mask, indexed by batch along dim 0.
 *                NOTE(review): presumably (batch, deformable_group * kernel_h * kernel_w,
 *                height_out, width_out) - confirm against the im2col kernel.
 * @param kernel_h,kernel_w      Kernel size; must equal weight.size(2) / weight.size(3).
 * @param stride_h,stride_w      Convolution stride; must be positive.
 * @param pad_h,pad_w            Zero padding applied on each side.
 * @param dilation_h,dilation_w  Kernel dilation.
 * @param deformable_group       Forwarded unchanged to the im2col kernel.
 * @return Tensor of shape (batch, channels_out, height_out, width_out) created
 *         with the options (dtype/device) of @p input.
 * @throws c10::Error when the shapes are inconsistent with the arguments. The
 *         kernels are instantiated for float only, so data_ptr<float>() raises
 *         for tensors of any other dtype.
 */
at::Tensor
dcn_v2_cpu_forward(const at::Tensor &input,
                   const at::Tensor &weight,
                   const at::Tensor &bias,
                   const at::Tensor &offset,
                   const at::Tensor &mask,
                   const int kernel_h,
                   const int kernel_w,
                   const int stride_h,
                   const int stride_w,
                   const int pad_h,
                   const int pad_w,
                   const int dilation_h,
                   const int dilation_w,
                   const int deformable_group)
{
    using scalar_t = float;

    // The kernel below walks raw pointers, so every tensor handed to it must be
    // densely packed. contiguous() is a no-op for tensors that already are.
    const at::Tensor input_c = input.contiguous();
    const at::Tensor weight_c = weight.contiguous();
    const at::Tensor offset_c = offset.contiguous();
    const at::Tensor mask_c = mask.contiguous();

    const int batch = static_cast<int>(input_c.size(0));
    const int channels = static_cast<int>(input_c.size(1));
    const int height = static_cast<int>(input_c.size(2));
    const int width = static_cast<int>(input_c.size(3));

    const int channels_out = static_cast<int>(weight_c.size(0));
    const int channels_kernel = static_cast<int>(weight_c.size(1));
    const int kernel_h_ = static_cast<int>(weight_c.size(2));
    const int kernel_w_ = static_cast<int>(weight_c.size(3));

    // TORCH_CHECK concatenates its arguments (it does not interpret printf
    // placeholders), so the values are streamed into the message directly.
    TORCH_CHECK(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
                "Weight shape and kernel size arguments do not match: (",
                kernel_h_, " x ", kernel_w_, " vs ", kernel_h, " x ", kernel_w, ").");

    TORCH_CHECK(channels == channels_kernel,
                "Input channels and kernel channels do not match: (",
                channels, " vs ", channels_kernel, ").");

    TORCH_CHECK(stride_h > 0 && stride_w > 0,
                "Stride must be positive, got (", stride_h, ", ", stride_w, ").");

    const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
    const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;

    TORCH_CHECK(height_out > 0 && width_out > 0,
                "Computed output size is not positive: (", height_out, " x ", width_out, ").");

    // Scratch buffer reused for every sample; the im2col kernel writes into it.
    auto columns = at::empty({channels * kernel_h * kernel_w, height_out * width_out}, input_c.options());
    // Every batch slice is written below, so no zero-initialisation is needed.
    auto output = at::empty({batch, channels_out, height_out, width_out}, input_c.options());

    // Loop-invariant views: weight as a (channels_out x channels*kh*kw) matrix and
    // bias shaped so that it broadcasts over (channels_out, height_out, width_out).
    const at::Tensor weight_flat = weight_c.view({channels_out, channels * kernel_h * kernel_w});
    const at::Tensor bias_view = bias.reshape({-1, 1, 1});

    for (int b = 0; b < batch; ++b)
    {
        const at::Tensor input_n = input_c.select(0, b);
        const at::Tensor offset_n = offset_c.select(0, b);
        const at::Tensor mask_n = mask_c.select(0, b);

        // Unfold one sample (batch size 1) into the column buffer.
        modulated_deformable_im2col_cpu(input_n.data_ptr<scalar_t>(),
                                        offset_n.data_ptr<scalar_t>(),
                                        mask_n.data_ptr<scalar_t>(),
                                        1, channels, height, width,
                                        height_out, width_out, kernel_h, kernel_w,
                                        pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                                        deformable_group,
                                        columns.data_ptr<scalar_t>());

        // Y = W * C + bias
        const at::Tensor product =
            at::matmul(weight_flat, columns).view({channels_out, height_out, width_out});
        output.select(0, b).copy_(product + bias_view);
    }
    return output;
}
|
||
std::tuple<at::Tensor, at::Tensor> | ||
dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, | ||
const at::Tensor &input, | ||
const at::Tensor &bbox, | ||
const at::Tensor &trans, | ||
const at::Tensor &top_count, | ||
const int no_trans, | ||
const float spatial_scale, | ||
const int output_dim, | ||
const int group_size, | ||
const int pooled_size, | ||
const int part_size, | ||
const int sample_per_part, | ||
const float trans_std) | ||
/**
 * @brief CPU backward pass of the modulated deformable convolution (DCNv2).
 *
 * For every sample in the batch this computes the gradient with respect to the
 * sampling offsets and mask (modulated_deformable_col2im_coord_cpu), the input
 * (modulated_deformable_col2im_cpu), and accumulates the weight and bias
 * gradients across the batch.
 *
 * @param input        4-D tensor read as (batch, channels, height, width).
 * @param weight       4-D tensor read as (channels_out, channels, kernel_h, kernel_w).
 * @param bias         Bias tensor; only its shape/options are used (for grad_bias).
 * @param offset       Sampling offsets, indexed by batch along dim 0.
 *                     NOTE(review): exact channel layout is defined by the kernels - confirm.
 * @param mask         Modulation mask, indexed by batch along dim 0.
 * @param grad_output  Gradient of the loss w.r.t. the forward output; each sample
 *                     is viewed as (channels_out, height_out * width_out).
 * @param kernel_h,kernel_w      Kernel size; must equal weight.size(2) / weight.size(3).
 * @param stride_h,stride_w      Convolution stride; must be positive.
 * @param pad_h,pad_w            Zero padding applied on each side.
 * @param dilation_h,dilation_w  Kernel dilation.
 * @param deformable_group       Forwarded unchanged to the kernels.
 * @return {grad_input, grad_offset, grad_mask, grad_weight, grad_bias}, in that order.
 * @throws c10::Error when the shapes are inconsistent with the arguments. The
 *         kernels are instantiated for float only, so data_ptr<float>() raises
 *         for tensors of any other dtype.
 */
std::vector<at::Tensor> dcn_v2_cpu_backward(const at::Tensor &input,
                                            const at::Tensor &weight,
                                            const at::Tensor &bias,
                                            const at::Tensor &offset,
                                            const at::Tensor &mask,
                                            const at::Tensor &grad_output,
                                            int kernel_h, int kernel_w,
                                            int stride_h, int stride_w,
                                            int pad_h, int pad_w,
                                            int dilation_h, int dilation_w,
                                            int deformable_group)
{
    using scalar_t = float;

    // The kernels walk raw pointers and the gradients are reshaped with view(),
    // so everything must be densely packed. Incoming gradients from autograd are
    // frequently non-contiguous; contiguous() is a no-op when they already are.
    const at::Tensor input_c = input.contiguous();
    const at::Tensor weight_c = weight.contiguous();
    const at::Tensor offset_c = offset.contiguous();
    const at::Tensor mask_c = mask.contiguous();
    const at::Tensor grad_output_c = grad_output.contiguous();

    const int batch = static_cast<int>(input_c.size(0));
    const int channels = static_cast<int>(input_c.size(1));
    const int height = static_cast<int>(input_c.size(2));
    const int width = static_cast<int>(input_c.size(3));

    const int channels_out = static_cast<int>(weight_c.size(0));
    const int channels_kernel = static_cast<int>(weight_c.size(1));
    const int kernel_h_ = static_cast<int>(weight_c.size(2));
    const int kernel_w_ = static_cast<int>(weight_c.size(3));

    // TORCH_CHECK concatenates its arguments (it does not interpret printf
    // placeholders), so the values are streamed into the message directly.
    TORCH_CHECK(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
                "Weight shape and kernel size arguments do not match: (",
                kernel_h_, " x ", kernel_w_, " vs ", kernel_h, " x ", kernel_w, ").");

    TORCH_CHECK(channels == channels_kernel,
                "Input channels and kernel channels do not match: (",
                channels, " vs ", channels_kernel, ").");

    TORCH_CHECK(stride_h > 0 && stride_w > 0,
                "Stride must be positive, got (", stride_h, ", ", stride_w, ").");

    const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
    const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;

    TORCH_CHECK(height_out > 0 && width_out > 0,
                "Computed output size is not positive: (", height_out, " x ", width_out, ").");

    // Gradients written through raw pointers must be dense regardless of the
    // memory layout of the tensors they mirror.
    auto grad_input = at::zeros_like(input_c, at::MemoryFormat::Contiguous);
    auto grad_offset = at::zeros_like(offset_c, at::MemoryFormat::Contiguous);
    auto grad_mask = at::zeros_like(mask_c, at::MemoryFormat::Contiguous);
    auto grad_weight = at::zeros_like(weight_c, at::MemoryFormat::Contiguous);
    auto grad_bias = at::zeros_like(bias);

    // Loop-invariant: W^T as a (channels*kh*kw x channels_out) matrix.
    const at::Tensor weight_flat_t =
        at::transpose(weight_c.view({channels_out, channels * kernel_h * kernel_w}), 1, 0);

    for (int b = 0; b < batch; ++b)
    {
        const at::Tensor input_n = input_c.select(0, b);
        const at::Tensor offset_n = offset_c.select(0, b);
        const at::Tensor mask_n = mask_c.select(0, b);
        at::Tensor grad_input_n = grad_input.select(0, b);
        at::Tensor grad_offset_n = grad_offset.select(0, b);
        at::Tensor grad_mask_n = grad_mask.select(0, b);

        const at::Tensor grad_output_n_flat =
            grad_output_c.select(0, b).view({channels_out, height_out * width_out});

        // Column gradient: W^T * dY. The result is a fresh dense matrix that is
        // later reused as the im2col output buffer for the weight gradient.
        at::Tensor columns = at::matmul(weight_flat_t, grad_output_n_flat);

        // gradient w.r.t. input coordinate data (offset and mask)
        modulated_deformable_col2im_coord_cpu(columns.data_ptr<scalar_t>(),
                                              input_n.data_ptr<scalar_t>(),
                                              offset_n.data_ptr<scalar_t>(),
                                              mask_n.data_ptr<scalar_t>(),
                                              1, channels, height, width,
                                              height_out, width_out, kernel_h, kernel_w,
                                              pad_h, pad_w, stride_h, stride_w,
                                              dilation_h, dilation_w, deformable_group,
                                              grad_offset_n.data_ptr<scalar_t>(),
                                              grad_mask_n.data_ptr<scalar_t>());

        // gradient w.r.t. input data
        modulated_deformable_col2im_cpu(columns.data_ptr<scalar_t>(),
                                        offset_n.data_ptr<scalar_t>(),
                                        mask_n.data_ptr<scalar_t>(),
                                        1, channels, height, width,
                                        height_out, width_out, kernel_h, kernel_w,
                                        pad_h, pad_w, stride_h, stride_w,
                                        dilation_h, dilation_w, deformable_group,
                                        grad_input_n.data_ptr<scalar_t>());

        // gradient w.r.t. weight: overwrite `columns` with the unfolded input,
        // then accumulate dY * C^T across the batch.
        modulated_deformable_im2col_cpu(input_n.data_ptr<scalar_t>(),
                                        offset_n.data_ptr<scalar_t>(),
                                        mask_n.data_ptr<scalar_t>(),
                                        1, channels, height, width,
                                        height_out, width_out, kernel_h, kernel_w,
                                        pad_h, pad_w, stride_h, stride_w,
                                        dilation_h, dilation_w, deformable_group,
                                        columns.data_ptr<scalar_t>());

        const at::Tensor weight_update =
            at::matmul(grad_output_n_flat, at::transpose(columns, 1, 0));
        grad_weight.add_(weight_update.view({channels_out, channels, kernel_h, kernel_w}));

        // gradient w.r.t. bias: sum of dY over all spatial positions, per channel.
        grad_bias = at::add(grad_bias, grad_output_n_flat.sum(1));
    }

    return {
        grad_input, grad_offset, grad_mask, grad_weight, grad_bias
    };
}
Oops, something went wrong.