Added support for palettized weights for convolution on CUDA.
liuliu committed Sep 7, 2023
1 parent a1f16d6 commit 694ce11
Showing 5 changed files with 160 additions and 21 deletions.
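
Background for the changes below: a palettized (CCV_QX) tensor stores, for every block of number_in_blocks elements, a small palette of 2^qbits floating-point values plus a qbits-wide index per element, so cuDNN cannot consume it directly; the convolution kernels therefore expand ("depalettize") the weights into a scratch buffer before calling cuDNN. A minimal CPU-side sketch of that expansion, assuming one palette per block and little-endian bit packing — the actual layout and the CUDA kernel (ccv_nnc_compat_depalettize) live elsewhere in the library and are not shown in this diff:

    /* Sketch only: layout assumptions as stated above; not the library's implementation. */
    #include <stddef.h>
    #include <stdint.h>

    static void depalettize_sketch(const float* palettes, const uint8_t* indices,
        int qbits, size_t number_in_blocks, float* out, size_t count)
    {
        const uint32_t mask = (1u << qbits) - 1;
        size_t i;
        for (i = 0; i < count; i++)
        {
            const size_t block = i / number_in_blocks;  /* which palette this element uses */
            const size_t bit = i * qbits;               /* bit offset of element i's packed index */
            uint32_t packed = indices[bit / 8];
            if ((bit % 8) + qbits > 8)                  /* index straddles a byte boundary */
                packed |= (uint32_t)indices[bit / 8 + 1] << 8;
            const uint32_t idx = (packed >> (bit % 8)) & mask;
            out[i] = palettes[block * ((size_t)1 << qbits) + idx];  /* palette lookup */
        }
    }
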
8 changes: 7 additions & 1 deletion lib/nnc/ccv_nnc_easy.h
@@ -217,7 +217,7 @@ static inline ccv_nnc_tensor_param_t ccv_nnc_tensor_palettize(const ccv_nnc_tens
return new_params;
}

static inline size_t ccv_nnc_tensor_data_size(const ccv_nnc_tensor_param_t params)
static inline size_t ccv_nnc_tensor_data_size_without_padding(const ccv_nnc_tensor_param_t params)
{
const ssize_t count = (ssize_t)ccv_nnc_tensor_count(params);
ssize_t data_size;
@@ -232,6 +232,12 @@ static inline size_t ccv_nnc_tensor_data_size(const ccv_nnc_tensor_param_t param
data_size = (ssize_t)(1 << qbits) * CCV_GET_DATA_TYPE_SIZE(palette_datatype) * num_blocks + (count + 7) * qbits / 8;
} else
data_size = CCV_GET_DATA_TYPE_SIZE(params.datatype) * count;
return data_size;
}

static inline size_t ccv_nnc_tensor_data_size(const ccv_nnc_tensor_param_t params)
{
ssize_t data_size = ccv_nnc_tensor_data_size_without_padding(params);
#ifdef HAVE_CUDA // For CUDA, we align to 128-bytes.
if (CCV_TENSOR_GET_MEMORY(params.type) == CCV_TENSOR_GPU_MEMORY)
return ((data_size + 127) & -128);
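
The header change splits the size computation in two: ccv_nnc_tensor_data_size_without_padding returns the raw byte count (for CCV_QX, the palette tables plus the packed indices; otherwise element size times count), while ccv_nnc_tensor_data_size keeps the old behavior of rounding GPU allocations up to a 128-byte boundary. The unpadded variant is what the copy and depalettize paths in the other files use. The rounding expression, as a standalone illustration (not library code):

    #include <assert.h>
    #include <stddef.h>

    /* (x + 127) & -128 rounds x up to the next multiple of 128. */
    static size_t align_128(size_t x) { return (x + 127) & (size_t)-128; }

    int main(void)
    {
        assert(align_128(1) == 128);
        assert(align_128(128) == 128);
        assert(align_128(129) == 256);
        return 0;
    }
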
88 changes: 74 additions & 14 deletions lib/nnc/cmd/convolution/gpu/ccv_nnc_conv_gpu_cudnn.cu
@@ -75,11 +75,30 @@ static int _ccv_nnc_conv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint
size_t workspace_size = 0;
CUDNN_ENFORCE(cudnnGetConvolutionForwardWorkspaceSize(cudnn, a.descriptor, w.descriptor, conv.descriptor, b.descriptor, algo, &workspace_size));
void* workspace = 0;
// TODO: If error, return OOM
if (workspace_size)
workspace = ccv_nnc_stream_context_get_workspace(stream_context, workspace_size, CCV_TENSOR_GPU_MEMORY);
void* weight_data = w.data.u8;
if (CCV_GET_DATA_TYPE(inputs[1]->info.datatype) == CCV_QX)
{
ccv_nnc_tensor_param_t weight_params = inputs[1]->info;
const size_t count = ccv_nnc_tensor_count(weight_params);
const int palette_datatype = (weight_params.datatype & 0xff) << 12;
const int qbits = (weight_params.datatype & 0xf00) >> 8;
const int number_in_blocks = weight_params.reserved;
ccv_nnc_tensor_param_t depalettize_weight_params = weight_params;
depalettize_weight_params.datatype = palette_datatype;
depalettize_weight_params.reserved = 0;
const size_t data_size = ccv_nnc_tensor_data_size(depalettize_weight_params);
workspace = ccv_nnc_stream_context_get_workspace(stream_context, workspace_size + data_size, CCV_TENSOR_GPU_MEMORY);
weight_data = (uint8_t*)workspace + workspace_size;
ccv_nnc_compat_depalettize(w.data.u8, palette_datatype, ccv_nnc_tensor_data_size_without_padding(weight_params), qbits, number_in_blocks, weight_data, count, stream_context);
if (workspace_size == 0)
workspace = 0;
} else {
// TODO: If error, return OOM
if (workspace_size)
workspace = ccv_nnc_stream_context_get_workspace(stream_context, workspace_size, CCV_TENSOR_GPU_MEMORY);
}
static const float one = 1, zero = 0;
CUDNN_ENFORCE(cudnnConvolutionForward(cudnn, &one, a.descriptor, a.data.u8, w.descriptor, w.data.u8, conv.descriptor, algo, workspace, workspace_size, &zero, b.descriptor, b.data.u8));
CUDNN_ENFORCE(cudnnConvolutionForward(cudnn, &one, a.descriptor, a.data.u8, w.descriptor, weight_data, conv.descriptor, algo, workspace, workspace_size, &zero, b.descriptor, b.data.u8));
if (input_size > 2 && inputs[2])
{
const ccv_nnc_cudnn_tensor_view_descriptor_t bias = ccv_nnc_cudnn_get_tensor_view_descriptor(stream_context, (const ccv_nnc_tensor_view_t*)inputs[2]);
@@ -108,7 +127,18 @@ static int _ccv_nnc_conv_forw_autotune(const ccv_nnc_cmd_t cmd, const size_t max
cudnnSetConvolutionGroupCount(conv.descriptor, cmd.info.convolution.groups);
int count = 0;
cudnnConvolutionFwdAlgoPerf_t perfs[CCV_NNC_CMD_CUDNN_CONV_FWD_ALGO_COUNT];
CUDNN_ENFORCE(cudnnFindConvolutionForwardAlgorithmEx(cudnn, a.descriptor, a.data.u8, w.descriptor, w.data.u8, conv.descriptor, b.descriptor, b.data.u8, CCV_NNC_CMD_CUDNN_CONV_FWD_ALGO_COUNT, &count, perfs, workmem, max_workspace_size));
void* weight_data = w.data.u8;
if (CCV_GET_DATA_TYPE(inputs[1]->info.datatype) == CCV_QX)
{
ccv_nnc_tensor_param_t weight_params = inputs[1]->info;
const int palette_datatype = (weight_params.datatype & 0xff) << 12;
ccv_nnc_tensor_param_t depalettize_weight_params = weight_params;
depalettize_weight_params.datatype = palette_datatype;
depalettize_weight_params.reserved = 0;
const size_t data_size = ccv_nnc_tensor_data_size(depalettize_weight_params);
weight_data = ccv_nnc_stream_context_get_workspace(stream_context, data_size, CCV_TENSOR_GPU_MEMORY);
}
CUDNN_ENFORCE(cudnnFindConvolutionForwardAlgorithmEx(cudnn, a.descriptor, a.data.u8, w.descriptor, weight_data, conv.descriptor, b.descriptor, b.data.u8, CCV_NNC_CMD_CUDNN_CONV_FWD_ALGO_COUNT, &count, perfs, workmem, max_workspace_size));
int i;
cudnnConvolutionFwdAlgo_t algorithm;
for(i = 0; i < count; i++)
@@ -289,10 +319,29 @@ static int _ccv_nnc_conv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint
size_t workspace_size = 0;
CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn, w.descriptor, g.descriptor, conv.descriptor, h.descriptor, data_algo, &workspace_size));
void* workspace = 0;
// TODO: If error, return OOM
if (workspace_size)
workspace = ccv_nnc_stream_context_get_workspace(stream_context, workspace_size, CCV_TENSOR_GPU_MEMORY);
CUDNN_ENFORCE(cudnnConvolutionBackwardData(cudnn, &one, w.descriptor, w.data.u8, g.descriptor, g.data.u8, conv.descriptor, data_algo, workspace, workspace_size, &zero, h.descriptor, h.data.u8));
void* weight_data = w.data.u8;
if (CCV_GET_DATA_TYPE(inputs[2]->info.datatype) == CCV_QX)
{
ccv_nnc_tensor_param_t weight_params = inputs[2]->info;
const size_t count = ccv_nnc_tensor_count(weight_params);
const int palette_datatype = (weight_params.datatype & 0xff) << 12;
const int qbits = (weight_params.datatype & 0xf00) >> 8;
const int number_in_blocks = weight_params.reserved;
ccv_nnc_tensor_param_t depalettize_weight_params = weight_params;
depalettize_weight_params.datatype = palette_datatype;
depalettize_weight_params.reserved = 0;
const size_t data_size = ccv_nnc_tensor_data_size(depalettize_weight_params);
workspace = ccv_nnc_stream_context_get_workspace(stream_context, workspace_size + data_size, CCV_TENSOR_GPU_MEMORY);
weight_data = (uint8_t*)workspace + workspace_size;
ccv_nnc_compat_depalettize(w.data.u8, palette_datatype, ccv_nnc_tensor_data_size_without_padding(weight_params), qbits, number_in_blocks, weight_data, count, stream_context);
if (workspace_size == 0)
workspace = 0;
} else {
// TODO: If error, return OOM
if (workspace_size)
workspace = ccv_nnc_stream_context_get_workspace(stream_context, workspace_size, CCV_TENSOR_GPU_MEMORY);
}
CUDNN_ENFORCE(cudnnConvolutionBackwardData(cudnn, &one, w.descriptor, weight_data, g.descriptor, g.data.u8, conv.descriptor, data_algo, workspace, workspace_size, &zero, h.descriptor, h.data.u8));
ccv_nnc_cudnn_deinit_filter_descriptor(w);
ccv_nnc_cudnn_deinit_tensor_view_descriptor(h);
}
@@ -311,7 +360,6 @@ static int _ccv_nnc_conv_back_autotune(const ccv_nnc_cmd_t cmd, const size_t max
if (max_workspace_size && !workmem)
return -1;
const ccv_nnc_cudnn_tensor_view_descriptor_t g = ccv_nnc_cudnn_get_tensor_view_descriptor(stream_context, (const ccv_nnc_tensor_view_t*)inputs[0]);
const ccv_nnc_cudnn_tensor_view_descriptor_t a = ccv_nnc_cudnn_get_tensor_view_descriptor(stream_context, (const ccv_nnc_tensor_view_t*)inputs[1]);
int i;
int count = 0;
const int is_w_nhwc = (output_size > 1 && outputs[1]) ? outputs[1]->info.format == CCV_TENSOR_FORMAT_NHWC : inputs[2]->info.format == CCV_TENSOR_FORMAT_NHWC;
@@ -321,6 +369,7 @@ static int _ccv_nnc_conv_back_autotune(const ccv_nnc_cmd_t cmd, const size_t max
cudnnConvolutionBwdFilterAlgo_t filter_algorithm = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
if (output_size > 1 && outputs[1])
{
const ccv_nnc_cudnn_tensor_view_descriptor_t a = ccv_nnc_cudnn_get_tensor_view_descriptor(stream_context, (const ccv_nnc_tensor_view_t*)inputs[1]);
const ccv_nnc_cudnn_filter_descriptor_t dw = ccv_nnc_cudnn_get_filter_descriptor(stream_context, (const ccv_nnc_tensor_t*)outputs[1]);
cudnnConvolutionBwdFilterAlgoPerf_t filter_perfs[CCV_NNC_CMD_CUDNN_CONV_BWD_FILTER_ALGO_COUNT];
CUDNN_ENFORCE(cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnn, a.descriptor, a.data.u8, g.descriptor, g.data.u8, conv.descriptor, dw.descriptor, dw.data.u8, CCV_NNC_CMD_CUDNN_CONV_BWD_FILTER_ALGO_COUNT, &count, filter_perfs, workmem, max_workspace_size));
@@ -330,6 +379,7 @@ static int _ccv_nnc_conv_back_autotune(const ccv_nnc_cmd_t cmd, const size_t max
filter_algorithm = filter_perfs[i].algo;
break;
}
ccv_nnc_cudnn_deinit_tensor_view_descriptor(a);
ccv_nnc_cudnn_deinit_filter_descriptor(dw);
}
cudnnConvolutionBwdDataAlgo_t data_algorithm = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
@@ -338,7 +388,18 @@ static int _ccv_nnc_conv_back_autotune(const ccv_nnc_cmd_t cmd, const size_t max
const ccv_nnc_cudnn_filter_descriptor_t w = ccv_nnc_cudnn_get_filter_descriptor(stream_context, (const ccv_nnc_tensor_t*)inputs[2]);
const ccv_nnc_cudnn_tensor_view_descriptor_t h = ccv_nnc_cudnn_get_tensor_view_descriptor(stream_context, (const ccv_nnc_tensor_view_t*)outputs[0]);
cudnnConvolutionBwdDataAlgoPerf_t data_perfs[CCV_NNC_CMD_CUDNN_CONV_BWD_DATA_ALGO_COUNT];
CUDNN_ENFORCE(cudnnFindConvolutionBackwardDataAlgorithmEx(cudnn, w.descriptor, w.data.u8, g.descriptor, g.data.u8, conv.descriptor, h.descriptor, h.data.u8, CCV_NNC_CMD_CUDNN_CONV_BWD_DATA_ALGO_COUNT, &count, data_perfs, workmem, max_workspace_size));
void* weight_data = w.data.u8;
if (CCV_GET_DATA_TYPE(inputs[2]->info.datatype) == CCV_QX)
{
ccv_nnc_tensor_param_t weight_params = inputs[2]->info;
const int palette_datatype = (weight_params.datatype & 0xff) << 12;
ccv_nnc_tensor_param_t depalettize_weight_params = weight_params;
depalettize_weight_params.datatype = palette_datatype;
depalettize_weight_params.reserved = 0;
const size_t data_size = ccv_nnc_tensor_data_size(depalettize_weight_params);
weight_data = ccv_nnc_stream_context_get_workspace(stream_context, data_size, CCV_TENSOR_GPU_MEMORY);
}
CUDNN_ENFORCE(cudnnFindConvolutionBackwardDataAlgorithmEx(cudnn, w.descriptor, weight_data, g.descriptor, g.data.u8, conv.descriptor, h.descriptor, h.data.u8, CCV_NNC_CMD_CUDNN_CONV_BWD_DATA_ALGO_COUNT, &count, data_perfs, workmem, max_workspace_size));
for(i = 0; i < count; i++)
if ((size_t)data_perfs[i].memory <= max_workspace_size && data_perfs[i].status == CUDNN_STATUS_SUCCESS)
{
@@ -348,7 +409,6 @@ static int _ccv_nnc_conv_back_autotune(const ccv_nnc_cmd_t cmd, const size_t max
ccv_nnc_cudnn_deinit_filter_descriptor(w);
ccv_nnc_cudnn_deinit_tensor_view_descriptor(h);
}
ccv_nnc_cudnn_deinit_tensor_view_descriptor(a);
ccv_nnc_cudnn_deinit_tensor_view_descriptor(g);
ccv_nnc_cudnn_deinit_convolution_descriptor(conv);
int filter = -1, data = -1;
@@ -409,7 +469,7 @@ REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)
{
#ifdef HAVE_CUDNN
registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
registry->tensor_datatypes = CCV_32F | CCV_16F;
registry->tensor_datatypes = CCV_32F | CCV_16F | CCV_QX;
registry->tensor_memory = CCV_TENSOR_GPU_MEMORY;
registry->algorithms = CCV_NNC_CMD_CUDNN_CONV_FWD_ALGO_COUNT;
registry->exec = _ccv_nnc_conv_forw;
@@ -421,7 +481,7 @@ REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN
{
#ifdef HAVE_CUDNN
registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
registry->tensor_datatypes = CCV_32F | CCV_16F;
registry->tensor_datatypes = CCV_32F | CCV_16F | CCV_QX;
registry->tensor_memory = CCV_TENSOR_GPU_MEMORY;
registry->algorithms = CCV_NNC_CMD_CUDNN_CONV_BWD_DATA_ALGO_COUNT * CCV_NNC_CMD_CUDNN_CONV_BWD_FILTER_ALGO_COUNT;
registry->exec = _ccv_nnc_conv_back;
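
In both the forward and backward-data paths above, a CCV_QX weight tensor triggers a single enlarged workspace request of workspace_size + data_size: cuDNN's algorithm scratch sits at the head of the buffer, the depalettized fp16/fp32 weights at the tail, and if cuDNN needs no scratch the workspace pointer reverts to NULL. The autotune hunks request only the depalettized-weight buffer (the trial workspace, workmem, is obtained elsewhere) and do not depalettize into it, presumably because the contents do not matter for timing runs. A self-contained illustration of the single-allocation layout, with host memory standing in for the GPU workspace and illustrative sizes:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        const size_t workspace_size = 4096;  /* what cudnnGet...WorkspaceSize returned */
        const size_t weight_bytes = 1024;    /* size of the depalettized weights */
        uint8_t* const block = (uint8_t*)malloc(workspace_size + weight_bytes);
        if (!block)
            return 1;
        void* const cudnn_workspace = workspace_size ? block : 0;  /* head: algorithm scratch */
        uint8_t* const weight_data = block + workspace_size;       /* tail: expanded weights */
        memset(weight_data, 0, weight_bytes);  /* stands in for the depalettize kernel */
        /* cudnnConvolutionForward(..., w.descriptor, weight_data, ...,
         *                         cudnn_workspace, workspace_size, ...); */
        (void)cudnn_workspace;
        free(block);
        return 0;
    }
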
11 changes: 7 additions & 4 deletions lib/nnc/cmd/util/gpu/ccv_nnc_util_gpu_ref.cu
@@ -19,8 +19,11 @@ static int _ccv_nnc_data_transfer(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t
assert(CCV_IS_TENSOR_CONTIGUOUS(a));
assert(CCV_IS_TENSOR_CONTIGUOUS(b));
assert(ccv_nnc_tensor_count(a->info) == ccv_nnc_tensor_count(b->info));
assert(CCV_GET_DATA_TYPE_SIZE(a->info.datatype) == CCV_GET_DATA_TYPE_SIZE(b->info.datatype));
const size_t size = (ssize_t)ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype);
if (CCV_GET_DATA_TYPE(a->info.datatype) == CCV_QX)
{ assert(a->info.datatype == b->info.datatype); }
else
{ assert(CCV_GET_DATA_TYPE_SIZE(a->info.datatype) == CCV_GET_DATA_TYPE_SIZE(b->info.datatype)); }
const size_t size = ccv_nnc_tensor_data_size_without_padding(a->info);
if (stream_context)
{
cudaStream_t stream = ccv_nnc_stream_context_get_stream(stream_context);
@@ -61,7 +64,7 @@ static int _ccv_nnc_data_transfer(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t
REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S;
registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_64S | CCV_32S | CCV_8U | CCV_QX;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY | CCV_TENSOR_GPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_data_transfer;
@@ -70,7 +73,7 @@ REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)
REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_BACKWARD, CCV_NNC_BACKEND_GPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S;
registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_64S | CCV_32S | CCV_8U | CCV_QX;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY | CCV_TENSOR_GPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_data_transfer;
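
The transfer kernel now copies CCV_QX tensors by their packed byte size instead of count × element size, and requires the two datatypes to match exactly, since an element size is meaningless for packed data. A worked comparison using the size formula from ccv_nnc_tensor_data_size_without_padding in this commit, with illustrative numbers (num_blocks is assumed to be the count rounded up to whole blocks, which the excerpt above does not show):

    #include <stdio.h>

    int main(void)
    {
        const long count = 5760;            /* illustrative element count */
        const int qbits = 6;                /* 64-entry palette */
        const int number_in_blocks = 1280;  /* elements sharing one palette */
        const int palette_dtype_size = 2;   /* fp16 palette entries */
        const long num_blocks = (count + number_in_blocks - 1) / number_in_blocks;
        const long packed = (1L << qbits) * palette_dtype_size * num_blocks
            + (count + 7) * qbits / 8;      /* mirrors the header formula */
        const long raw = (long)palette_dtype_size * count;
        printf("packed: %ld bytes, raw fp16: %ld bytes\n", packed, raw);  /* 4965 vs 11520 */
        return 0;
    }
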
4 changes: 3 additions & 1 deletion lib/nnc/gpu/ccv_nnc_compat.cu
@@ -770,8 +770,10 @@ cudaDataType_t ccv_nnc_cuda_compute_datatype(const int datatype)

#ifdef HAVE_CUDNN

cudnnDataType_t ccv_nnc_cudnn_datatype(const int datatype)
cudnnDataType_t ccv_nnc_cudnn_datatype(int datatype)
{
if (CCV_GET_DATA_TYPE(datatype) == CCV_QX)
datatype = (datatype & 0xff) << 12;
switch (datatype)
{
case CCV_8U:
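
For cuDNN descriptors, a CCV_QX datatype is first unpacked back to its underlying element type: the low byte of the packed code, shifted into the regular datatype position, recovers the palette's datatype (for example half precision), which is what cuDNN actually sees once the weights are depalettized. A small standalone decode using the bit positions that appear in this commit (the concrete type-id value below is illustrative):

    #include <stdio.h>

    int main(void)
    {
        const int underlying_id = 0x4;                   /* illustrative low-byte type id */
        const int qx = underlying_id | (6 << 8);         /* hypothetical packed CCV_QX code */
        const int palette_datatype = (qx & 0xff) << 12;  /* back to a regular CCV datatype code */
        const int qbits = (qx & 0xf00) >> 8;             /* bits per palette index */
        printf("palette datatype: 0x%x, qbits: %d\n", palette_datatype, qbits);
        return 0;
    }
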
70 changes: 69 additions & 1 deletion test/int/nnc/cudnn.tests.c
@@ -185,7 +185,7 @@ TEST_CASE("cudnn forward convolution in half precision")
cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
assert(cmd.backend >= 0);
cmd.algorithm = -1;
cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
cmd = ccv_nnc_cmd_autotune(cmd, 512 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
ccv_nnc_stream_context_wait(stream_context);
ccv_nnc_stream_context_free(stream_context);
@@ -209,6 +209,74 @@ TEST_CASE("cudnn forward convolution in half precision")
ccv_nnc_tensor_free(ga);
}

TEST_CASE("cudnn forward convolution in half precision with palettize weights")
{
GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
cmd.backend = CCV_NNC_BACKEND_CPU_REF;
assert(cmd.backend >= 0);
ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
ccv_nnc_tensor_t* wo = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
// configure the inlets.
dsfmt_t dsfmt;
dsfmt_init_gen_rand(&dsfmt, 0);
int i;
for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
for (i = 0; i < OUTPUT_DIM; i++)
bias->data.f32[i] = (float)i / OUTPUT_DIM;
ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(w), TENSOR_LIST(wo), 0);
ccv_nnc_tensor_t* a1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
ccv_nnc_tensor_t* w1o = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
ccv_nnc_tensor_t* bias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM), 0);
ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, wo, bias), TENSOR_LIST(a1, w1o, bias1), 0);
ccv_nnc_tensor_t* pw1o = ccv_nnc_tensor_new(0, ccv_nnc_tensor_palettize(CPU_TENSOR_NCHW(16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 6, 1280), 0);
(void)ccv_nnc_palettize(w1o->data.u8, CCV_16F, CCV_TENSOR_CPU_MEMORY, ccv_nnc_tensor_count(w1o->info), 6, 1280, pw1o->data.u8, ccv_nnc_tensor_data_size_without_padding(pw1o->info));
// Copy generated matrix values over to GPU.
ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, ccv_nnc_tensor_palettize(GPU_TENSOR_NCHW(000, 16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 6, 1280), 0);
ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM), 0);
ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a1, pw1o, bias1), TENSOR_LIST(ga, gwo, gbias), 0);
ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);

ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
assert(cmd.backend >= 0);
cmd.algorithm = -1;
cmd = ccv_nnc_cmd_autotune(cmd, 512 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
ccv_nnc_stream_context_wait(stream_context);
ccv_nnc_stream_context_free(stream_context);
ccv_nnc_tensor_t* c1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c1), 0);
ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c1), TENSOR_LIST(c), 0);
REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 5e-3, "output from cudnn should match from CPU");
ccv_nnc_tensor_free(c);
ccv_nnc_tensor_free(gc);
ccv_nnc_tensor_free(bias);
ccv_nnc_tensor_free(w);
ccv_nnc_tensor_free(wo);
ccv_nnc_tensor_free(b);
ccv_nnc_tensor_free(a);
ccv_nnc_tensor_free(c1);
ccv_nnc_tensor_free(bias1);
ccv_nnc_tensor_free(w1o);
ccv_nnc_tensor_free(pw1o);
ccv_nnc_tensor_free(a1);
ccv_nnc_tensor_free(gbias);
ccv_nnc_tensor_free(gwo);
ccv_nnc_tensor_free(ga);
}

TEST_CASE("cudnn backward convolution")
{
GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));
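
For the parameters the new test uses (qbits = 6, number_in_blocks = 1280, fp16 palette entries), each block of 1280 weights carries a 64-entry, 128-byte palette plus 6 bits per weight, i.e. roughly 0.75 + 128/1280 ≈ 0.85 bytes per weight versus 2 bytes for plain fp16 — about a 2.3× reduction before the 128-byte GPU padding.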
