diff --git a/lib/nnc/ccv_nnc_easy.h b/lib/nnc/ccv_nnc_easy.h
index abc714f69..de1966a95 100644
--- a/lib/nnc/ccv_nnc_easy.h
+++ b/lib/nnc/ccv_nnc_easy.h
@@ -217,7 +217,7 @@ static inline ccv_nnc_tensor_param_t ccv_nnc_tensor_palettize(const ccv_nnc_tens
 	return new_params;
 }
 
-static inline size_t ccv_nnc_tensor_data_size(const ccv_nnc_tensor_param_t params)
+static inline size_t ccv_nnc_tensor_data_size_without_padding(const ccv_nnc_tensor_param_t params)
 {
 	const ssize_t count = (ssize_t)ccv_nnc_tensor_count(params);
 	ssize_t data_size;
@@ -232,6 +232,12 @@ static inline size_t ccv_nnc_tensor_data_size(const ccv_nnc_tensor_param_t param
 		data_size = (ssize_t)(1 << qbits) * CCV_GET_DATA_TYPE_SIZE(palette_datatype) * num_blocks + (count + 7) * qbits / 8;
 	} else
 		data_size = CCV_GET_DATA_TYPE_SIZE(params.datatype) * count;
+	return data_size;
+}
+
+static inline size_t ccv_nnc_tensor_data_size(const ccv_nnc_tensor_param_t params)
+{
+	ssize_t data_size = ccv_nnc_tensor_data_size_without_padding(params);
 #ifdef HAVE_CUDA // For CUDA, we align to 128-bytes.
 	if (CCV_TENSOR_GET_MEMORY(params.type) == CCV_TENSOR_GPU_MEMORY)
 		return ((data_size + 127) & -128);
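The change above splits the old ccv_nnc_tensor_data_size() in two: ccv_nnc_tensor_data_size_without_padding() returns the exact byte count of a dense or palettized payload, while ccv_nnc_tensor_data_size() keeps the 128-byte CUDA alignment on top of it. Below is a minimal sketch of how the packed CCV_QX datatype drives that size, assuming the bit layout used throughout this patch (low 8 bits: palette datatype, bits 8-11: qbits, params.reserved: elements per palette block); the block-count rounding is an assumption here, since its real computation sits just above the hunk context shown.

#include <nnc/ccv_nnc.h>      /* assumed include paths */
#include <nnc/ccv_nnc_easy.h>

static size_t qx_payload_size_sketch(const ccv_nnc_tensor_param_t params)
{
	const ssize_t count = (ssize_t)ccv_nnc_tensor_count(params);
	const int palette_datatype = (params.datatype & 0xff) << 12; /* e.g. CCV_16F */
	const int qbits = (params.datatype & 0xf00) >> 8;            /* index bits per element, e.g. 6 */
	const int number_in_blocks = params.reserved;                /* elements covered by one palette */
	const ssize_t num_blocks = (count + number_in_blocks - 1) / number_in_blocks; /* assumed ceil division */
	/* One (1 << qbits)-entry palette per block, plus the packed indices, as in the hunk above. */
	return (size_t)((ssize_t)(1 << qbits) * CCV_GET_DATA_TYPE_SIZE(palette_datatype) * num_blocks + (count + 7) * qbits / 8);
}

For the 6-bit fp16 configuration exercised by the new test, each block stores a 64-entry half-precision palette plus 6 packed index bits per element.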
diff --git a/lib/nnc/cmd/convolution/gpu/ccv_nnc_conv_gpu_cudnn.cu b/lib/nnc/cmd/convolution/gpu/ccv_nnc_conv_gpu_cudnn.cu
index 2cdf73c4a..3e81df7d5 100644
--- a/lib/nnc/cmd/convolution/gpu/ccv_nnc_conv_gpu_cudnn.cu
+++ b/lib/nnc/cmd/convolution/gpu/ccv_nnc_conv_gpu_cudnn.cu
@@ -75,11 +75,30 @@ static int _ccv_nnc_conv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint
 	size_t workspace_size = 0;
 	CUDNN_ENFORCE(cudnnGetConvolutionForwardWorkspaceSize(cudnn, a.descriptor, w.descriptor, conv.descriptor, b.descriptor, algo, &workspace_size));
 	void* workspace = 0;
-	// TODO: If error, return OOM
-	if (workspace_size)
-		workspace = ccv_nnc_stream_context_get_workspace(stream_context, workspace_size, CCV_TENSOR_GPU_MEMORY);
+	void* weight_data = w.data.u8;
+	if (CCV_GET_DATA_TYPE(inputs[1]->info.datatype) == CCV_QX)
+	{
+		ccv_nnc_tensor_param_t weight_params = inputs[1]->info;
+		const size_t count = ccv_nnc_tensor_count(weight_params);
+		const int palette_datatype = (weight_params.datatype & 0xff) << 12;
+		const int qbits = (weight_params.datatype & 0xf00) >> 8;
+		const int number_in_blocks = weight_params.reserved;
+		ccv_nnc_tensor_param_t depalettize_weight_params = weight_params;
+		depalettize_weight_params.datatype = palette_datatype;
+		depalettize_weight_params.reserved = 0;
+		const size_t data_size = ccv_nnc_tensor_data_size(depalettize_weight_params);
+		workspace = ccv_nnc_stream_context_get_workspace(stream_context, workspace_size + data_size, CCV_TENSOR_GPU_MEMORY);
+		weight_data = (uint8_t*)workspace + workspace_size;
+		ccv_nnc_compat_depalettize(w.data.u8, palette_datatype, ccv_nnc_tensor_data_size_without_padding(weight_params), qbits, number_in_blocks, weight_data, count, stream_context);
+		if (workspace_size == 0)
+			workspace = 0;
+	} else {
+		// TODO: If error, return OOM
+		if (workspace_size)
+			workspace = ccv_nnc_stream_context_get_workspace(stream_context, workspace_size, CCV_TENSOR_GPU_MEMORY);
+	}
 	static const float one = 1, zero = 0;
-	CUDNN_ENFORCE(cudnnConvolutionForward(cudnn, &one, a.descriptor, a.data.u8, w.descriptor, w.data.u8, conv.descriptor, algo, workspace, workspace_size, &zero, b.descriptor, b.data.u8));
+	CUDNN_ENFORCE(cudnnConvolutionForward(cudnn, &one, a.descriptor, a.data.u8, w.descriptor, weight_data, conv.descriptor, algo, workspace, workspace_size, &zero, b.descriptor, b.data.u8));
 	if (input_size > 2 && inputs[2])
 	{
 		const ccv_nnc_cudnn_tensor_view_descriptor_t bias = ccv_nnc_cudnn_get_tensor_view_descriptor(stream_context, (const ccv_nnc_tensor_view_t*)inputs[2]);
@@ -108,7 +127,18 @@ static int _ccv_nnc_conv_forw_autotune(const ccv_nnc_cmd_t cmd, const size_t max
 	cudnnSetConvolutionGroupCount(conv.descriptor, cmd.info.convolution.groups);
 	int count = 0;
 	cudnnConvolutionFwdAlgoPerf_t perfs[CCV_NNC_CMD_CUDNN_CONV_FWD_ALGO_COUNT];
-	CUDNN_ENFORCE(cudnnFindConvolutionForwardAlgorithmEx(cudnn, a.descriptor, a.data.u8, w.descriptor, w.data.u8, conv.descriptor, b.descriptor, b.data.u8, CCV_NNC_CMD_CUDNN_CONV_FWD_ALGO_COUNT, &count, perfs, workmem, max_workspace_size));
+	void* weight_data = w.data.u8;
+	if (CCV_GET_DATA_TYPE(inputs[1]->info.datatype) == CCV_QX)
+	{
+		ccv_nnc_tensor_param_t weight_params = inputs[1]->info;
+		const int palette_datatype = (weight_params.datatype & 0xff) << 12;
+		ccv_nnc_tensor_param_t depalettize_weight_params = weight_params;
+		depalettize_weight_params.datatype = palette_datatype;
+		depalettize_weight_params.reserved = 0;
+		const size_t data_size = ccv_nnc_tensor_data_size(depalettize_weight_params);
+		weight_data = ccv_nnc_stream_context_get_workspace(stream_context, data_size, CCV_TENSOR_GPU_MEMORY);
+	}
+	CUDNN_ENFORCE(cudnnFindConvolutionForwardAlgorithmEx(cudnn, a.descriptor, a.data.u8, w.descriptor, weight_data, conv.descriptor, b.descriptor, b.data.u8, CCV_NNC_CMD_CUDNN_CONV_FWD_ALGO_COUNT, &count, perfs, workmem, max_workspace_size));
 	int i;
 	cudnnConvolutionFwdAlgo_t algorithm;
 	for(i = 0; i < count; i++)
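Both forward paths above handle palettized weights the same way: derive the depalettized parameters, then reserve GPU scratch for the unpacked weights. In the exec path the cudnn workspace and the weight buffer come from a single allocation, with the weights carved off the tail; the sketch below (names are illustrative, not part of the patch) shows that layout, assuming an allocator shaped like ccv_nnc_stream_context_get_workspace().

#include <stdint.h>
#include <stddef.h>

/* Stand-in for ccv_nnc_stream_context_get_workspace(); illustrative only. */
typedef void* (*gpu_scratch_alloc_f)(size_t size);

/* Returns the pointer handed to cudnn (possibly NULL) and sets *weight_data to the
 * depalettized-weight buffer carved off the tail of the same allocation:
 * [ cudnn scratch: workspace_size bytes | depalettized weights: data_size bytes ] */
static void* carve_workspace_sketch(gpu_scratch_alloc_f alloc, size_t workspace_size, size_t data_size, void** weight_data)
{
	uint8_t* const base = (uint8_t*)alloc(workspace_size + data_size);
	*weight_data = base + workspace_size;
	/* cudnn is still told workspace_size, so a zero-sized request is passed as NULL. */
	return workspace_size ? base : NULL;
}

ccv_nnc_compat_depalettize() then unpacks the CCV_QX payload into that tail on the same stream before cudnnConvolutionForward() reads it.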
@@ -289,10 +319,29 @@ static int _ccv_nnc_conv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint
 		size_t workspace_size = 0;
 		CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn, w.descriptor, g.descriptor, conv.descriptor, h.descriptor, data_algo, &workspace_size));
 		void* workspace = 0;
-		// TODO: If error, return OOM
-		if (workspace_size)
-			workspace = ccv_nnc_stream_context_get_workspace(stream_context, workspace_size, CCV_TENSOR_GPU_MEMORY);
-		CUDNN_ENFORCE(cudnnConvolutionBackwardData(cudnn, &one, w.descriptor, w.data.u8, g.descriptor, g.data.u8, conv.descriptor, data_algo, workspace, workspace_size, &zero, h.descriptor, h.data.u8));
+		void* weight_data = w.data.u8;
+		if (CCV_GET_DATA_TYPE(inputs[2]->info.datatype) == CCV_QX)
+		{
+			ccv_nnc_tensor_param_t weight_params = inputs[2]->info;
+			const size_t count = ccv_nnc_tensor_count(weight_params);
+			const int palette_datatype = (weight_params.datatype & 0xff) << 12;
+			const int qbits = (weight_params.datatype & 0xf00) >> 8;
+			const int number_in_blocks = weight_params.reserved;
+			ccv_nnc_tensor_param_t depalettize_weight_params = weight_params;
+			depalettize_weight_params.datatype = palette_datatype;
+			depalettize_weight_params.reserved = 0;
+			const size_t data_size = ccv_nnc_tensor_data_size(depalettize_weight_params);
+			workspace = ccv_nnc_stream_context_get_workspace(stream_context, workspace_size + data_size, CCV_TENSOR_GPU_MEMORY);
+			weight_data = (uint8_t*)workspace + workspace_size;
+			ccv_nnc_compat_depalettize(w.data.u8, palette_datatype, ccv_nnc_tensor_data_size_without_padding(weight_params), qbits, number_in_blocks, weight_data, count, stream_context);
+			if (workspace_size == 0)
+				workspace = 0;
+		} else {
+			// TODO: If error, return OOM
+			if (workspace_size)
+				workspace = ccv_nnc_stream_context_get_workspace(stream_context, workspace_size, CCV_TENSOR_GPU_MEMORY);
+		}
+		CUDNN_ENFORCE(cudnnConvolutionBackwardData(cudnn, &one, w.descriptor, weight_data, g.descriptor, g.data.u8, conv.descriptor, data_algo, workspace, workspace_size, &zero, h.descriptor, h.data.u8));
 		ccv_nnc_cudnn_deinit_filter_descriptor(w);
 		ccv_nnc_cudnn_deinit_tensor_view_descriptor(h);
 	}
@@ -311,7 +360,6 @@ static int _ccv_nnc_conv_back_autotune(const ccv_nnc_cmd_t cmd, const size_t max
 	if (max_workspace_size && !workmem)
 		return -1;
 	const ccv_nnc_cudnn_tensor_view_descriptor_t g = ccv_nnc_cudnn_get_tensor_view_descriptor(stream_context, (const ccv_nnc_tensor_view_t*)inputs[0]);
-	const ccv_nnc_cudnn_tensor_view_descriptor_t a = ccv_nnc_cudnn_get_tensor_view_descriptor(stream_context, (const ccv_nnc_tensor_view_t*)inputs[1]);
 	int i;
 	int count = 0;
 	const int is_w_nhwc = (output_size > 1 && outputs[1]) ? outputs[1]->info.format == CCV_TENSOR_FORMAT_NHWC : inputs[2]->info.format == CCV_TENSOR_FORMAT_NHWC;
@@ -321,6 +369,7 @@ static int _ccv_nnc_conv_back_autotune(const ccv_nnc_cmd_t cmd, const size_t max
 	cudnnConvolutionBwdFilterAlgo_t filter_algorithm = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
 	if (output_size > 1 && outputs[1])
 	{
+		const ccv_nnc_cudnn_tensor_view_descriptor_t a = ccv_nnc_cudnn_get_tensor_view_descriptor(stream_context, (const ccv_nnc_tensor_view_t*)inputs[1]);
 		const ccv_nnc_cudnn_filter_descriptor_t dw = ccv_nnc_cudnn_get_filter_descriptor(stream_context, (const ccv_nnc_tensor_t*)outputs[1]);
 		cudnnConvolutionBwdFilterAlgoPerf_t filter_perfs[CCV_NNC_CMD_CUDNN_CONV_BWD_FILTER_ALGO_COUNT];
 		CUDNN_ENFORCE(cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnn, a.descriptor, a.data.u8, g.descriptor, g.data.u8, conv.descriptor, dw.descriptor, dw.data.u8, CCV_NNC_CMD_CUDNN_CONV_BWD_FILTER_ALGO_COUNT, &count, filter_perfs, workmem, max_workspace_size));
@@ -330,6 +379,7 @@ static int _ccv_nnc_conv_back_autotune(const ccv_nnc_cmd_t cmd, const size_t max
 				filter_algorithm = filter_perfs[i].algo;
 				break;
 			}
+		ccv_nnc_cudnn_deinit_tensor_view_descriptor(a);
 		ccv_nnc_cudnn_deinit_filter_descriptor(dw);
 	}
 	cudnnConvolutionBwdDataAlgo_t data_algorithm = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
@@ -338,7 +388,18 @@ static int _ccv_nnc_conv_back_autotune(const ccv_nnc_cmd_t cmd, const size_t max
 		const ccv_nnc_cudnn_filter_descriptor_t w = ccv_nnc_cudnn_get_filter_descriptor(stream_context, (const ccv_nnc_tensor_t*)inputs[2]);
 		const ccv_nnc_cudnn_tensor_view_descriptor_t h = ccv_nnc_cudnn_get_tensor_view_descriptor(stream_context, (const ccv_nnc_tensor_view_t*)outputs[0]);
 		cudnnConvolutionBwdDataAlgoPerf_t data_perfs[CCV_NNC_CMD_CUDNN_CONV_BWD_DATA_ALGO_COUNT];
-		CUDNN_ENFORCE(cudnnFindConvolutionBackwardDataAlgorithmEx(cudnn, w.descriptor, w.data.u8, g.descriptor, g.data.u8, conv.descriptor, h.descriptor, h.data.u8, CCV_NNC_CMD_CUDNN_CONV_BWD_DATA_ALGO_COUNT, &count, data_perfs, workmem, max_workspace_size));
+		void* weight_data = w.data.u8;
+		if (CCV_GET_DATA_TYPE(inputs[2]->info.datatype) == CCV_QX)
+		{
+			ccv_nnc_tensor_param_t weight_params = inputs[2]->info;
+			const int palette_datatype = (weight_params.datatype & 0xff) << 12;
+			ccv_nnc_tensor_param_t depalettize_weight_params = weight_params;
+			depalettize_weight_params.datatype = palette_datatype;
+			depalettize_weight_params.reserved = 0;
+			const size_t data_size = ccv_nnc_tensor_data_size(depalettize_weight_params);
+			weight_data = ccv_nnc_stream_context_get_workspace(stream_context, data_size, CCV_TENSOR_GPU_MEMORY);
+		}
+		CUDNN_ENFORCE(cudnnFindConvolutionBackwardDataAlgorithmEx(cudnn, w.descriptor, weight_data, g.descriptor, g.data.u8, conv.descriptor, h.descriptor, h.data.u8, CCV_NNC_CMD_CUDNN_CONV_BWD_DATA_ALGO_COUNT, &count, data_perfs, workmem, max_workspace_size));
 		for(i = 0; i < count; i++)
 			if ((size_t)data_perfs[i].memory <= max_workspace_size && data_perfs[i].status == CUDNN_STATUS_SUCCESS)
 			{
@@ -348,7 +409,6 @@ static int _ccv_nnc_conv_back_autotune(const ccv_nnc_cmd_t cmd, const size_t max
 		ccv_nnc_cudnn_deinit_filter_descriptor(w);
 		ccv_nnc_cudnn_deinit_tensor_view_descriptor(h);
 	}
-	ccv_nnc_cudnn_deinit_tensor_view_descriptor(a);
 	ccv_nnc_cudnn_deinit_tensor_view_descriptor(g);
 	ccv_nnc_cudnn_deinit_convolution_descriptor(conv);
 	int filter = -1, data = -1;
@@ -409,7 +469,7 @@ REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN)
 {
 #ifdef HAVE_CUDNN
 	registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
-	registry->tensor_datatypes = CCV_32F | CCV_16F;
+	registry->tensor_datatypes = CCV_32F | CCV_16F | CCV_QX;
 	registry->tensor_memory = CCV_TENSOR_GPU_MEMORY;
 	registry->algorithms = CCV_NNC_CMD_CUDNN_CONV_FWD_ALGO_COUNT;
 	registry->exec = _ccv_nnc_conv_forw;
@@ -421,7 +481,7 @@ REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN
 {
 #ifdef HAVE_CUDNN
 	registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
-	registry->tensor_datatypes = CCV_32F | CCV_16F;
+	registry->tensor_datatypes = CCV_32F | CCV_16F | CCV_QX;
 	registry->tensor_memory = CCV_TENSOR_GPU_MEMORY;
 	registry->algorithms = CCV_NNC_CMD_CUDNN_CONV_BWD_DATA_ALGO_COUNT * CCV_NNC_CMD_CUDNN_CONV_BWD_FILTER_ALGO_COUNT;
 	registry->exec = _ccv_nnc_conv_back;
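With CCV_QX added to both backend registries, the CCV_QX decoding and size computation above is repeated almost verbatim in four places (forward exec, forward autotune, backward exec, backward autotune). A possible follow-up, sketched here with a hypothetical helper that is not part of this patch, would factor that bookkeeping into one spot:

#include <nnc/ccv_nnc.h>      /* assumed include paths */
#include <nnc/ccv_nnc_easy.h>

/* Hypothetical helper: everything the four call sites recompute from a CCV_QX weight param. */
typedef struct {
	ccv_nnc_tensor_param_t params; /* same shape, palette datatype, reserved cleared */
	size_t data_size;              /* padded size of the depalettized buffer */
	size_t palettized_size;        /* unpadded size of the packed source */
	int qbits;
	int number_in_blocks;
} qx_depalettize_plan_t;

static qx_depalettize_plan_t qx_depalettize_plan(const ccv_nnc_tensor_param_t weight_params)
{
	qx_depalettize_plan_t plan;
	plan.params = weight_params;
	plan.params.datatype = (weight_params.datatype & 0xff) << 12;
	plan.params.reserved = 0;
	plan.qbits = (weight_params.datatype & 0xf00) >> 8;
	plan.number_in_blocks = weight_params.reserved;
	plan.data_size = ccv_nnc_tensor_data_size(plan.params);
	plan.palettized_size = ccv_nnc_tensor_data_size_without_padding(weight_params);
	return plan;
}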
diff --git a/lib/nnc/cmd/util/gpu/ccv_nnc_util_gpu_ref.cu b/lib/nnc/cmd/util/gpu/ccv_nnc_util_gpu_ref.cu
index 690d80063..002db6684 100644
--- a/lib/nnc/cmd/util/gpu/ccv_nnc_util_gpu_ref.cu
+++ b/lib/nnc/cmd/util/gpu/ccv_nnc_util_gpu_ref.cu
@@ -19,8 +19,11 @@ static int _ccv_nnc_data_transfer(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t
 		assert(CCV_IS_TENSOR_CONTIGUOUS(a));
 		assert(CCV_IS_TENSOR_CONTIGUOUS(b));
 		assert(ccv_nnc_tensor_count(a->info) == ccv_nnc_tensor_count(b->info));
-		assert(CCV_GET_DATA_TYPE_SIZE(a->info.datatype) == CCV_GET_DATA_TYPE_SIZE(b->info.datatype));
-		const size_t size = (ssize_t)ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype);
+		if (CCV_GET_DATA_TYPE(a->info.datatype) == CCV_QX)
+			{ assert(a->info.datatype == b->info.datatype); }
+		else
+			{ assert(CCV_GET_DATA_TYPE_SIZE(a->info.datatype) == CCV_GET_DATA_TYPE_SIZE(b->info.datatype)); }
+		const size_t size = ccv_nnc_tensor_data_size_without_padding(a->info);
 		if (stream_context)
 		{
 			cudaStream_t stream = ccv_nnc_stream_context_get_stream(stream_context);
@@ -61,7 +64,7 @@ static int _ccv_nnc_data_transfer(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t
 REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 {
 	registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
-	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S;
+	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_64S | CCV_32S | CCV_8U | CCV_QX;
 	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY | CCV_TENSOR_GPU_MEMORY;
 	registry->algorithms = 1;
 	registry->exec = _ccv_nnc_data_transfer;
@@ -70,7 +73,7 @@ REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_GPU_REF)
 REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_BACKWARD, CCV_NNC_BACKEND_GPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 {
 	registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
-	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S;
+	registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_64S | CCV_32S | CCV_8U | CCV_QX;
 	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY | CCV_TENSOR_GPU_MEMORY;
 	registry->algorithms = 1;
 	registry->exec = _ccv_nnc_data_transfer;
diff --git a/lib/nnc/gpu/ccv_nnc_compat.cu b/lib/nnc/gpu/ccv_nnc_compat.cu
index ce9beaa67..ead844b4d 100644
--- a/lib/nnc/gpu/ccv_nnc_compat.cu
+++ b/lib/nnc/gpu/ccv_nnc_compat.cu
@@ -770,8 +770,10 @@ cudaDataType_t ccv_nnc_cuda_compute_datatype(const int datatype)
 
 #ifdef HAVE_CUDNN
 
-cudnnDataType_t ccv_nnc_cudnn_datatype(const int datatype)
+cudnnDataType_t ccv_nnc_cudnn_datatype(int datatype)
 {
+	if (CCV_GET_DATA_TYPE(datatype) == CCV_QX)
+		datatype = (datatype & 0xff) << 12;
 	switch (datatype)
 	{
 		case CCV_8U:
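For data transfer, the byte count of a CCV_QX tensor is no longer count times element size, which is why the copy size above switches to ccv_nnc_tensor_data_size_without_padding() and the datatype check becomes an exact match for palettized tensors. The ccv_nnc_compat.cu change follows the same convention: ccv_nnc_cudnn_datatype() maps a CCV_QX datatype to its palette datatype, so cudnn descriptors describe the depalettized buffer that is actually passed in. A small sketch of the size distinction, assuming the same headers as before; for dense datatypes the two expressions agree, so the patch can call the helper unconditionally:

#include <nnc/ccv_nnc.h>      /* assumed include paths */
#include <nnc/ccv_nnc_easy.h>

static size_t transfer_size_sketch(const ccv_nnc_tensor_param_t params)
{
	if (CCV_GET_DATA_TYPE(params.datatype) == CCV_QX)
		/* Palette tables plus packed indices; element size alone would be wrong here. */
		return ccv_nnc_tensor_data_size_without_padding(params);
	/* Dense case: identical to what ccv_nnc_tensor_data_size_without_padding() returns. */
	return (size_t)ccv_nnc_tensor_count(params) * CCV_GET_DATA_TYPE_SIZE(params.datatype);
}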
diff --git a/test/int/nnc/cudnn.tests.c b/test/int/nnc/cudnn.tests.c
index 5c6e1829f..9e26521dd 100644
--- a/test/int/nnc/cudnn.tests.c
+++ b/test/int/nnc/cudnn.tests.c
@@ -185,7 +185,7 @@ TEST_CASE("cudnn forward convolution in half precision")
 	cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
 	assert(cmd.backend >= 0);
 	cmd.algorithm = -1;
-	cmd = ccv_nnc_cmd_autotune(cmd, 1 * 1024 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
+	cmd = ccv_nnc_cmd_autotune(cmd, 512 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
 	assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
 	ccv_nnc_stream_context_wait(stream_context);
 	ccv_nnc_stream_context_free(stream_context);
@@ -209,6 +209,74 @@ TEST_CASE("cudnn forward convolution in half precision")
 	ccv_nnc_tensor_free(ga);
 }
 
+TEST_CASE("cudnn forward convolution in half precision with palettize weights")
+{
+	GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_GPU_CUDNN));
+	ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
+	ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
+	ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM);
+	cmd.backend = CCV_NNC_BACKEND_CPU_REF;
+	assert(cmd.backend >= 0);
+	ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
+	assert(ccv_nnc_hint_verify(hint, cmd.info, a->info, b->info) == 0);
+	ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM, KERNEL_SIZE, KERNEL_SIZE, INPUT_DIM), 0);
+	ccv_nnc_tensor_t* wo = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
+	ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, OUTPUT_DIM), 0);
+	// configure the inlets.
+	dsfmt_t dsfmt;
+	dsfmt_init_gen_rand(&dsfmt, 0);
+	int i;
+	for (i = 0; i < INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE * OUTPUT_DIM; i++)
+		w->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (INPUT_DIM * KERNEL_SIZE * KERNEL_SIZE);
+	for (i = 0; i < INPUT_SIZE * INPUT_SIZE * INPUT_DIM * ccv_max(1, BATCH_SIZE); i++)
+		a->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
+	for (i = 0; i < OUTPUT_DIM; i++)
+		bias->data.f32[i] = (float)i / OUTPUT_DIM;
+	ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(w), TENSOR_LIST(wo), 0);
+	ccv_nnc_tensor_t* a1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
+	ccv_nnc_tensor_t* w1o = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 0);
+	ccv_nnc_tensor_t* bias1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, OUTPUT_DIM), 0);
+	ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, wo, bias), TENSOR_LIST(a1, w1o, bias1), 0);
+	ccv_nnc_tensor_t* pw1o = ccv_nnc_tensor_new(0, ccv_nnc_tensor_palettize(CPU_TENSOR_NCHW(16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 6, 1280), 0);
+	(void)ccv_nnc_palettize(w1o->data.u8, CCV_16F, CCV_TENSOR_CPU_MEMORY, ccv_nnc_tensor_count(w1o->info), 6, 1280, pw1o->data.u8, ccv_nnc_tensor_data_size_without_padding(pw1o->info));
+	// Copy generated matrix values over to GPU.
+	ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, INPUT_DIM), 0);
+	ccv_nnc_tensor_t* gwo = ccv_nnc_tensor_new(0, ccv_nnc_tensor_palettize(GPU_TENSOR_NCHW(000, 16F, OUTPUT_DIM, INPUT_DIM, KERNEL_SIZE, KERNEL_SIZE), 6, 1280), 0);
+	ccv_nnc_tensor_t* gbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, OUTPUT_DIM), 0);
+	ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a1, pw1o, bias1), TENSOR_LIST(ga, gwo, gbias), 0);
+	ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
+	ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
+
+	ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
+	cmd.backend = CCV_NNC_BACKEND_GPU_CUDNN;
+	assert(cmd.backend >= 0);
+	cmd.algorithm = -1;
+	cmd = ccv_nnc_cmd_autotune(cmd, 512 * 1024 * 1024, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context);
+	assert(CCV_NNC_EXEC_SUCCESS == ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(ga, gwo, gbias), TENSOR_LIST(gc), stream_context));
+	ccv_nnc_stream_context_wait(stream_context);
+	ccv_nnc_stream_context_free(stream_context);
+	ccv_nnc_tensor_t* c1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
+	ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c1), 0);
+	ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, BATCH_SIZE, OUTPUT_SIZE, OUTPUT_SIZE, OUTPUT_DIM), 0);
+	ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c1), TENSOR_LIST(c), 0);
+	REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, b->data.f32, c->data.f32, BATCH_SIZE * OUTPUT_DIM * OUTPUT_SIZE * OUTPUT_SIZE, 5e-3, "output from cudnn should match from CPU");
+	ccv_nnc_tensor_free(c);
+	ccv_nnc_tensor_free(gc);
+	ccv_nnc_tensor_free(bias);
+	ccv_nnc_tensor_free(w);
+	ccv_nnc_tensor_free(wo);
+	ccv_nnc_tensor_free(b);
+	ccv_nnc_tensor_free(a);
+	ccv_nnc_tensor_free(c1);
+	ccv_nnc_tensor_free(bias1);
+	ccv_nnc_tensor_free(w1o);
+	ccv_nnc_tensor_free(pw1o);
+	ccv_nnc_tensor_free(a1);
+	ccv_nnc_tensor_free(gbias);
+	ccv_nnc_tensor_free(gwo);
+	ccv_nnc_tensor_free(ga);
+}
+
 TEST_CASE("cudnn backward convolution")
 {
 	GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_GPU_CUDNN));