diff --git a/nntrainer/layers/cl_layers/concat_cl.cpp b/nntrainer/layers/cl_layers/concat_cl.cpp index 761cd5a02..ba1302966 100644 --- a/nntrainer/layers/cl_layers/concat_cl.cpp +++ b/nntrainer/layers/cl_layers/concat_cl.cpp @@ -536,35 +536,36 @@ void ConcatLayerCl::concat_cl_axis3(const float *matAdata, } while (false); } -void ConcatLayerCl::concat_cl_axis3_fp16( - const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata, - unsigned int input1_batch_size, unsigned int input1_channels, - unsigned int input1_height, unsigned int input1_width, - unsigned int input2_width) { +void ConcatLayerCl::concat_cl_axis2(const float *matAdata, + const float *vecXdata, float *vecYdata, + unsigned int input1_batch_size, + unsigned int input1_channels, + unsigned int input1_width, + unsigned int input1_height, + unsigned int input2_height) { bool result = false; do { - const auto &kernel_concat_ptr = - layer_kernel_ptrs[Kernels::CONCAT_CL_AXIS3_FP16]; + const auto &kernel_concat_ptr = layer_kernel_ptrs[Kernels::CONCAT_CL_AXIS2]; - int dim = int(input1_batch_size * input1_channels * input1_height * - (input1_width + input2_width)); + int dim = int(input1_batch_size * input1_channels * input1_width * + (input1_height + input2_height)); opencl::Buffer inputA(cl_context_ref.context_inst_, - sizeof(__fp16) * input1_batch_size * input1_channels * + sizeof(float) * input1_batch_size * input1_channels * input1_height * input1_width, true, nullptr); opencl::Buffer inputX(cl_context_ref.context_inst_, - sizeof(__fp16) * input1_batch_size * input1_channels * - input1_height * input2_width, + sizeof(float) * input1_batch_size * input1_channels * + input2_height * input1_width, true, nullptr); opencl::Buffer inOutY(cl_context_ref.context_inst_, - sizeof(__fp16) * input1_batch_size * input1_channels * - input1_height * (input1_width + input2_width), + sizeof(float) * input1_batch_size * input1_channels * + (input1_height + input2_height) * input1_width, true, nullptr); result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); @@ -616,13 +617,13 @@ void ConcatLayerCl::concat_cl_axis3_fp16( } result = - kernel_concat_ptr->SetKernelArguments(6, &input1_width, sizeof(int)); + kernel_concat_ptr->SetKernelArguments(6, &input2_height, sizeof(int)); if (!result) { break; } result = - kernel_concat_ptr->SetKernelArguments(7, &input2_width, sizeof(int)); + kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int)); if (!result) { break; } @@ -644,22 +645,21 @@ void ConcatLayerCl::concat_cl_axis3_fp16( } while (false); } -void ConcatLayerCl::concat_cl_axis2(const float *matAdata, +void ConcatLayerCl::concat_cl_axis1(const float *matAdata, const float *vecXdata, float *vecYdata, unsigned int input1_batch_size, - unsigned int input1_channels, - unsigned int input1_width, unsigned int input1_height, - unsigned int input2_height) { + unsigned int input1_width, + unsigned int input1_channels, + unsigned int input2_channels) { bool result = false; do { + const auto &kernel_concat_ptr = layer_kernel_ptrs[Kernels::CONCAT_CL_AXIS1]; - const auto &kernel_concat_ptr = layer_kernel_ptrs[Kernels::CONCAT_CL_AXIS2]; - - int dim = int(input1_batch_size * input1_channels * input1_width * - (input1_height + input2_height)); + int dim = int(input1_batch_size * input1_width * input1_height * + (input1_channels + input2_channels)); opencl::Buffer inputA(cl_context_ref.context_inst_, sizeof(float) * input1_batch_size * input1_channels * @@ -667,13 +667,13 @@ void ConcatLayerCl::concat_cl_axis2(const float *matAdata, true, nullptr); opencl::Buffer inputX(cl_context_ref.context_inst_, - sizeof(float) * input1_batch_size * input1_channels * - input2_height * input1_width, + sizeof(float) * input1_batch_size * input2_channels * + input1_height * input1_width, true, nullptr); opencl::Buffer inOutY(cl_context_ref.context_inst_, - sizeof(float) * input1_batch_size * input1_channels * - (input1_height + input2_height) * input1_width, + sizeof(float) * input1_batch_size * input1_width * + input1_height * (input1_channels + input2_channels), true, nullptr); result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); @@ -719,13 +719,13 @@ void ConcatLayerCl::concat_cl_axis2(const float *matAdata, } result = - kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int)); + kernel_concat_ptr->SetKernelArguments(5, &input2_channels, sizeof(int)); if (!result) { break; } result = - kernel_concat_ptr->SetKernelArguments(6, &input2_height, sizeof(int)); + kernel_concat_ptr->SetKernelArguments(6, &input1_height, sizeof(int)); if (!result) { break; } @@ -753,34 +753,38 @@ void ConcatLayerCl::concat_cl_axis2(const float *matAdata, } while (false); } -void ConcatLayerCl::concat_cl_axis2_fp16( - const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata, - unsigned int input1_batch_size, unsigned int input1_channels, - unsigned int input1_width, unsigned int input1_height, - unsigned int input2_height) { +#ifdef ENABLE_FP16 +void ConcatLayerCl::concat_cl_axis3_fp16(const _FP16 *matAdata, + const _FP16 *vecXdata, _FP16 *vecYdata, + unsigned int input1_batch_size, + unsigned int input1_channels, + unsigned int input1_height, + unsigned int input1_width, + unsigned int input2_width) { bool result = false; do { + const auto &kernel_concat_ptr = - layer_kernel_ptrs[Kernels::CONCAT_CL_AXIS2_FP16]; + layer_kernel_ptrs[Kernels::CONCAT_CL_AXIS3_FP16]; - int dim = int(input1_batch_size * input1_channels * input1_width * - (input1_height + input2_height)); + int dim = int(input1_batch_size * input1_channels * input1_height * + (input1_width + input2_width)); opencl::Buffer inputA(cl_context_ref.context_inst_, - sizeof(__fp16) * input1_batch_size * input1_channels * + sizeof(_FP16) * input1_batch_size * input1_channels * input1_height * input1_width, true, nullptr); opencl::Buffer inputX(cl_context_ref.context_inst_, - sizeof(__fp16) * input1_batch_size * input1_channels * - input2_height * input1_width, + sizeof(_FP16) * input1_batch_size * input1_channels * + input1_height * input2_width, true, nullptr); opencl::Buffer inOutY(cl_context_ref.context_inst_, - sizeof(__fp16) * input1_batch_size * input1_channels * - (input1_height + input2_height) * input1_width, + sizeof(_FP16) * input1_batch_size * input1_channels * + input1_height * (input1_width + input2_width), true, nullptr); result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); @@ -832,13 +836,13 @@ void ConcatLayerCl::concat_cl_axis2_fp16( } result = - kernel_concat_ptr->SetKernelArguments(6, &input2_height, sizeof(int)); + kernel_concat_ptr->SetKernelArguments(6, &input1_width, sizeof(int)); if (!result) { break; } result = - kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int)); + kernel_concat_ptr->SetKernelArguments(7, &input2_width, sizeof(int)); if (!result) { break; } @@ -860,35 +864,36 @@ void ConcatLayerCl::concat_cl_axis2_fp16( } while (false); } -void ConcatLayerCl::concat_cl_axis1(const float *matAdata, - const float *vecXdata, float *vecYdata, - unsigned int input1_batch_size, - unsigned int input1_height, - unsigned int input1_width, - unsigned int input1_channels, - unsigned int input2_channels) { +void ConcatLayerCl::concat_cl_axis2_fp16(const _FP16 *matAdata, + const _FP16 *vecXdata, _FP16 *vecYdata, + unsigned int input1_batch_size, + unsigned int input1_channels, + unsigned int input1_width, + unsigned int input1_height, + unsigned int input2_height) { bool result = false; do { - const auto &kernel_concat_ptr = layer_kernel_ptrs[Kernels::CONCAT_CL_AXIS1]; + const auto &kernel_concat_ptr = + layer_kernel_ptrs[Kernels::CONCAT_CL_AXIS2_FP16]; - int dim = int(input1_batch_size * input1_width * input1_height * - (input1_channels + input2_channels)); + int dim = int(input1_batch_size * input1_channels * input1_width * + (input1_height + input2_height)); opencl::Buffer inputA(cl_context_ref.context_inst_, - sizeof(float) * input1_batch_size * input1_channels * + sizeof(_FP16) * input1_batch_size * input1_channels * input1_height * input1_width, true, nullptr); opencl::Buffer inputX(cl_context_ref.context_inst_, - sizeof(float) * input1_batch_size * input2_channels * - input1_height * input1_width, + sizeof(_FP16) * input1_batch_size * input1_channels * + input2_height * input1_width, true, nullptr); opencl::Buffer inOutY(cl_context_ref.context_inst_, - sizeof(float) * input1_batch_size * input1_width * - input1_height * (input1_channels + input2_channels), + sizeof(_FP16) * input1_batch_size * input1_channels * + (input1_height + input2_height) * input1_width, true, nullptr); result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); @@ -934,13 +939,13 @@ void ConcatLayerCl::concat_cl_axis1(const float *matAdata, } result = - kernel_concat_ptr->SetKernelArguments(5, &input2_channels, sizeof(int)); + kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int)); if (!result) { break; } result = - kernel_concat_ptr->SetKernelArguments(6, &input1_height, sizeof(int)); + kernel_concat_ptr->SetKernelArguments(6, &input2_height, sizeof(int)); if (!result) { break; } @@ -968,11 +973,13 @@ void ConcatLayerCl::concat_cl_axis1(const float *matAdata, } while (false); } -void ConcatLayerCl::concat_cl_axis1_fp16( - const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata, - unsigned int input1_batch_size, unsigned int input1_height, - unsigned int input1_width, unsigned int input1_channels, - unsigned int input2_channels) { +void ConcatLayerCl::concat_cl_axis1_fp16(const _FP16 *matAdata, + const _FP16 *vecXdata, _FP16 *vecYdata, + unsigned int input1_batch_size, + unsigned int input1_height, + unsigned int input1_width, + unsigned int input1_channels, + unsigned int input2_channels) { bool result = false; @@ -985,17 +992,17 @@ void ConcatLayerCl::concat_cl_axis1_fp16( (input1_channels + input2_channels)); opencl::Buffer inputA(cl_context_ref.context_inst_, - sizeof(__fp16) * input1_batch_size * input1_channels * + sizeof(_FP16) * input1_batch_size * input1_channels * input1_height * input1_width, true, nullptr); opencl::Buffer inputX(cl_context_ref.context_inst_, - sizeof(__fp16) * input1_batch_size * input2_channels * + sizeof(_FP16) * input1_batch_size * input2_channels * input1_height * input1_width, true, nullptr); opencl::Buffer inOutY(cl_context_ref.context_inst_, - sizeof(__fp16) * input1_batch_size * input1_width * + sizeof(_FP16) * input1_batch_size * input1_width * input1_height * (input1_channels + input2_channels), true, nullptr); @@ -1075,6 +1082,7 @@ void ConcatLayerCl::concat_cl_axis1_fp16( } while (false); } +#endif void ConcatLayerCl::calcDerivative(RunLayerContext &context) { // /**