From d69ee172b81b05dc61a1da423d8a26449f1530e0 Mon Sep 17 00:00:00 2001 From: Liu Liu Date: Tue, 29 Oct 2024 00:44:00 -0400 Subject: [PATCH] Support backward of cmul. --- lib/nnc/cmd/blas/ccv_nnc_blas.c | 2 +- lib/nnc/cmd/blas/mps/ccv_nnc_cmul_mps.m | 399 ++++++- lib/nnc/cmd/ccv_nnc_cmd.inc | 1386 ++++++++++++----------- lib/nnc/mfa/ccv_nnc_mfa_cmul.cpp | 1 + lib/nnc/mfa/ccv_nnc_mfa_cmul.hpp | 1 + lib/nnc/mfa/v2/CMulDescriptor.cpp | 4 +- lib/nnc/mfa/v2/CMulDescriptor.hpp | 9 +- lib/nnc/mfa/v2/CMulKernel.cpp | 132 ++- lib/nnc/mfa/v2/CMulKernel.hpp | 6 +- test/int/nnc/cublas.tests.c | 4 +- 10 files changed, 1231 insertions(+), 713 deletions(-) diff --git a/lib/nnc/cmd/blas/ccv_nnc_blas.c b/lib/nnc/cmd/blas/ccv_nnc_blas.c index 8ea3a01ea..64253405e 100644 --- a/lib/nnc/cmd/blas/ccv_nnc_blas.c +++ b/lib/nnc/cmd/blas/ccv_nnc_blas.c @@ -265,7 +265,7 @@ REGISTER_COMMAND(CCV_NNC_CMUL_FORWARD)(ccv_nnc_cmd_registry_t* const registry) } REGISTER_COMMAND(CCV_NNC_CMUL_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) - FIND_BACKEND(ccv_nnc_cmul_cpu_ref.c, gpu/ccv_nnc_cmul_gpu_ref.cu) + FIND_BACKEND(ccv_nnc_cmul_cpu_ref.c, gpu/ccv_nnc_cmul_gpu_ref.cu, mps/ccv_nnc_cmul_mps.m) { registry->flags = CCV_NNC_CMD_ATTR_NULL_IS_ONES; registry->bitmask = _ccv_nnc_cmul_back_bitmask; diff --git a/lib/nnc/cmd/blas/mps/ccv_nnc_cmul_mps.m b/lib/nnc/cmd/blas/mps/ccv_nnc_cmul_mps.m index 70cf0ec6c..ad6aa83ef 100644 --- a/lib/nnc/cmd/blas/mps/ccv_nnc_cmul_mps.m +++ b/lib/nnc/cmd/blas/mps/ccv_nnc_cmul_mps.m @@ -15,7 +15,6 @@ static int _ccv_nnc_cmul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint assert(output_size == 1); ccv_nnc_tensor_t* const c = outputs[0]; assert(CCV_IS_TENSOR_CONTIGUOUS(c)); - const size_t count = ccv_nnc_tensor_count(c->info) / 2; @autoreleasepool { bool use_mfa = true; const char *fallback_reason = NULL; @@ -64,6 +63,7 @@ static int _ccv_nnc_cmul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint } if (use_mfa) { ccv_nnc_mfa_cmul_params_t params = { + .conjugate = 0, .data_type = mtl_data_type, .astride = {0, 0, 0}, .bstride = {0, 0, 0}, @@ -159,8 +159,8 @@ static int _ccv_nnc_cmul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint ccv_nnc_mps_graph_key_t key = ccv_nnc_mps_graph_key_new(cmd, 0, hint, flags, inputs, input_size, outputs, output_size); int indices[2]; int nd = ccv_nnc_tensor_nd(a->info.dim); - assert(nd = ccv_nnc_tensor_nd(b->info.dim)); - assert(nd = ccv_nnc_tensor_nd(c->info.dim)); + assert(nd == ccv_nnc_tensor_nd(b->info.dim)); + assert(nd == ccv_nnc_tensor_nd(c->info.dim)); MPSGraphExecutable* executable = ccv_nnc_mps_graph_executable_cache(key, indices, ^void (MPSGraph* graph, NSMutableArray* inputTensors, NSMutableArray* inputShapedTypes, NSMutableArray* resultTensors) { MPSGraphTensor* mps_input_a; MPSGraphTensor* mps_a = ccv_nnc_mps_graph_tensor_input(graph, a, a->info.dim, a->stride, &mps_input_a); @@ -209,6 +209,390 @@ static int _ccv_nnc_cmul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint return CCV_NNC_EXEC_SUCCESS; } +static int _ccv_nnc_cmul_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) +{ + int gdim[CCV_NNC_MAX_DIM_ALLOC]; + int no_broadcasting = 1; + if (outputs[0]) + { + assert(input_size >= 3 && inputs[2]); + ccv_nnc_tensor_view_get_dim((ccv_nnc_tensor_view_t*)outputs[0], gdim); + ccv_nnc_tensor_view_get_broadcast_dim((ccv_nnc_tensor_view_t*)inputs[2], gdim); + no_broadcasting = no_broadcasting && (ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)outputs[0], gdim) && ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)inputs[2], gdim)); + } + if (no_broadcasting && output_size > 1 && outputs[1]) + { + assert(inputs[1]); + ccv_nnc_tensor_view_get_dim((ccv_nnc_tensor_view_t*)inputs[1], gdim); + ccv_nnc_tensor_view_get_broadcast_dim((ccv_nnc_tensor_view_t*)outputs[1], gdim); + no_broadcasting = no_broadcasting && (ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)inputs[1], gdim) && ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)outputs[1], gdim)); + } + if (!no_broadcasting) + return CCV_NNC_EXEC_INVALID; + // We only support no broadcast syntax due to atomic / aggregation required for broadcast syntax. + const ccv_nnc_tensor_t* const g = inputs[0]; + if (!g) + return CCV_NNC_EXEC_INVALID; + assert(!g || CCV_IS_TENSOR_CONTIGUOUS(g)); + const ccv_nnc_tensor_t* const a = input_size >= 2 ? inputs[1] : 0; + assert(!a || CCV_IS_TENSOR_CONTIGUOUS(a)); + ccv_nnc_tensor_t* const b = input_size >= 3 ? inputs[2] : 0; + assert(!b || CCV_IS_TENSOR_CONTIGUOUS(b)); + ccv_nnc_tensor_t* const c = outputs[0]; + assert(!c || CCV_IS_TENSOR_CONTIGUOUS(c)); + ccv_nnc_tensor_t* const d = output_size >= 2 ? outputs[1] : 0; + assert(!d || CCV_IS_TENSOR_CONTIGUOUS(d)); + @autoreleasepool { + bool use_mfa = true; + const char *fallback_reason = NULL; + ccv_nnc_mfa_context_t* context = ccv_nnc_default_mfa_context(); + + if (!ccv_nnc_mfa_context_supported(context) || (ccv_nnc_flags() & CCV_NNC_DISABLE_METAL_FLASH_ATTENTION)) { + use_mfa = false; + fallback_reason = "Disabled."; + } + + uint32_t mtl_data_type = UINT32_MAX; + if (use_mfa) { + const int is_same_dtype = + (!g || !a || g->info.datatype == a->info.datatype) && + (!a || !b || a->info.datatype == b->info.datatype) && + (!g || !b || g->info.datatype == b->info.datatype) && + (!a || !d || a->info.datatype == d->info.datatype) && + (!b || !c || b->info.datatype == c->info.datatype); + if (!is_same_dtype) { + use_mfa = false; + fallback_reason = "Mixed precision."; + } + + int datatype = 0; + if (a) + datatype = a->info.datatype; + else if (b) + datatype = b->info.datatype; + switch (datatype) { + case CCV_16F: { + mtl_data_type = 16; + break; + } + case CCV_32F: { + mtl_data_type = 3; + break; + } + default: { + use_mfa = false; + fallback_reason = "Unsupported data type."; + break; + } + } + } + + if (use_mfa) { + if ((a && !CCV_IS_TENSOR_CONTIGUOUS(a)) || + (b && !CCV_IS_TENSOR_CONTIGUOUS(b)) || + (c && !CCV_IS_TENSOR_CONTIGUOUS(c)) || + (d && !CCV_IS_TENSOR_CONTIGUOUS(d)) || + (g && !CCV_IS_TENSOR_CONTIGUOUS(g))) + { + use_mfa = false; + fallback_reason = "Strided."; + } + } + if (use_mfa) { + ccv_nnc_mfa_cmul_params_t params = { + .conjugate = 1, + .data_type = mtl_data_type, + .astride = {0, 0, 0}, + .bstride = {0, 0, 0}, + .cstride = {0, 0, 0}, + .dim = {0, 0, 0, 0} + }; + mtl_command_batch_t* command_batch = ccv_nnc_stream_context_start_command_batch(stream_context); + if (g) + { + if (b && c) + { + const size_t count = ccv_nnc_tensor_count(c->info); + if (ccv_nnc_tensor_count(g->info) == count && ccv_nnc_tensor_count(b->info) == count) { + params.dim[0] = count; + } else { + int i; + int nd = ccv_nnc_tensor_nd(g->info.dim); + assert(nd = ccv_nnc_tensor_nd(b->info.dim)); + assert(nd = ccv_nnc_tensor_nd(c->info.dim)); + int adim[CCV_NNC_MAX_DIM_ALLOC]; + int bdim[CCV_NNC_MAX_DIM_ALLOC]; + int cdim[CCV_NNC_MAX_DIM_ALLOC]; + int squeezed_dims = 0; + for (i = nd - 1; i >= 0; i--) + { + if (c->info.dim[i] == 1) + continue; + adim[squeezed_dims] = g->info.dim[i]; + bdim[squeezed_dims] = b->info.dim[i]; + cdim[squeezed_dims] = c->info.dim[i]; + squeezed_dims += 1; + } + nd = squeezed_dims; + int astride[CCV_NNC_MAX_DIM_ALLOC]; + int bstride[CCV_NNC_MAX_DIM_ALLOC]; + int cstride[CCV_NNC_MAX_DIM_ALLOC]; + astride[0] = 1; + bstride[0] = 1; + cstride[0] = 1; + for (i = 1; i < nd; i++) + { + astride[i] = adim[i - 1] * astride[i - 1]; + bstride[i] = bdim[i - 1] * bstride[i - 1]; + cstride[i] = cdim[i - 1] * cstride[i - 1]; + } + for (i = 0; i < nd; i++) + { + if (cdim[i] == adim[i] && cdim[i] == bdim[i]) + continue; + if (cdim[i] == adim[i]) + { + assert(bdim[i] == 1); + bstride[i] = 0; + } else { + assert(cdim[i] == bdim[i]); + assert(adim[i] == 1); + astride[i] = 0; + } + } + assert(nd <= 4); + params.dim[0] = cdim[0]; + params.dim[1] = cdim[1]; + params.dim[2] = cdim[2]; + params.dim[3] = cdim[3]; + for (i = nd; i < 4; i++) + params.dim[i] = 0; + params.astride[0] = astride[1]; + params.astride[1] = astride[2]; + params.astride[2] = astride[3]; + params.bstride[0] = bstride[1]; + params.bstride[1] = bstride[2]; + params.bstride[2] = bstride[3]; + params.cstride[0] = cstride[1]; + params.cstride[1] = cstride[2]; + params.cstride[2] = cstride[3]; + } + ccv_nnc_mfa_prepare_cmul(context, params); + + mtl_buffer_t* tensors[4] = { + mpgetbuffer(g), // gradient + mpgetbuffer(b), // source + mpgetbuffer(c), // destination + NULL, + }; + size_t tensor_offsets[3] = { + g->dataof, + b->dataof, + c->dataof + }; + ccv_nnc_mfa_encode_cmul(context, params, command_batch, tensors, tensor_offsets); + } + if (a && d) + { + const size_t count = ccv_nnc_tensor_count(d->info); + if (ccv_nnc_tensor_count(g->info) == count && ccv_nnc_tensor_count(a->info) == count) { + params.dim[0] = count; + } else { + int i; + int nd = ccv_nnc_tensor_nd(g->info.dim); + assert(nd = ccv_nnc_tensor_nd(a->info.dim)); + assert(nd = ccv_nnc_tensor_nd(d->info.dim)); + int adim[CCV_NNC_MAX_DIM_ALLOC]; + int bdim[CCV_NNC_MAX_DIM_ALLOC]; + int cdim[CCV_NNC_MAX_DIM_ALLOC]; + int squeezed_dims = 0; + for (i = nd - 1; i >= 0; i--) + { + if (c->info.dim[i] == 1) + continue; + adim[squeezed_dims] = g->info.dim[i]; + bdim[squeezed_dims] = a->info.dim[i]; + cdim[squeezed_dims] = d->info.dim[i]; + squeezed_dims += 1; + } + nd = squeezed_dims; + int astride[CCV_NNC_MAX_DIM_ALLOC]; + int bstride[CCV_NNC_MAX_DIM_ALLOC]; + int cstride[CCV_NNC_MAX_DIM_ALLOC]; + astride[0] = 1; + bstride[0] = 1; + cstride[0] = 1; + for (i = 1; i < nd; i++) + { + astride[i] = adim[i - 1] * astride[i - 1]; + bstride[i] = bdim[i - 1] * bstride[i - 1]; + cstride[i] = cdim[i - 1] * cstride[i - 1]; + } + for (i = 0; i < nd; i++) + { + if (cdim[i] == adim[i] && cdim[i] == bdim[i]) + continue; + if (cdim[i] == adim[i]) + { + assert(bdim[i] == 1); + bstride[i] = 0; + } else { + assert(cdim[i] == bdim[i]); + assert(adim[i] == 1); + astride[i] = 0; + } + } + assert(nd <= 4); + params.dim[0] = cdim[0]; + params.dim[1] = cdim[1]; + params.dim[2] = cdim[2]; + params.dim[3] = cdim[3]; + for (i = nd; i < 4; i++) + params.dim[i] = 0; + params.astride[0] = astride[1]; + params.astride[1] = astride[2]; + params.astride[2] = astride[3]; + params.bstride[0] = bstride[1]; + params.bstride[1] = bstride[2]; + params.bstride[2] = bstride[3]; + params.cstride[0] = cstride[1]; + params.cstride[1] = cstride[2]; + params.cstride[2] = cstride[3]; + } + ccv_nnc_mfa_prepare_cmul(context, params); + + mtl_buffer_t* tensors[4] = { + mpgetbuffer(g), // gradient + mpgetbuffer(a), // source + mpgetbuffer(d), // destination + NULL, + }; + size_t tensor_offsets[3] = { + g->dataof, + a->dataof, + d->dataof + }; + ccv_nnc_mfa_encode_cmul(context, params, command_batch, tensors, tensor_offsets); + } + } + ccv_nnc_stream_context_finish_command_batch(stream_context, command_batch); + } else { + MPSCommandBuffer* command_buffer = ccv_nnc_stream_context_start_mps_command_buffer(stream_context); + if (g) + { + if (b && c) + { + const ccv_nnc_tensor_view_t* const g = (const ccv_nnc_tensor_view_t*)inputs[0]; + const ccv_nnc_tensor_view_t* const b = (const ccv_nnc_tensor_view_t*)inputs[2]; + ccv_nnc_tensor_view_t* const c = (ccv_nnc_tensor_view_t*)outputs[0]; + ccv_nnc_mps_graph_key_t key = ccv_nnc_mps_graph_key_new(cmd, 0, hint, flags, inputs, input_size, outputs, output_size); + int indices[2]; + int nd = ccv_nnc_tensor_nd(g->info.dim); + assert(nd == ccv_nnc_tensor_nd(b->info.dim)); + assert(nd == ccv_nnc_tensor_nd(c->info.dim)); + MPSGraphExecutable* executable = ccv_nnc_mps_graph_executable_cache(key, indices, ^void (MPSGraph* graph, NSMutableArray* inputTensors, NSMutableArray* inputShapedTypes, NSMutableArray* resultTensors) { + MPSGraphTensor* mps_input_a; + MPSGraphTensor* mps_a = ccv_nnc_mps_graph_tensor_input(graph, g, g->info.dim, g->stride, &mps_input_a); + [inputTensors addObject:mps_input_a]; + MPSGraphShapedType* mps_a_shape = ccv_nnc_mps_graph_tensor_input_shape(g, g->info.dim, g->stride); + [inputShapedTypes addObject:mps_a_shape]; + MPSGraphTensor* mps_input_b; + MPSGraphTensor* mps_b = ccv_nnc_mps_graph_tensor_input(graph, b, b->info.dim, b->stride, &mps_input_b); + [inputTensors addObject:mps_input_b]; + MPSGraphShapedType* mps_b_shape = ccv_nnc_mps_graph_tensor_input_shape(b, b->info.dim, b->stride); + [inputShapedTypes addObject:mps_b_shape]; + int i; + // Reshape to [..., n / 2, 2] + NSMutableArray* a_shape = [NSMutableArray new]; + for (i = 0; i < nd - 1; i++) + [a_shape addObject:@(g->info.dim[i])]; + [a_shape addObject: @(g->info.dim[nd - 1] / 2)]; + [a_shape addObject: @2]; + mps_a = [graph reshapeTensor:mps_a withShape:a_shape name:nil]; + [a_shape release]; + NSArray* mps_a_splits = [graph splitTensor:mps_a numSplits:2 axis:nd name:nil]; + NSMutableArray* b_shape = [NSMutableArray new]; + for (i = 0; i < nd - 1; i++) + [b_shape addObject:@(b->info.dim[i])]; + [b_shape addObject: @(b->info.dim[nd - 1] / 2)]; + [b_shape addObject: @2]; + mps_b = [graph reshapeTensor:mps_b withShape:b_shape name:nil]; + [b_shape release]; + NSArray* mps_b_splits = [graph splitTensor:mps_b numSplits:2 axis:nd name:nil]; + MPSGraphTensor* mps_c_0 = [graph additionWithPrimaryTensor:[graph multiplicationWithPrimaryTensor:mps_a_splits[0] secondaryTensor:mps_b_splits[0] name:nil] secondaryTensor:[graph multiplicationWithPrimaryTensor:mps_a_splits[1] secondaryTensor:mps_b_splits[1] name:nil] name:nil]; + MPSGraphTensor* mps_c_1 = [graph subtractionWithPrimaryTensor:[graph multiplicationWithPrimaryTensor:mps_a_splits[1] secondaryTensor:mps_b_splits[0] name:nil] secondaryTensor:[graph multiplicationWithPrimaryTensor:mps_a_splits[0] secondaryTensor:mps_b_splits[1] name:nil] name:nil]; + NSMutableArray* c_shape = [NSMutableArray new]; + for (i = 0; i < nd; i++) + [c_shape addObject:@(c->info.dim[i])]; + MPSGraphTensor* mps_c = [graph reshapeTensor:[graph concatTensor:mps_c_0 withTensor:mps_c_1 dimension:nd name:nil] withShape:c_shape name:nil]; + [resultTensors addObject:mps_c]; + [c_shape release]; + }); + MPSGraphTensorData* data_a = ccv_nnc_mps_graph_tensor_data(g, g->info.dim, g->stride); + MPSGraphTensorData* data_b = ccv_nnc_mps_graph_tensor_data(b, b->info.dim, b->stride); + MPSGraphTensorData* data[] = {data_a, data_b}; + ccv_nnc_mps_graph_executable_result(executable, command_buffer, @[data[indices[0]], data[indices[1]]], &c, (int*[]){ c->info.dim }, (int*[]){ c->stride }, 1, 0); + } + if (a && d) + { + const ccv_nnc_tensor_view_t* const g = (const ccv_nnc_tensor_view_t*)inputs[0]; + const ccv_nnc_tensor_view_t* const a = (const ccv_nnc_tensor_view_t*)inputs[1]; + ccv_nnc_tensor_view_t* const d = (ccv_nnc_tensor_view_t*)outputs[1]; + ccv_nnc_mps_graph_key_t key = ccv_nnc_mps_graph_key_new(cmd, 1, hint, flags, inputs, input_size, outputs, output_size); + int indices[2]; + int nd = ccv_nnc_tensor_nd(g->info.dim); + assert(nd == ccv_nnc_tensor_nd(a->info.dim)); + assert(nd == ccv_nnc_tensor_nd(d->info.dim)); + MPSGraphExecutable* executable = ccv_nnc_mps_graph_executable_cache(key, indices, ^void (MPSGraph* graph, NSMutableArray* inputTensors, NSMutableArray* inputShapedTypes, NSMutableArray* resultTensors) { + MPSGraphTensor* mps_input_a; + MPSGraphTensor* mps_a = ccv_nnc_mps_graph_tensor_input(graph, g, g->info.dim, g->stride, &mps_input_a); + [inputTensors addObject:mps_input_a]; + MPSGraphShapedType* mps_a_shape = ccv_nnc_mps_graph_tensor_input_shape(g, g->info.dim, g->stride); + [inputShapedTypes addObject:mps_a_shape]; + MPSGraphTensor* mps_input_b; + MPSGraphTensor* mps_b = ccv_nnc_mps_graph_tensor_input(graph, a, a->info.dim, a->stride, &mps_input_b); + [inputTensors addObject:mps_input_b]; + MPSGraphShapedType* mps_b_shape = ccv_nnc_mps_graph_tensor_input_shape(a, a->info.dim, a->stride); + [inputShapedTypes addObject:mps_b_shape]; + int i; + // Reshape to [..., n / 2, 2] + NSMutableArray* a_shape = [NSMutableArray new]; + for (i = 0; i < nd - 1; i++) + [a_shape addObject:@(g->info.dim[i])]; + [a_shape addObject: @(g->info.dim[nd - 1] / 2)]; + [a_shape addObject: @2]; + mps_a = [graph reshapeTensor:mps_a withShape:a_shape name:nil]; + [a_shape release]; + NSArray* mps_a_splits = [graph splitTensor:mps_a numSplits:2 axis:nd name:nil]; + NSMutableArray* b_shape = [NSMutableArray new]; + for (i = 0; i < nd - 1; i++) + [b_shape addObject:@(a->info.dim[i])]; + [b_shape addObject: @(a->info.dim[nd - 1] / 2)]; + [b_shape addObject: @2]; + mps_b = [graph reshapeTensor:mps_b withShape:b_shape name:nil]; + [b_shape release]; + NSArray* mps_b_splits = [graph splitTensor:mps_b numSplits:2 axis:nd name:nil]; + MPSGraphTensor* mps_c_0 = [graph additionWithPrimaryTensor:[graph multiplicationWithPrimaryTensor:mps_a_splits[0] secondaryTensor:mps_b_splits[0] name:nil] secondaryTensor:[graph multiplicationWithPrimaryTensor:mps_a_splits[1] secondaryTensor:mps_b_splits[1] name:nil] name:nil]; + MPSGraphTensor* mps_c_1 = [graph subtractionWithPrimaryTensor:[graph multiplicationWithPrimaryTensor:mps_a_splits[1] secondaryTensor:mps_b_splits[0] name:nil] secondaryTensor:[graph multiplicationWithPrimaryTensor:mps_a_splits[0] secondaryTensor:mps_b_splits[1] name:nil] name:nil]; + NSMutableArray* c_shape = [NSMutableArray new]; + for (i = 0; i < nd; i++) + [c_shape addObject:@(d->info.dim[i])]; + MPSGraphTensor* mps_c = [graph reshapeTensor:[graph concatTensor:mps_c_0 withTensor:mps_c_1 dimension:nd name:nil] withShape:c_shape name:nil]; + [resultTensors addObject:mps_c]; + [c_shape release]; + }); + MPSGraphTensorData* data_a = ccv_nnc_mps_graph_tensor_data(g, g->info.dim, g->stride); + MPSGraphTensorData* data_b = ccv_nnc_mps_graph_tensor_data(a, a->info.dim, a->stride); + MPSGraphTensorData* data[] = {data_a, data_b}; + ccv_nnc_mps_graph_executable_result(executable, command_buffer, @[data[indices[0]], data[indices[1]]], &d, (int*[]){ d->info.dim }, (int*[]){ d->stride }, 1, 0); + } + } + ccv_nnc_stream_context_finish_mps_command_buffer(stream_context, command_buffer); + } + } + return CCV_NNC_EXEC_SUCCESS; +} + REGISTER_COMMAND_BACKEND(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_MPS)(ccv_nnc_cmd_backend_registry_t* const registry) { registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; @@ -217,3 +601,12 @@ static int _ccv_nnc_cmul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint registry->algorithms = 1; registry->exec = _ccv_nnc_cmul_forw; } + +REGISTER_COMMAND_BACKEND(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_MPS)(ccv_nnc_cmd_backend_registry_t* const registry) +{ + registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; + registry->tensor_datatypes = CCV_32F | CCV_16F; + registry->tensor_memory = CCV_TENSOR_GPU_MEMORY; + registry->algorithms = 1; + registry->exec = _ccv_nnc_cmul_back; +} diff --git a/lib/nnc/cmd/ccv_nnc_cmd.inc b/lib/nnc/cmd/ccv_nnc_cmd.inc index 569adc939..9f62afa2f 100644 --- a/lib/nnc/cmd/ccv_nnc_cmd.inc +++ b/lib/nnc/cmd/ccv_nnc_cmd.inc @@ -1,142 +1,142 @@ static ccv_nnc_cmd_init_t init_map[] = { {.name = "CCV_NNC_BATCH_NORM_FORWARD", .cmd = 0x5419819c}, {.name = "CCV_NNC_BATCH_NORM_BACKWARD", .cmd = 0x5419819d}, - {.name = "CCV_NNC_COMM_ALLREDUCE_FORWARD", .cmd = 0x75c8d340}, - {.name = "CCV_NNC_COMM_ALLREDUCE_BACKWARD", .cmd = 0x75c8d341}, - {.name = "CCV_NNC_ARGMIN_FORWARD", .cmd = 0xeb8747f2}, - {.name = "CCV_NNC_ARGMIN_BACKWARD", .cmd = 0xeb8747f3}, - {.name = "CCV_NNC_REDUCE_MEAN_FORWARD", .cmd = 0xf23556c6}, - {.name = "CCV_NNC_REDUCE_MEAN_BACKWARD", .cmd = 0xf23556c7}, - {.name = "CCV_NNC_REDUCE_MIN_FORWARD", .cmd = 0x6785ef96}, - {.name = "CCV_NNC_REDUCE_MIN_BACKWARD", .cmd = 0x6785ef97}, {.name = "CCV_NNC_LSTM_FORWARD", .cmd = 0xc5cb998c}, {.name = "CCV_NNC_LSTM_BACKWARD", .cmd = 0xc5cb998d}, + {.name = "CCV_NNC_SET_FORWARD", .cmd = 0x2b070804}, + {.name = "CCV_NNC_SET_BACKWARD", .cmd = 0x2b070805}, + {.name = "CCV_NNC_DATA_TRANSFER_FORWARD", .cmd = 0x12d21e1a}, + {.name = "CCV_NNC_DATA_TRANSFER_BACKWARD", .cmd = 0x12d21e1b}, + {.name = "CCV_NNC_HISTOGRAM_FORWARD", .cmd = 0xc5473e44}, + {.name = "CCV_NNC_HISTOGRAM_BACKWARD", .cmd = 0xc5473e45}, {.name = "CCV_NNC_REDUCE_MAX_FORWARD", .cmd = 0x80f1a506}, {.name = "CCV_NNC_REDUCE_MAX_BACKWARD", .cmd = 0x80f1a507}, - {.name = "CCV_NNC_CMUL_FORWARD", .cmd = 0xead486e6}, - {.name = "CCV_NNC_CMUL_BACKWARD", .cmd = 0xead486e7}, - {.name = "CCV_NNC_FORMAT_TRANSFORM_FORWARD", .cmd = 0xe4a2b192}, - {.name = "CCV_NNC_FORMAT_TRANSFORM_BACKWARD", .cmd = 0xe4a2b193}, + {.name = "CCV_NNC_PAD_FORWARD", .cmd = 0xd8aaca60}, + {.name = "CCV_NNC_PAD_BACKWARD", .cmd = 0xd8aaca61}, + {.name = "CCV_NNC_EWPROD_FORWARD", .cmd = 0xee07e8fe}, + {.name = "CCV_NNC_EWPROD_BACKWARD", .cmd = 0xee07e8ff}, + {.name = "CCV_NNC_SGD_FORWARD", .cmd = 0xe650ad26}, + {.name = "CCV_NNC_SGD_BACKWARD", .cmd = 0xe650ad27}, + {.name = "CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD", .cmd = 0x284ed926}, + {.name = "CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD", .cmd = 0x284ed927}, + {.name = "CCV_NNC_RELU_FORWARD", .cmd = 0xc51eaa80}, + {.name = "CCV_NNC_RELU_BACKWARD", .cmd = 0xc51eaa81}, + {.name = "CCV_NNC_NMS_FORWARD", .cmd = 0xdba26106}, + {.name = "CCV_NNC_NMS_BACKWARD", .cmd = 0xdba26107}, + {.name = "CCV_NNC_DROPOUT_FORWARD", .cmd = 0x7f2dc3e4}, + {.name = "CCV_NNC_DROPOUT_BACKWARD", .cmd = 0x7f2dc3e5}, + {.name = "CCV_NNC_EWSQRT_FORWARD", .cmd = 0x8870a61e}, + {.name = "CCV_NNC_EWSQRT_BACKWARD", .cmd = 0x8870a61f}, + {.name = "CCV_NNC_TANH_FORWARD", .cmd = 0x6a62be30}, + {.name = "CCV_NNC_TANH_BACKWARD", .cmd = 0x6a62be31}, + {.name = "CCV_NNC_REDUCE_ISNAN_FORWARD", .cmd = 0xee0a4ade}, + {.name = "CCV_NNC_REDUCE_ISNAN_BACKWARD", .cmd = 0xee0a4adf}, {.name = "CCV_NNC_DATATYPE_CONVERSION_FORWARD", .cmd = 0xd873e38c}, {.name = "CCV_NNC_DATATYPE_CONVERSION_BACKWARD", .cmd = 0xd873e38d}, + {.name = "CCV_NNC_MIN_FORWARD", .cmd = 0x972fbd26}, + {.name = "CCV_NNC_MIN_BACKWARD", .cmd = 0x972fbd27}, + {.name = "CCV_NNC_ADAMW_FORWARD", .cmd = 0x4f5d4870}, + {.name = "CCV_NNC_ADAMW_BACKWARD", .cmd = 0x4f5d4871}, + {.name = "CCV_NNC_COMM_ALLREDUCE_FORWARD", .cmd = 0x75c8d340}, + {.name = "CCV_NNC_COMM_ALLREDUCE_BACKWARD", .cmd = 0x75c8d341}, + {.name = "CCV_NNC_INDEX_SELECT_FORWARD", .cmd = 0x7ee7771e}, + {.name = "CCV_NNC_INDEX_SELECT_BACKWARD", .cmd = 0x7ee7771f}, + {.name = "CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD", .cmd = 0xc26b7b5e}, + {.name = "CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD", .cmd = 0xc26b7b5f}, + {.name = "CCV_NNC_MSE_FORWARD", .cmd = 0x6904a9a2}, + {.name = "CCV_NNC_MSE_BACKWARD", .cmd = 0x6904a9a3}, + {.name = "CCV_NNC_CLAMP_FORWARD", .cmd = 0x2640d854}, + {.name = "CCV_NNC_CLAMP_BACKWARD", .cmd = 0x2640d855}, + {.name = "CCV_NNC_UPSAMPLE_FORWARD", .cmd = 0x73875556}, + {.name = "CCV_NNC_UPSAMPLE_BACKWARD", .cmd = 0x73875557}, + {.name = "CCV_NNC_SOFTMAX_FORWARD", .cmd = 0xc969a252}, + {.name = "CCV_NNC_SOFTMAX_BACKWARD", .cmd = 0xc969a253}, + {.name = "CCV_NNC_TRANSPOSE_FORWARD", .cmd = 0xb4d506e0}, + {.name = "CCV_NNC_TRANSPOSE_BACKWARD", .cmd = 0xb4d506e1}, + {.name = "CCV_NNC_REDUCE_NORM2_FORWARD", .cmd = 0xb3034e16}, + {.name = "CCV_NNC_REDUCE_NORM2_BACKWARD", .cmd = 0xb3034e17}, {.name = "CCV_NNC_RANDOM_NORMAL_FORWARD", .cmd = 0x7062c8b4}, {.name = "CCV_NNC_RANDOM_NORMAL_BACKWARD", .cmd = 0x7062c8b5}, - {.name = "CCV_NNC_EWSUM_FORWARD", .cmd = 0xe21a2c4c}, - {.name = "CCV_NNC_EWSUM_BACKWARD", .cmd = 0xe21a2c4d}, - {.name = "CCV_NNC_SCALAR_MUL_FORWARD", .cmd = 0x8b4d86aa}, - {.name = "CCV_NNC_SCALAR_MUL_BACKWARD", .cmd = 0x8b4d86ab}, - {.name = "CCV_NNC_SET_FORWARD", .cmd = 0x2b070804}, - {.name = "CCV_NNC_SET_BACKWARD", .cmd = 0x2b070805}, - {.name = "CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD", .cmd = 0x284ed926}, - {.name = "CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD", .cmd = 0x284ed927}, - {.name = "CCV_NNC_LAYER_NORM_FORWARD", .cmd = 0xbed3c264}, - {.name = "CCV_NNC_LAYER_NORM_BACKWARD", .cmd = 0xbed3c265}, - {.name = "CCV_NNC_SWISH_FORWARD", .cmd = 0x583d90c2}, - {.name = "CCV_NNC_SWISH_BACKWARD", .cmd = 0x583d90c3}, + {.name = "CCV_NNC_RANDOM_UNIFORM_FORWARD", .cmd = 0xa0cd1d5e}, + {.name = "CCV_NNC_RANDOM_UNIFORM_BACKWARD", .cmd = 0xa0cd1d5f}, + {.name = "CCV_NNC_GEMM_FORWARD", .cmd = 0x7e87d00c}, + {.name = "CCV_NNC_GEMM_BACKWARD", .cmd = 0x7e87d00d}, + {.name = "CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD", .cmd = 0xd691f78e}, + {.name = "CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD", .cmd = 0xd691f78f}, + {.name = "CCV_NNC_MAX_FORWARD", .cmd = 0xdf6f014c}, + {.name = "CCV_NNC_MAX_BACKWARD", .cmd = 0xdf6f014d}, + {.name = "CCV_NNC_LAMB_FORWARD", .cmd = 0x450edb1a}, + {.name = "CCV_NNC_LAMB_BACKWARD", .cmd = 0x450edb1b}, + {.name = "CCV_NNC_REDUCE_SUM_FORWARD", .cmd = 0x52970f06}, + {.name = "CCV_NNC_REDUCE_SUM_BACKWARD", .cmd = 0x52970f07}, + {.name = "CCV_NNC_EWEXP_FORWARD", .cmd = 0xd784b170}, + {.name = "CCV_NNC_EWEXP_BACKWARD", .cmd = 0xd784b171}, {.name = "CCV_NNC_ROI_ALIGN_FORWARD", .cmd = 0xfef55168}, {.name = "CCV_NNC_ROI_ALIGN_BACKWARD", .cmd = 0xfef55169}, - {.name = "CCV_NNC_TRANSPOSE_FORWARD", .cmd = 0xb4d506e0}, - {.name = "CCV_NNC_TRANSPOSE_BACKWARD", .cmd = 0xb4d506e1}, - {.name = "CCV_NNC_EWLOG_FORWARD", .cmd = 0xf4191bf2}, - {.name = "CCV_NNC_EWLOG_BACKWARD", .cmd = 0xf4191bf3}, + {.name = "CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_FORWARD", .cmd = 0xd9e0e4a}, + {.name = "CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_BACKWARD", .cmd = 0xd9e0e4b}, {.name = "CCV_NNC_MASKED_FILL_FORWARD", .cmd = 0x7f992d84}, {.name = "CCV_NNC_MASKED_FILL_BACKWARD", .cmd = 0x7f992d85}, + {.name = "CCV_NNC_BINARY_CROSSENTROPY_FORWARD", .cmd = 0xcd2107ec}, + {.name = "CCV_NNC_BINARY_CROSSENTROPY_BACKWARD", .cmd = 0xcd2107ed}, {.name = "CCV_NNC_RMSPROP_FORWARD", .cmd = 0x9c886b1c}, {.name = "CCV_NNC_RMSPROP_BACKWARD", .cmd = 0x9c886b1d}, - {.name = "CCV_NNC_SGD_FORWARD", .cmd = 0xe650ad26}, - {.name = "CCV_NNC_SGD_BACKWARD", .cmd = 0xe650ad27}, + {.name = "CCV_NNC_ARGMAX_FORWARD", .cmd = 0x68af2804}, + {.name = "CCV_NNC_ARGMAX_BACKWARD", .cmd = 0x68af2805}, + {.name = "CCV_NNC_EWDIV_FORWARD", .cmd = 0x1cd2fa18}, + {.name = "CCV_NNC_EWDIV_BACKWARD", .cmd = 0x1cd2fa19}, + {.name = "CCV_NNC_RMSNORM_FORWARD", .cmd = 0x6889e9d0}, + {.name = "CCV_NNC_RMSNORM_BACKWARD", .cmd = 0x6889e9d1}, + {.name = "CCV_NNC_GELU_FORWARD", .cmd = 0xb1527ab8}, + {.name = "CCV_NNC_GELU_BACKWARD", .cmd = 0xb1527ab9}, + {.name = "CCV_NNC_SWISH_FORWARD", .cmd = 0x583d90c2}, + {.name = "CCV_NNC_SWISH_BACKWARD", .cmd = 0x583d90c3}, + {.name = "CCV_NNC_MAX_POOL_FORWARD", .cmd = 0x7bec9360}, + {.name = "CCV_NNC_MAX_POOL_BACKWARD", .cmd = 0x7bec9361}, + {.name = "CCV_NNC_SMOOTH_L1_FORWARD", .cmd = 0x4e428e}, + {.name = "CCV_NNC_SMOOTH_L1_BACKWARD", .cmd = 0x4e428f}, + {.name = "CCV_NNC_SIGMOID_FORWARD", .cmd = 0xf2f69650}, + {.name = "CCV_NNC_SIGMOID_BACKWARD", .cmd = 0xf2f69651}, + {.name = "CCV_NNC_FORMAT_TRANSFORM_FORWARD", .cmd = 0xe4a2b192}, + {.name = "CCV_NNC_FORMAT_TRANSFORM_BACKWARD", .cmd = 0xe4a2b193}, {.name = "CCV_NNC_GROUP_NORM_FORWARD", .cmd = 0x17deb074}, {.name = "CCV_NNC_GROUP_NORM_BACKWARD", .cmd = 0x17deb075}, - {.name = "CCV_NNC_RANDOM_UNIFORM_FORWARD", .cmd = 0xa0cd1d5e}, - {.name = "CCV_NNC_RANDOM_UNIFORM_BACKWARD", .cmd = 0xa0cd1d5f}, - {.name = "CCV_NNC_INDEX_SELECT_FORWARD", .cmd = 0x7ee7771e}, - {.name = "CCV_NNC_INDEX_SELECT_BACKWARD", .cmd = 0x7ee7771f}, + {.name = "CCV_NNC_REDUCE_MEAN_FORWARD", .cmd = 0xf23556c6}, + {.name = "CCV_NNC_REDUCE_MEAN_BACKWARD", .cmd = 0xf23556c7}, {.name = "CCV_NNC_AVERAGE_POOL_FORWARD", .cmd = 0x51267ab8}, {.name = "CCV_NNC_AVERAGE_POOL_BACKWARD", .cmd = 0x51267ab9}, - {.name = "CCV_NNC_EWEXP_FORWARD", .cmd = 0xd784b170}, - {.name = "CCV_NNC_EWEXP_BACKWARD", .cmd = 0xd784b171}, - {.name = "CCV_NNC_NMS_FORWARD", .cmd = 0xdba26106}, - {.name = "CCV_NNC_NMS_BACKWARD", .cmd = 0xdba26107}, - {.name = "CCV_NNC_RMSNORM_FORWARD", .cmd = 0x6889e9d0}, - {.name = "CCV_NNC_RMSNORM_BACKWARD", .cmd = 0x6889e9d1}, - {.name = "CCV_NNC_MIN_FORWARD", .cmd = 0x972fbd26}, - {.name = "CCV_NNC_MIN_BACKWARD", .cmd = 0x972fbd27}, + {.name = "CCV_NNC_EWLOG_FORWARD", .cmd = 0xf4191bf2}, + {.name = "CCV_NNC_EWLOG_BACKWARD", .cmd = 0xf4191bf3}, + {.name = "CCV_NNC_SCALAR_MUL_FORWARD", .cmd = 0x8b4d86aa}, + {.name = "CCV_NNC_SCALAR_MUL_BACKWARD", .cmd = 0x8b4d86ab}, + {.name = "CCV_NNC_CONVOLUTION_FORWARD", .cmd = 0x254d05f4}, + {.name = "CCV_NNC_CONVOLUTION_BACKWARD", .cmd = 0x254d05f5}, {.name = "CCV_NNC_MUL_FORWARD", .cmd = 0x24721a46}, {.name = "CCV_NNC_MUL_BACKWARD", .cmd = 0x24721a47}, - {.name = "CCV_NNC_GEMM_FORWARD", .cmd = 0x7e87d00c}, - {.name = "CCV_NNC_GEMM_BACKWARD", .cmd = 0x7e87d00d}, - {.name = "CCV_NNC_LAMB_FORWARD", .cmd = 0x450edb1a}, - {.name = "CCV_NNC_LAMB_BACKWARD", .cmd = 0x450edb1b}, + {.name = "CCV_NNC_ADAM_FORWARD", .cmd = 0xe30099dc}, + {.name = "CCV_NNC_ADAM_BACKWARD", .cmd = 0xe30099dd}, {.name = "CCV_NNC_ADD_FORWARD", .cmd = 0x58fb3664}, {.name = "CCV_NNC_ADD_BACKWARD", .cmd = 0x58fb3665}, - {.name = "CCV_NNC_UPSAMPLE_FORWARD", .cmd = 0x73875556}, - {.name = "CCV_NNC_UPSAMPLE_BACKWARD", .cmd = 0x73875557}, - {.name = "CCV_NNC_LEAKY_RELU_FORWARD", .cmd = 0x507144e0}, - {.name = "CCV_NNC_LEAKY_RELU_BACKWARD", .cmd = 0x507144e1}, - {.name = "CCV_NNC_EWSQRT_FORWARD", .cmd = 0x8870a61e}, - {.name = "CCV_NNC_EWSQRT_BACKWARD", .cmd = 0x8870a61f}, - {.name = "CCV_NNC_RELU_FORWARD", .cmd = 0xc51eaa80}, - {.name = "CCV_NNC_RELU_BACKWARD", .cmd = 0xc51eaa81}, - {.name = "CCV_NNC_REDUCE_NORM2_FORWARD", .cmd = 0xb3034e16}, - {.name = "CCV_NNC_REDUCE_NORM2_BACKWARD", .cmd = 0xb3034e17}, - {.name = "CCV_NNC_BINARY_CROSSENTROPY_FORWARD", .cmd = 0xcd2107ec}, - {.name = "CCV_NNC_BINARY_CROSSENTROPY_BACKWARD", .cmd = 0xcd2107ed}, - {.name = "CCV_NNC_CLAMP_FORWARD", .cmd = 0x2640d854}, - {.name = "CCV_NNC_CLAMP_BACKWARD", .cmd = 0x2640d855}, - {.name = "CCV_NNC_SIGMOID_FORWARD", .cmd = 0xf2f69650}, - {.name = "CCV_NNC_SIGMOID_BACKWARD", .cmd = 0xf2f69651}, - {.name = "CCV_NNC_SMOOTH_L1_FORWARD", .cmd = 0x4e428e}, - {.name = "CCV_NNC_SMOOTH_L1_BACKWARD", .cmd = 0x4e428f}, {.name = "CCV_NNC_CATEGORICAL_CROSSENTROPY_FORWARD", .cmd = 0x1eb327a2}, {.name = "CCV_NNC_CATEGORICAL_CROSSENTROPY_BACKWARD", .cmd = 0x1eb327a3}, - {.name = "CCV_NNC_DATA_TRANSFER_FORWARD", .cmd = 0x12d21e1a}, - {.name = "CCV_NNC_DATA_TRANSFER_BACKWARD", .cmd = 0x12d21e1b}, - {.name = "CCV_NNC_ADAMW_FORWARD", .cmd = 0x4f5d4870}, - {.name = "CCV_NNC_ADAMW_BACKWARD", .cmd = 0x4f5d4871}, - {.name = "CCV_NNC_DROPOUT_FORWARD", .cmd = 0x7f2dc3e4}, - {.name = "CCV_NNC_DROPOUT_BACKWARD", .cmd = 0x7f2dc3e5}, - {.name = "CCV_NNC_EWDIV_FORWARD", .cmd = 0x1cd2fa18}, - {.name = "CCV_NNC_EWDIV_BACKWARD", .cmd = 0x1cd2fa19}, - {.name = "CCV_NNC_GELU_FORWARD", .cmd = 0xb1527ab8}, - {.name = "CCV_NNC_GELU_BACKWARD", .cmd = 0xb1527ab9}, - {.name = "CCV_NNC_SOFTMAX_FORWARD", .cmd = 0xc969a252}, - {.name = "CCV_NNC_SOFTMAX_BACKWARD", .cmd = 0xc969a253}, - {.name = "CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD", .cmd = 0xd691f78e}, - {.name = "CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD", .cmd = 0xd691f78f}, - {.name = "CCV_NNC_COMM_REDUCE_FORWARD", .cmd = 0x3434ead8}, - {.name = "CCV_NNC_COMM_REDUCE_BACKWARD", .cmd = 0x3434ead9}, - {.name = "CCV_NNC_TANH_FORWARD", .cmd = 0x6a62be30}, - {.name = "CCV_NNC_TANH_BACKWARD", .cmd = 0x6a62be31}, - {.name = "CCV_NNC_MSE_FORWARD", .cmd = 0x6904a9a2}, - {.name = "CCV_NNC_MSE_BACKWARD", .cmd = 0x6904a9a3}, - {.name = "CCV_NNC_REDUCE_SUM_FORWARD", .cmd = 0x52970f06}, - {.name = "CCV_NNC_REDUCE_SUM_BACKWARD", .cmd = 0x52970f07}, - {.name = "CCV_NNC_COMM_BROADCAST_FORWARD", .cmd = 0x830eee}, - {.name = "CCV_NNC_COMM_BROADCAST_BACKWARD", .cmd = 0x830eef}, - {.name = "CCV_NNC_MAX_FORWARD", .cmd = 0xdf6f014c}, - {.name = "CCV_NNC_MAX_BACKWARD", .cmd = 0xdf6f014d}, - {.name = "CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD", .cmd = 0xc26b7b5e}, - {.name = "CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD", .cmd = 0xc26b7b5f}, - {.name = "CCV_NNC_HISTOGRAM_FORWARD", .cmd = 0xc5473e44}, - {.name = "CCV_NNC_HISTOGRAM_BACKWARD", .cmd = 0xc5473e45}, - {.name = "CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_FORWARD", .cmd = 0xd9e0e4a}, - {.name = "CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_BACKWARD", .cmd = 0xd9e0e4b}, - {.name = "CCV_NNC_PAD_FORWARD", .cmd = 0xd8aaca60}, - {.name = "CCV_NNC_PAD_BACKWARD", .cmd = 0xd8aaca61}, + {.name = "CCV_NNC_EWSUM_FORWARD", .cmd = 0xe21a2c4c}, + {.name = "CCV_NNC_EWSUM_BACKWARD", .cmd = 0xe21a2c4d}, {.name = "CCV_NNC_COMPRESSION_LSSC_FORWARD", .cmd = 0x17ea8f72}, {.name = "CCV_NNC_COMPRESSION_LSSC_BACKWARD", .cmd = 0x17ea8f73}, - {.name = "CCV_NNC_ADAM_FORWARD", .cmd = 0xe30099dc}, - {.name = "CCV_NNC_ADAM_BACKWARD", .cmd = 0xe30099dd}, - {.name = "CCV_NNC_MAX_POOL_FORWARD", .cmd = 0x7bec9360}, - {.name = "CCV_NNC_MAX_POOL_BACKWARD", .cmd = 0x7bec9361}, - {.name = "CCV_NNC_EWPROD_FORWARD", .cmd = 0xee07e8fe}, - {.name = "CCV_NNC_EWPROD_BACKWARD", .cmd = 0xee07e8ff}, - {.name = "CCV_NNC_CONVOLUTION_FORWARD", .cmd = 0x254d05f4}, - {.name = "CCV_NNC_CONVOLUTION_BACKWARD", .cmd = 0x254d05f5}, - {.name = "CCV_NNC_ARGMAX_FORWARD", .cmd = 0x68af2804}, - {.name = "CCV_NNC_ARGMAX_BACKWARD", .cmd = 0x68af2805}, - {.name = "CCV_NNC_REDUCE_ISNAN_FORWARD", .cmd = 0xee0a4ade}, - {.name = "CCV_NNC_REDUCE_ISNAN_BACKWARD", .cmd = 0xee0a4adf}, + {.name = "CCV_NNC_ARGMIN_FORWARD", .cmd = 0xeb8747f2}, + {.name = "CCV_NNC_ARGMIN_BACKWARD", .cmd = 0xeb8747f3}, + {.name = "CCV_NNC_COMM_BROADCAST_FORWARD", .cmd = 0x830eee}, + {.name = "CCV_NNC_COMM_BROADCAST_BACKWARD", .cmd = 0x830eef}, + {.name = "CCV_NNC_COMM_REDUCE_FORWARD", .cmd = 0x3434ead8}, + {.name = "CCV_NNC_COMM_REDUCE_BACKWARD", .cmd = 0x3434ead9}, + {.name = "CCV_NNC_LEAKY_RELU_FORWARD", .cmd = 0x507144e0}, + {.name = "CCV_NNC_LEAKY_RELU_BACKWARD", .cmd = 0x507144e1}, + {.name = "CCV_NNC_CMUL_FORWARD", .cmd = 0xead486e6}, + {.name = "CCV_NNC_CMUL_BACKWARD", .cmd = 0xead486e7}, + {.name = "CCV_NNC_REDUCE_MIN_FORWARD", .cmd = 0x6785ef96}, + {.name = "CCV_NNC_REDUCE_MIN_BACKWARD", .cmd = 0x6785ef97}, + {.name = "CCV_NNC_LAYER_NORM_FORWARD", .cmd = 0xbed3c264}, + {.name = "CCV_NNC_LAYER_NORM_BACKWARD", .cmd = 0xbed3c265}, }; static ccv_nnc_cmd_backend_init_t backend_init_map[] = { @@ -151,31 +151,31 @@ static ccv_nnc_cmd_backend_init_t backend_init_map[] = { static inline int _ccv_nnc_cmd_ph(const uint32_t cmd) { - switch ((cmd >> 20) % 11) + switch ((cmd >> 25) % 11) { case 0: - return ((((cmd >> 1) % 23) + 16) << 1) | (cmd & 1); + return ((((cmd >> 1) % 20) + 44) << 1) | (cmd & 1); case 1: - return ((((cmd >> 4) % 20) + 47) << 1) | (cmd & 1); + return ((((cmd >> 1) % 34) + 22) << 1) | (cmd & 1); case 2: - return ((((cmd >> 14) % 7) + 62) << 1) | (cmd & 1); + return ((((cmd >> 3) % 18) + 10) << 1) | (cmd & 1); case 3: - return ((((cmd >> 9) % 24) + 0) << 1) | (cmd & 1); + return ((((cmd >> 8) % 48) + 16) << 1) | (cmd & 1); case 4: - return ((((cmd >> 1) % 19) + 31) << 1) | (cmd & 1); + return ((((cmd >> 11) % 20) + 47) << 1) | (cmd & 1); case 5: - return ((((cmd >> 4) % 39) + 9) << 1) | (cmd & 1); + return ((((cmd >> 1) % 1) + 8) << 1) | (cmd & 1); case 6: - return ((((cmd >> 1) % 31) + 0) << 1) | (cmd & 1); + return ((((cmd >> 6) % 51) + 10) << 1) | (cmd & 1); case 7: - return ((((cmd >> 2) % 66) + 2) << 1) | (cmd & 1); + return ((((cmd >> 1) % 17) + 52) << 1) | (cmd & 1); case 8: - return ((((cmd >> 1) % 14) + 51) << 1) | (cmd & 1); + return ((((cmd >> 1) % 35) + 12) << 1) | (cmd & 1); case 9: - return ((((cmd >> 1) % 30) + 5) << 1) | (cmd & 1); + return ((((cmd >> 1) % 22) + 0) << 1) | (cmd & 1); case 10: default: - return ((((cmd >> 1) % 17) + 47) << 1) | (cmd & 1); + return ((((cmd >> 4) % 11) + 1) << 1) | (cmd & 1); } } @@ -191,142 +191,142 @@ static inline int _ccv_nnc_cmd_backend_ph(const uint32_t backend) void _register_command_CCV_NNC_BATCH_NORM_FORWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_BATCH_NORM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_COMM_ALLREDUCE_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_COMM_ALLREDUCE_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_ARGMIN_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_ARGMIN_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_REDUCE_MEAN_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_REDUCE_MEAN_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_REDUCE_MIN_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_REDUCE_MIN_BACKWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_LSTM_FORWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_LSTM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SET_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SET_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_DATA_TRANSFER_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_DATA_TRANSFER_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_HISTOGRAM_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_HISTOGRAM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_REDUCE_MAX_FORWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_REDUCE_MAX_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_CMUL_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_CMUL_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_FORMAT_TRANSFORM_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_FORMAT_TRANSFORM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_PAD_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_PAD_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_EWPROD_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_EWPROD_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SGD_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SGD_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_RELU_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_RELU_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_NMS_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_NMS_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_DROPOUT_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_DROPOUT_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_EWSQRT_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_EWSQRT_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_TANH_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_TANH_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_REDUCE_ISNAN_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_REDUCE_ISNAN_BACKWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_DATATYPE_CONVERSION_FORWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_DATATYPE_CONVERSION_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_MIN_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_MIN_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_ADAMW_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_ADAMW_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_COMM_ALLREDUCE_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_COMM_ALLREDUCE_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_INDEX_SELECT_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_INDEX_SELECT_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_MSE_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_MSE_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_CLAMP_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_CLAMP_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_UPSAMPLE_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_UPSAMPLE_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SOFTMAX_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SOFTMAX_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_TRANSPOSE_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_TRANSPOSE_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_REDUCE_NORM2_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_REDUCE_NORM2_BACKWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_RANDOM_NORMAL_FORWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_RANDOM_NORMAL_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_EWSUM_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_EWSUM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SCALAR_MUL_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SCALAR_MUL_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SET_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SET_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_LAYER_NORM_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_LAYER_NORM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SWISH_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SWISH_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_RANDOM_UNIFORM_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_RANDOM_UNIFORM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_GEMM_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_GEMM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_MAX_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_MAX_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_LAMB_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_LAMB_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_REDUCE_SUM_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_REDUCE_SUM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_EWEXP_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_EWEXP_BACKWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_ROI_ALIGN_FORWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_ROI_ALIGN_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_TRANSPOSE_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_TRANSPOSE_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_EWLOG_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_EWLOG_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_BACKWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_MASKED_FILL_FORWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_MASKED_FILL_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_BINARY_CROSSENTROPY_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_BINARY_CROSSENTROPY_BACKWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_RMSPROP_FORWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_RMSPROP_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SGD_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SGD_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_ARGMAX_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_ARGMAX_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_EWDIV_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_EWDIV_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_RMSNORM_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_RMSNORM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_GELU_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_GELU_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SWISH_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SWISH_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_MAX_POOL_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_MAX_POOL_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SMOOTH_L1_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SMOOTH_L1_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SIGMOID_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SIGMOID_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_FORMAT_TRANSFORM_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_FORMAT_TRANSFORM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_GROUP_NORM_FORWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_GROUP_NORM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_RANDOM_UNIFORM_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_RANDOM_UNIFORM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_INDEX_SELECT_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_INDEX_SELECT_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_REDUCE_MEAN_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_REDUCE_MEAN_BACKWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_AVERAGE_POOL_FORWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_AVERAGE_POOL_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_EWEXP_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_EWEXP_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_NMS_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_NMS_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_RMSNORM_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_RMSNORM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_MIN_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_MIN_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_EWLOG_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_EWLOG_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SCALAR_MUL_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_SCALAR_MUL_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_CONVOLUTION_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_CONVOLUTION_BACKWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_MUL_FORWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_MUL_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_GEMM_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_GEMM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_LAMB_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_LAMB_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_ADAM_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_ADAM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_ADD_FORWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_ADD_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_UPSAMPLE_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_UPSAMPLE_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_LEAKY_RELU_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_LEAKY_RELU_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_EWSQRT_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_EWSQRT_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_RELU_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_RELU_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_REDUCE_NORM2_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_REDUCE_NORM2_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_BINARY_CROSSENTROPY_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_BINARY_CROSSENTROPY_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_CLAMP_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_CLAMP_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SIGMOID_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SIGMOID_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SMOOTH_L1_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SMOOTH_L1_BACKWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_CATEGORICAL_CROSSENTROPY_FORWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_CATEGORICAL_CROSSENTROPY_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_DATA_TRANSFER_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_DATA_TRANSFER_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_ADAMW_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_ADAMW_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_DROPOUT_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_DROPOUT_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_EWDIV_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_EWDIV_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_GELU_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_GELU_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SOFTMAX_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SOFTMAX_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_COMM_REDUCE_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_COMM_REDUCE_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_TANH_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_TANH_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_MSE_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_MSE_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_REDUCE_SUM_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_REDUCE_SUM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_COMM_BROADCAST_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_COMM_BROADCAST_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_MAX_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_MAX_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_HISTOGRAM_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_HISTOGRAM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_PAD_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_PAD_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_EWSUM_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_EWSUM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_COMPRESSION_LSSC_FORWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_COMPRESSION_LSSC_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_ADAM_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_ADAM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_MAX_POOL_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_MAX_POOL_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_EWPROD_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_EWPROD_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_CONVOLUTION_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_CONVOLUTION_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_ARGMAX_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_ARGMAX_BACKWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_REDUCE_ISNAN_FORWARD(ccv_nnc_cmd_registry_t* const registry); -void _register_command_CCV_NNC_REDUCE_ISNAN_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_ARGMIN_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_ARGMIN_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_COMM_BROADCAST_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_COMM_BROADCAST_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_COMM_REDUCE_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_COMM_REDUCE_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_LEAKY_RELU_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_LEAKY_RELU_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_CMUL_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_CMUL_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_REDUCE_MIN_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_REDUCE_MIN_BACKWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_LAYER_NORM_FORWARD(ccv_nnc_cmd_registry_t* const registry); +void _register_command_CCV_NNC_LAYER_NORM_BACKWARD(ccv_nnc_cmd_registry_t* const registry); void _register_command_CCV_NNC_ADAM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry); void _register_command_CCV_NNC_ADAM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry); @@ -605,6 +605,7 @@ void _register_command_CCV_NNC_MUL_BACKWARD_backend_CCV_NNC_BACKEND_MPS(ccv_nnc_ void _register_command_CCV_NNC_SCALAR_MUL_FORWARD_backend_CCV_NNC_BACKEND_MPS(ccv_nnc_cmd_backend_registry_t* const registry); void _register_command_CCV_NNC_SCALAR_MUL_BACKWARD_backend_CCV_NNC_BACKEND_MPS(ccv_nnc_cmd_backend_registry_t* const registry); void _register_command_CCV_NNC_CMUL_FORWARD_backend_CCV_NNC_BACKEND_MPS(ccv_nnc_cmd_backend_registry_t* const registry); +void _register_command_CCV_NNC_CMUL_BACKWARD_backend_CCV_NNC_BACKEND_MPS(ccv_nnc_cmd_backend_registry_t* const registry); void _register_command_CCV_NNC_CONVOLUTION_FORWARD_backend_CCV_NNC_BACKEND_MPS(ccv_nnc_cmd_backend_registry_t* const registry); void _register_command_CCV_NNC_CONVOLUTION_BACKWARD_backend_CCV_NNC_BACKEND_MPS(ccv_nnc_cmd_backend_registry_t* const registry); void _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD_backend_CCV_NNC_BACKEND_MPS(ccv_nnc_cmd_backend_registry_t* const registry); @@ -671,479 +672,480 @@ static inline void _ccv_nnc_cmd_init(void) { _register_command_CCV_NNC_BATCH_NORM_FORWARD(&init_map[0].registry); _register_command_CCV_NNC_BATCH_NORM_BACKWARD(&init_map[1].registry); - _register_command_CCV_NNC_COMM_ALLREDUCE_FORWARD(&init_map[2].registry); - _register_command_CCV_NNC_COMM_ALLREDUCE_BACKWARD(&init_map[3].registry); - _register_command_CCV_NNC_ARGMIN_FORWARD(&init_map[4].registry); - _register_command_CCV_NNC_ARGMIN_BACKWARD(&init_map[5].registry); - _register_command_CCV_NNC_REDUCE_MEAN_FORWARD(&init_map[6].registry); - _register_command_CCV_NNC_REDUCE_MEAN_BACKWARD(&init_map[7].registry); - _register_command_CCV_NNC_REDUCE_MIN_FORWARD(&init_map[8].registry); - _register_command_CCV_NNC_REDUCE_MIN_BACKWARD(&init_map[9].registry); - _register_command_CCV_NNC_LSTM_FORWARD(&init_map[10].registry); - _register_command_CCV_NNC_LSTM_BACKWARD(&init_map[11].registry); - _register_command_CCV_NNC_REDUCE_MAX_FORWARD(&init_map[12].registry); - _register_command_CCV_NNC_REDUCE_MAX_BACKWARD(&init_map[13].registry); - _register_command_CCV_NNC_CMUL_FORWARD(&init_map[14].registry); - _register_command_CCV_NNC_CMUL_BACKWARD(&init_map[15].registry); - _register_command_CCV_NNC_FORMAT_TRANSFORM_FORWARD(&init_map[16].registry); - _register_command_CCV_NNC_FORMAT_TRANSFORM_BACKWARD(&init_map[17].registry); - _register_command_CCV_NNC_DATATYPE_CONVERSION_FORWARD(&init_map[18].registry); - _register_command_CCV_NNC_DATATYPE_CONVERSION_BACKWARD(&init_map[19].registry); - _register_command_CCV_NNC_RANDOM_NORMAL_FORWARD(&init_map[20].registry); - _register_command_CCV_NNC_RANDOM_NORMAL_BACKWARD(&init_map[21].registry); - _register_command_CCV_NNC_EWSUM_FORWARD(&init_map[22].registry); - _register_command_CCV_NNC_EWSUM_BACKWARD(&init_map[23].registry); - _register_command_CCV_NNC_SCALAR_MUL_FORWARD(&init_map[24].registry); - _register_command_CCV_NNC_SCALAR_MUL_BACKWARD(&init_map[25].registry); - _register_command_CCV_NNC_SET_FORWARD(&init_map[26].registry); - _register_command_CCV_NNC_SET_BACKWARD(&init_map[27].registry); - _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(&init_map[28].registry); - _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(&init_map[29].registry); - _register_command_CCV_NNC_LAYER_NORM_FORWARD(&init_map[30].registry); - _register_command_CCV_NNC_LAYER_NORM_BACKWARD(&init_map[31].registry); - _register_command_CCV_NNC_SWISH_FORWARD(&init_map[32].registry); - _register_command_CCV_NNC_SWISH_BACKWARD(&init_map[33].registry); - _register_command_CCV_NNC_ROI_ALIGN_FORWARD(&init_map[34].registry); - _register_command_CCV_NNC_ROI_ALIGN_BACKWARD(&init_map[35].registry); - _register_command_CCV_NNC_TRANSPOSE_FORWARD(&init_map[36].registry); - _register_command_CCV_NNC_TRANSPOSE_BACKWARD(&init_map[37].registry); - _register_command_CCV_NNC_EWLOG_FORWARD(&init_map[38].registry); - _register_command_CCV_NNC_EWLOG_BACKWARD(&init_map[39].registry); - _register_command_CCV_NNC_MASKED_FILL_FORWARD(&init_map[40].registry); - _register_command_CCV_NNC_MASKED_FILL_BACKWARD(&init_map[41].registry); - _register_command_CCV_NNC_RMSPROP_FORWARD(&init_map[42].registry); - _register_command_CCV_NNC_RMSPROP_BACKWARD(&init_map[43].registry); - _register_command_CCV_NNC_SGD_FORWARD(&init_map[44].registry); - _register_command_CCV_NNC_SGD_BACKWARD(&init_map[45].registry); - _register_command_CCV_NNC_GROUP_NORM_FORWARD(&init_map[46].registry); - _register_command_CCV_NNC_GROUP_NORM_BACKWARD(&init_map[47].registry); - _register_command_CCV_NNC_RANDOM_UNIFORM_FORWARD(&init_map[48].registry); - _register_command_CCV_NNC_RANDOM_UNIFORM_BACKWARD(&init_map[49].registry); - _register_command_CCV_NNC_INDEX_SELECT_FORWARD(&init_map[50].registry); - _register_command_CCV_NNC_INDEX_SELECT_BACKWARD(&init_map[51].registry); - _register_command_CCV_NNC_AVERAGE_POOL_FORWARD(&init_map[52].registry); - _register_command_CCV_NNC_AVERAGE_POOL_BACKWARD(&init_map[53].registry); - _register_command_CCV_NNC_EWEXP_FORWARD(&init_map[54].registry); - _register_command_CCV_NNC_EWEXP_BACKWARD(&init_map[55].registry); - _register_command_CCV_NNC_NMS_FORWARD(&init_map[56].registry); - _register_command_CCV_NNC_NMS_BACKWARD(&init_map[57].registry); - _register_command_CCV_NNC_RMSNORM_FORWARD(&init_map[58].registry); - _register_command_CCV_NNC_RMSNORM_BACKWARD(&init_map[59].registry); - _register_command_CCV_NNC_MIN_FORWARD(&init_map[60].registry); - _register_command_CCV_NNC_MIN_BACKWARD(&init_map[61].registry); - _register_command_CCV_NNC_MUL_FORWARD(&init_map[62].registry); - _register_command_CCV_NNC_MUL_BACKWARD(&init_map[63].registry); - _register_command_CCV_NNC_GEMM_FORWARD(&init_map[64].registry); - _register_command_CCV_NNC_GEMM_BACKWARD(&init_map[65].registry); + _register_command_CCV_NNC_LSTM_FORWARD(&init_map[2].registry); + _register_command_CCV_NNC_LSTM_BACKWARD(&init_map[3].registry); + _register_command_CCV_NNC_SET_FORWARD(&init_map[4].registry); + _register_command_CCV_NNC_SET_BACKWARD(&init_map[5].registry); + _register_command_CCV_NNC_DATA_TRANSFER_FORWARD(&init_map[6].registry); + _register_command_CCV_NNC_DATA_TRANSFER_BACKWARD(&init_map[7].registry); + _register_command_CCV_NNC_HISTOGRAM_FORWARD(&init_map[8].registry); + _register_command_CCV_NNC_HISTOGRAM_BACKWARD(&init_map[9].registry); + _register_command_CCV_NNC_REDUCE_MAX_FORWARD(&init_map[10].registry); + _register_command_CCV_NNC_REDUCE_MAX_BACKWARD(&init_map[11].registry); + _register_command_CCV_NNC_PAD_FORWARD(&init_map[12].registry); + _register_command_CCV_NNC_PAD_BACKWARD(&init_map[13].registry); + _register_command_CCV_NNC_EWPROD_FORWARD(&init_map[14].registry); + _register_command_CCV_NNC_EWPROD_BACKWARD(&init_map[15].registry); + _register_command_CCV_NNC_SGD_FORWARD(&init_map[16].registry); + _register_command_CCV_NNC_SGD_BACKWARD(&init_map[17].registry); + _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(&init_map[18].registry); + _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(&init_map[19].registry); + _register_command_CCV_NNC_RELU_FORWARD(&init_map[20].registry); + _register_command_CCV_NNC_RELU_BACKWARD(&init_map[21].registry); + _register_command_CCV_NNC_NMS_FORWARD(&init_map[22].registry); + _register_command_CCV_NNC_NMS_BACKWARD(&init_map[23].registry); + _register_command_CCV_NNC_DROPOUT_FORWARD(&init_map[24].registry); + _register_command_CCV_NNC_DROPOUT_BACKWARD(&init_map[25].registry); + _register_command_CCV_NNC_EWSQRT_FORWARD(&init_map[26].registry); + _register_command_CCV_NNC_EWSQRT_BACKWARD(&init_map[27].registry); + _register_command_CCV_NNC_TANH_FORWARD(&init_map[28].registry); + _register_command_CCV_NNC_TANH_BACKWARD(&init_map[29].registry); + _register_command_CCV_NNC_REDUCE_ISNAN_FORWARD(&init_map[30].registry); + _register_command_CCV_NNC_REDUCE_ISNAN_BACKWARD(&init_map[31].registry); + _register_command_CCV_NNC_DATATYPE_CONVERSION_FORWARD(&init_map[32].registry); + _register_command_CCV_NNC_DATATYPE_CONVERSION_BACKWARD(&init_map[33].registry); + _register_command_CCV_NNC_MIN_FORWARD(&init_map[34].registry); + _register_command_CCV_NNC_MIN_BACKWARD(&init_map[35].registry); + _register_command_CCV_NNC_ADAMW_FORWARD(&init_map[36].registry); + _register_command_CCV_NNC_ADAMW_BACKWARD(&init_map[37].registry); + _register_command_CCV_NNC_COMM_ALLREDUCE_FORWARD(&init_map[38].registry); + _register_command_CCV_NNC_COMM_ALLREDUCE_BACKWARD(&init_map[39].registry); + _register_command_CCV_NNC_INDEX_SELECT_FORWARD(&init_map[40].registry); + _register_command_CCV_NNC_INDEX_SELECT_BACKWARD(&init_map[41].registry); + _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD(&init_map[42].registry); + _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD(&init_map[43].registry); + _register_command_CCV_NNC_MSE_FORWARD(&init_map[44].registry); + _register_command_CCV_NNC_MSE_BACKWARD(&init_map[45].registry); + _register_command_CCV_NNC_CLAMP_FORWARD(&init_map[46].registry); + _register_command_CCV_NNC_CLAMP_BACKWARD(&init_map[47].registry); + _register_command_CCV_NNC_UPSAMPLE_FORWARD(&init_map[48].registry); + _register_command_CCV_NNC_UPSAMPLE_BACKWARD(&init_map[49].registry); + _register_command_CCV_NNC_SOFTMAX_FORWARD(&init_map[50].registry); + _register_command_CCV_NNC_SOFTMAX_BACKWARD(&init_map[51].registry); + _register_command_CCV_NNC_TRANSPOSE_FORWARD(&init_map[52].registry); + _register_command_CCV_NNC_TRANSPOSE_BACKWARD(&init_map[53].registry); + _register_command_CCV_NNC_REDUCE_NORM2_FORWARD(&init_map[54].registry); + _register_command_CCV_NNC_REDUCE_NORM2_BACKWARD(&init_map[55].registry); + _register_command_CCV_NNC_RANDOM_NORMAL_FORWARD(&init_map[56].registry); + _register_command_CCV_NNC_RANDOM_NORMAL_BACKWARD(&init_map[57].registry); + _register_command_CCV_NNC_RANDOM_UNIFORM_FORWARD(&init_map[58].registry); + _register_command_CCV_NNC_RANDOM_UNIFORM_BACKWARD(&init_map[59].registry); + _register_command_CCV_NNC_GEMM_FORWARD(&init_map[60].registry); + _register_command_CCV_NNC_GEMM_BACKWARD(&init_map[61].registry); + _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD(&init_map[62].registry); + _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD(&init_map[63].registry); + _register_command_CCV_NNC_MAX_FORWARD(&init_map[64].registry); + _register_command_CCV_NNC_MAX_BACKWARD(&init_map[65].registry); _register_command_CCV_NNC_LAMB_FORWARD(&init_map[66].registry); _register_command_CCV_NNC_LAMB_BACKWARD(&init_map[67].registry); - _register_command_CCV_NNC_ADD_FORWARD(&init_map[68].registry); - _register_command_CCV_NNC_ADD_BACKWARD(&init_map[69].registry); - _register_command_CCV_NNC_UPSAMPLE_FORWARD(&init_map[70].registry); - _register_command_CCV_NNC_UPSAMPLE_BACKWARD(&init_map[71].registry); - _register_command_CCV_NNC_LEAKY_RELU_FORWARD(&init_map[72].registry); - _register_command_CCV_NNC_LEAKY_RELU_BACKWARD(&init_map[73].registry); - _register_command_CCV_NNC_EWSQRT_FORWARD(&init_map[74].registry); - _register_command_CCV_NNC_EWSQRT_BACKWARD(&init_map[75].registry); - _register_command_CCV_NNC_RELU_FORWARD(&init_map[76].registry); - _register_command_CCV_NNC_RELU_BACKWARD(&init_map[77].registry); - _register_command_CCV_NNC_REDUCE_NORM2_FORWARD(&init_map[78].registry); - _register_command_CCV_NNC_REDUCE_NORM2_BACKWARD(&init_map[79].registry); - _register_command_CCV_NNC_BINARY_CROSSENTROPY_FORWARD(&init_map[80].registry); - _register_command_CCV_NNC_BINARY_CROSSENTROPY_BACKWARD(&init_map[81].registry); - _register_command_CCV_NNC_CLAMP_FORWARD(&init_map[82].registry); - _register_command_CCV_NNC_CLAMP_BACKWARD(&init_map[83].registry); - _register_command_CCV_NNC_SIGMOID_FORWARD(&init_map[84].registry); - _register_command_CCV_NNC_SIGMOID_BACKWARD(&init_map[85].registry); - _register_command_CCV_NNC_SMOOTH_L1_FORWARD(&init_map[86].registry); - _register_command_CCV_NNC_SMOOTH_L1_BACKWARD(&init_map[87].registry); - _register_command_CCV_NNC_CATEGORICAL_CROSSENTROPY_FORWARD(&init_map[88].registry); - _register_command_CCV_NNC_CATEGORICAL_CROSSENTROPY_BACKWARD(&init_map[89].registry); - _register_command_CCV_NNC_DATA_TRANSFER_FORWARD(&init_map[90].registry); - _register_command_CCV_NNC_DATA_TRANSFER_BACKWARD(&init_map[91].registry); - _register_command_CCV_NNC_ADAMW_FORWARD(&init_map[92].registry); - _register_command_CCV_NNC_ADAMW_BACKWARD(&init_map[93].registry); - _register_command_CCV_NNC_DROPOUT_FORWARD(&init_map[94].registry); - _register_command_CCV_NNC_DROPOUT_BACKWARD(&init_map[95].registry); - _register_command_CCV_NNC_EWDIV_FORWARD(&init_map[96].registry); - _register_command_CCV_NNC_EWDIV_BACKWARD(&init_map[97].registry); - _register_command_CCV_NNC_GELU_FORWARD(&init_map[98].registry); - _register_command_CCV_NNC_GELU_BACKWARD(&init_map[99].registry); - _register_command_CCV_NNC_SOFTMAX_FORWARD(&init_map[100].registry); - _register_command_CCV_NNC_SOFTMAX_BACKWARD(&init_map[101].registry); - _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD(&init_map[102].registry); - _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD(&init_map[103].registry); - _register_command_CCV_NNC_COMM_REDUCE_FORWARD(&init_map[104].registry); - _register_command_CCV_NNC_COMM_REDUCE_BACKWARD(&init_map[105].registry); - _register_command_CCV_NNC_TANH_FORWARD(&init_map[106].registry); - _register_command_CCV_NNC_TANH_BACKWARD(&init_map[107].registry); - _register_command_CCV_NNC_MSE_FORWARD(&init_map[108].registry); - _register_command_CCV_NNC_MSE_BACKWARD(&init_map[109].registry); - _register_command_CCV_NNC_REDUCE_SUM_FORWARD(&init_map[110].registry); - _register_command_CCV_NNC_REDUCE_SUM_BACKWARD(&init_map[111].registry); - _register_command_CCV_NNC_COMM_BROADCAST_FORWARD(&init_map[112].registry); - _register_command_CCV_NNC_COMM_BROADCAST_BACKWARD(&init_map[113].registry); - _register_command_CCV_NNC_MAX_FORWARD(&init_map[114].registry); - _register_command_CCV_NNC_MAX_BACKWARD(&init_map[115].registry); - _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD(&init_map[116].registry); - _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD(&init_map[117].registry); - _register_command_CCV_NNC_HISTOGRAM_FORWARD(&init_map[118].registry); - _register_command_CCV_NNC_HISTOGRAM_BACKWARD(&init_map[119].registry); - _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_FORWARD(&init_map[120].registry); - _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_BACKWARD(&init_map[121].registry); - _register_command_CCV_NNC_PAD_FORWARD(&init_map[122].registry); - _register_command_CCV_NNC_PAD_BACKWARD(&init_map[123].registry); - _register_command_CCV_NNC_COMPRESSION_LSSC_FORWARD(&init_map[124].registry); - _register_command_CCV_NNC_COMPRESSION_LSSC_BACKWARD(&init_map[125].registry); - _register_command_CCV_NNC_ADAM_FORWARD(&init_map[126].registry); - _register_command_CCV_NNC_ADAM_BACKWARD(&init_map[127].registry); - _register_command_CCV_NNC_MAX_POOL_FORWARD(&init_map[128].registry); - _register_command_CCV_NNC_MAX_POOL_BACKWARD(&init_map[129].registry); - _register_command_CCV_NNC_EWPROD_FORWARD(&init_map[130].registry); - _register_command_CCV_NNC_EWPROD_BACKWARD(&init_map[131].registry); - _register_command_CCV_NNC_CONVOLUTION_FORWARD(&init_map[132].registry); - _register_command_CCV_NNC_CONVOLUTION_BACKWARD(&init_map[133].registry); - _register_command_CCV_NNC_ARGMAX_FORWARD(&init_map[134].registry); - _register_command_CCV_NNC_ARGMAX_BACKWARD(&init_map[135].registry); - _register_command_CCV_NNC_REDUCE_ISNAN_FORWARD(&init_map[136].registry); - _register_command_CCV_NNC_REDUCE_ISNAN_BACKWARD(&init_map[137].registry); + _register_command_CCV_NNC_REDUCE_SUM_FORWARD(&init_map[68].registry); + _register_command_CCV_NNC_REDUCE_SUM_BACKWARD(&init_map[69].registry); + _register_command_CCV_NNC_EWEXP_FORWARD(&init_map[70].registry); + _register_command_CCV_NNC_EWEXP_BACKWARD(&init_map[71].registry); + _register_command_CCV_NNC_ROI_ALIGN_FORWARD(&init_map[72].registry); + _register_command_CCV_NNC_ROI_ALIGN_BACKWARD(&init_map[73].registry); + _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_FORWARD(&init_map[74].registry); + _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_BACKWARD(&init_map[75].registry); + _register_command_CCV_NNC_MASKED_FILL_FORWARD(&init_map[76].registry); + _register_command_CCV_NNC_MASKED_FILL_BACKWARD(&init_map[77].registry); + _register_command_CCV_NNC_BINARY_CROSSENTROPY_FORWARD(&init_map[78].registry); + _register_command_CCV_NNC_BINARY_CROSSENTROPY_BACKWARD(&init_map[79].registry); + _register_command_CCV_NNC_RMSPROP_FORWARD(&init_map[80].registry); + _register_command_CCV_NNC_RMSPROP_BACKWARD(&init_map[81].registry); + _register_command_CCV_NNC_ARGMAX_FORWARD(&init_map[82].registry); + _register_command_CCV_NNC_ARGMAX_BACKWARD(&init_map[83].registry); + _register_command_CCV_NNC_EWDIV_FORWARD(&init_map[84].registry); + _register_command_CCV_NNC_EWDIV_BACKWARD(&init_map[85].registry); + _register_command_CCV_NNC_RMSNORM_FORWARD(&init_map[86].registry); + _register_command_CCV_NNC_RMSNORM_BACKWARD(&init_map[87].registry); + _register_command_CCV_NNC_GELU_FORWARD(&init_map[88].registry); + _register_command_CCV_NNC_GELU_BACKWARD(&init_map[89].registry); + _register_command_CCV_NNC_SWISH_FORWARD(&init_map[90].registry); + _register_command_CCV_NNC_SWISH_BACKWARD(&init_map[91].registry); + _register_command_CCV_NNC_MAX_POOL_FORWARD(&init_map[92].registry); + _register_command_CCV_NNC_MAX_POOL_BACKWARD(&init_map[93].registry); + _register_command_CCV_NNC_SMOOTH_L1_FORWARD(&init_map[94].registry); + _register_command_CCV_NNC_SMOOTH_L1_BACKWARD(&init_map[95].registry); + _register_command_CCV_NNC_SIGMOID_FORWARD(&init_map[96].registry); + _register_command_CCV_NNC_SIGMOID_BACKWARD(&init_map[97].registry); + _register_command_CCV_NNC_FORMAT_TRANSFORM_FORWARD(&init_map[98].registry); + _register_command_CCV_NNC_FORMAT_TRANSFORM_BACKWARD(&init_map[99].registry); + _register_command_CCV_NNC_GROUP_NORM_FORWARD(&init_map[100].registry); + _register_command_CCV_NNC_GROUP_NORM_BACKWARD(&init_map[101].registry); + _register_command_CCV_NNC_REDUCE_MEAN_FORWARD(&init_map[102].registry); + _register_command_CCV_NNC_REDUCE_MEAN_BACKWARD(&init_map[103].registry); + _register_command_CCV_NNC_AVERAGE_POOL_FORWARD(&init_map[104].registry); + _register_command_CCV_NNC_AVERAGE_POOL_BACKWARD(&init_map[105].registry); + _register_command_CCV_NNC_EWLOG_FORWARD(&init_map[106].registry); + _register_command_CCV_NNC_EWLOG_BACKWARD(&init_map[107].registry); + _register_command_CCV_NNC_SCALAR_MUL_FORWARD(&init_map[108].registry); + _register_command_CCV_NNC_SCALAR_MUL_BACKWARD(&init_map[109].registry); + _register_command_CCV_NNC_CONVOLUTION_FORWARD(&init_map[110].registry); + _register_command_CCV_NNC_CONVOLUTION_BACKWARD(&init_map[111].registry); + _register_command_CCV_NNC_MUL_FORWARD(&init_map[112].registry); + _register_command_CCV_NNC_MUL_BACKWARD(&init_map[113].registry); + _register_command_CCV_NNC_ADAM_FORWARD(&init_map[114].registry); + _register_command_CCV_NNC_ADAM_BACKWARD(&init_map[115].registry); + _register_command_CCV_NNC_ADD_FORWARD(&init_map[116].registry); + _register_command_CCV_NNC_ADD_BACKWARD(&init_map[117].registry); + _register_command_CCV_NNC_CATEGORICAL_CROSSENTROPY_FORWARD(&init_map[118].registry); + _register_command_CCV_NNC_CATEGORICAL_CROSSENTROPY_BACKWARD(&init_map[119].registry); + _register_command_CCV_NNC_EWSUM_FORWARD(&init_map[120].registry); + _register_command_CCV_NNC_EWSUM_BACKWARD(&init_map[121].registry); + _register_command_CCV_NNC_COMPRESSION_LSSC_FORWARD(&init_map[122].registry); + _register_command_CCV_NNC_COMPRESSION_LSSC_BACKWARD(&init_map[123].registry); + _register_command_CCV_NNC_ARGMIN_FORWARD(&init_map[124].registry); + _register_command_CCV_NNC_ARGMIN_BACKWARD(&init_map[125].registry); + _register_command_CCV_NNC_COMM_BROADCAST_FORWARD(&init_map[126].registry); + _register_command_CCV_NNC_COMM_BROADCAST_BACKWARD(&init_map[127].registry); + _register_command_CCV_NNC_COMM_REDUCE_FORWARD(&init_map[128].registry); + _register_command_CCV_NNC_COMM_REDUCE_BACKWARD(&init_map[129].registry); + _register_command_CCV_NNC_LEAKY_RELU_FORWARD(&init_map[130].registry); + _register_command_CCV_NNC_LEAKY_RELU_BACKWARD(&init_map[131].registry); + _register_command_CCV_NNC_CMUL_FORWARD(&init_map[132].registry); + _register_command_CCV_NNC_CMUL_BACKWARD(&init_map[133].registry); + _register_command_CCV_NNC_REDUCE_MIN_FORWARD(&init_map[134].registry); + _register_command_CCV_NNC_REDUCE_MIN_BACKWARD(&init_map[135].registry); + _register_command_CCV_NNC_LAYER_NORM_FORWARD(&init_map[136].registry); + _register_command_CCV_NNC_LAYER_NORM_BACKWARD(&init_map[137].registry); - _register_command_CCV_NNC_ADAM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[126].backends[2])); - _register_command_CCV_NNC_ADAM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[127].backends[2])); - _register_command_CCV_NNC_ADAMW_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[92].backends[2])); - _register_command_CCV_NNC_ADAMW_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[93].backends[2])); - _register_command_CCV_NNC_GEMM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[64].backends[2])); - _register_command_CCV_NNC_GEMM_FORWARD_backend_CCV_NNC_BACKEND_CPU_OPT(&(init_map[64].backends[4])); - _register_command_CCV_NNC_GEMM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[65].backends[2])); - _register_command_CCV_NNC_GEMM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_OPT(&(init_map[65].backends[4])); - _register_command_CCV_NNC_ADD_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[68].backends[2])); - _register_command_CCV_NNC_ADD_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[69].backends[2])); - _register_command_CCV_NNC_MUL_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[62].backends[2])); - _register_command_CCV_NNC_MUL_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[63].backends[2])); - _register_command_CCV_NNC_SCALAR_MUL_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[24].backends[2])); - _register_command_CCV_NNC_SCALAR_MUL_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[25].backends[2])); - _register_command_CCV_NNC_CMUL_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[14].backends[2])); - _register_command_CCV_NNC_CMUL_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[15].backends[2])); - _register_command_CCV_NNC_MIN_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[60].backends[2])); - _register_command_CCV_NNC_MIN_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[61].backends[2])); - _register_command_CCV_NNC_MAX_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[114].backends[2])); - _register_command_CCV_NNC_MAX_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[115].backends[2])); - _register_command_CCV_NNC_COMPRESSION_LSSC_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[124].backends[2])); - _register_command_CCV_NNC_COMPRESSION_LSSC_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[125].backends[2])); - _register_command_CCV_NNC_CONVOLUTION_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[132].backends[2])); - _register_command_CCV_NNC_CONVOLUTION_FORWARD_backend_CCV_NNC_BACKEND_CPU_OPT(&(init_map[132].backends[4])); - _register_command_CCV_NNC_CONVOLUTION_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[133].backends[2])); - _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[102].backends[2])); - _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[103].backends[2])); - _register_command_CCV_NNC_DROPOUT_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[94].backends[2])); - _register_command_CCV_NNC_DROPOUT_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[95].backends[2])); - _register_command_CCV_NNC_EWSUM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[22].backends[2])); - _register_command_CCV_NNC_EWSUM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[23].backends[2])); - _register_command_CCV_NNC_EWPROD_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[130].backends[2])); - _register_command_CCV_NNC_EWPROD_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[131].backends[2])); - _register_command_CCV_NNC_EWDIV_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[96].backends[2])); - _register_command_CCV_NNC_EWDIV_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[97].backends[2])); - _register_command_CCV_NNC_EWEXP_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[54].backends[2])); - _register_command_CCV_NNC_EWEXP_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[55].backends[2])); - _register_command_CCV_NNC_EWLOG_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[38].backends[2])); - _register_command_CCV_NNC_EWLOG_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[39].backends[2])); - _register_command_CCV_NNC_EWSQRT_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[74].backends[2])); - _register_command_CCV_NNC_EWSQRT_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[75].backends[2])); - _register_command_CCV_NNC_CLAMP_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[82].backends[2])); - _register_command_CCV_NNC_CLAMP_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[83].backends[2])); - _register_command_CCV_NNC_GELU_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[98].backends[2])); - _register_command_CCV_NNC_GELU_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[99].backends[2])); - _register_command_CCV_NNC_HISTOGRAM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[118].backends[2])); - _register_command_CCV_NNC_HISTOGRAM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[119].backends[2])); - _register_command_CCV_NNC_INDEX_SELECT_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[50].backends[2])); - _register_command_CCV_NNC_INDEX_SELECT_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[51].backends[2])); - _register_command_CCV_NNC_REDUCE_ISNAN_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[136].backends[2])); - _register_command_CCV_NNC_REDUCE_ISNAN_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[137].backends[2])); + _register_command_CCV_NNC_ADAM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[114].backends[2])); + _register_command_CCV_NNC_ADAM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[115].backends[2])); + _register_command_CCV_NNC_ADAMW_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[36].backends[2])); + _register_command_CCV_NNC_ADAMW_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[37].backends[2])); + _register_command_CCV_NNC_GEMM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[60].backends[2])); + _register_command_CCV_NNC_GEMM_FORWARD_backend_CCV_NNC_BACKEND_CPU_OPT(&(init_map[60].backends[4])); + _register_command_CCV_NNC_GEMM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[61].backends[2])); + _register_command_CCV_NNC_GEMM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_OPT(&(init_map[61].backends[4])); + _register_command_CCV_NNC_ADD_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[116].backends[2])); + _register_command_CCV_NNC_ADD_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[117].backends[2])); + _register_command_CCV_NNC_MUL_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[112].backends[2])); + _register_command_CCV_NNC_MUL_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[113].backends[2])); + _register_command_CCV_NNC_SCALAR_MUL_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[108].backends[2])); + _register_command_CCV_NNC_SCALAR_MUL_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[109].backends[2])); + _register_command_CCV_NNC_CMUL_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[132].backends[2])); + _register_command_CCV_NNC_CMUL_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[133].backends[2])); + _register_command_CCV_NNC_MIN_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[34].backends[2])); + _register_command_CCV_NNC_MIN_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[35].backends[2])); + _register_command_CCV_NNC_MAX_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[64].backends[2])); + _register_command_CCV_NNC_MAX_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[65].backends[2])); + _register_command_CCV_NNC_COMPRESSION_LSSC_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[122].backends[2])); + _register_command_CCV_NNC_COMPRESSION_LSSC_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[123].backends[2])); + _register_command_CCV_NNC_CONVOLUTION_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[110].backends[2])); + _register_command_CCV_NNC_CONVOLUTION_FORWARD_backend_CCV_NNC_BACKEND_CPU_OPT(&(init_map[110].backends[4])); + _register_command_CCV_NNC_CONVOLUTION_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[111].backends[2])); + _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[62].backends[2])); + _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[63].backends[2])); + _register_command_CCV_NNC_DROPOUT_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[24].backends[2])); + _register_command_CCV_NNC_DROPOUT_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[25].backends[2])); + _register_command_CCV_NNC_EWSUM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[120].backends[2])); + _register_command_CCV_NNC_EWSUM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[121].backends[2])); + _register_command_CCV_NNC_EWPROD_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[14].backends[2])); + _register_command_CCV_NNC_EWPROD_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[15].backends[2])); + _register_command_CCV_NNC_EWDIV_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[84].backends[2])); + _register_command_CCV_NNC_EWDIV_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[85].backends[2])); + _register_command_CCV_NNC_EWEXP_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[70].backends[2])); + _register_command_CCV_NNC_EWEXP_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[71].backends[2])); + _register_command_CCV_NNC_EWLOG_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[106].backends[2])); + _register_command_CCV_NNC_EWLOG_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[107].backends[2])); + _register_command_CCV_NNC_EWSQRT_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[26].backends[2])); + _register_command_CCV_NNC_EWSQRT_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[27].backends[2])); + _register_command_CCV_NNC_CLAMP_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[46].backends[2])); + _register_command_CCV_NNC_CLAMP_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[47].backends[2])); + _register_command_CCV_NNC_GELU_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[88].backends[2])); + _register_command_CCV_NNC_GELU_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[89].backends[2])); + _register_command_CCV_NNC_HISTOGRAM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[8].backends[2])); + _register_command_CCV_NNC_HISTOGRAM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[9].backends[2])); + _register_command_CCV_NNC_INDEX_SELECT_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[40].backends[2])); + _register_command_CCV_NNC_INDEX_SELECT_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[41].backends[2])); + _register_command_CCV_NNC_REDUCE_ISNAN_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[30].backends[2])); + _register_command_CCV_NNC_REDUCE_ISNAN_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[31].backends[2])); _register_command_CCV_NNC_LAMB_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[66].backends[2])); _register_command_CCV_NNC_LAMB_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[67].backends[2])); - _register_command_CCV_NNC_LEAKY_RELU_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[72].backends[2])); - _register_command_CCV_NNC_LEAKY_RELU_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[73].backends[2])); - _register_command_CCV_NNC_BINARY_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[80].backends[2])); - _register_command_CCV_NNC_BINARY_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[81].backends[2])); - _register_command_CCV_NNC_CATEGORICAL_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[88].backends[2])); - _register_command_CCV_NNC_CATEGORICAL_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[89].backends[2])); - _register_command_CCV_NNC_MSE_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[108].backends[2])); - _register_command_CCV_NNC_MSE_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[109].backends[2])); - _register_command_CCV_NNC_SMOOTH_L1_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[86].backends[2])); - _register_command_CCV_NNC_SMOOTH_L1_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[87].backends[2])); - _register_command_CCV_NNC_NMS_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[56].backends[2])); - _register_command_CCV_NNC_NMS_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[57].backends[2])); + _register_command_CCV_NNC_LEAKY_RELU_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[130].backends[2])); + _register_command_CCV_NNC_LEAKY_RELU_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[131].backends[2])); + _register_command_CCV_NNC_BINARY_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[78].backends[2])); + _register_command_CCV_NNC_BINARY_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[79].backends[2])); + _register_command_CCV_NNC_CATEGORICAL_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[118].backends[2])); + _register_command_CCV_NNC_CATEGORICAL_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[119].backends[2])); + _register_command_CCV_NNC_MSE_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[44].backends[2])); + _register_command_CCV_NNC_MSE_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[45].backends[2])); + _register_command_CCV_NNC_SMOOTH_L1_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[94].backends[2])); + _register_command_CCV_NNC_SMOOTH_L1_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[95].backends[2])); + _register_command_CCV_NNC_NMS_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[22].backends[2])); + _register_command_CCV_NNC_NMS_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[23].backends[2])); _register_command_CCV_NNC_BATCH_NORM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[0].backends[2])); _register_command_CCV_NNC_BATCH_NORM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[1].backends[2])); - _register_command_CCV_NNC_LAYER_NORM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[30].backends[2])); - _register_command_CCV_NNC_LAYER_NORM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[31].backends[2])); - _register_command_CCV_NNC_GROUP_NORM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[46].backends[2])); - _register_command_CCV_NNC_GROUP_NORM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[47].backends[2])); - _register_command_CCV_NNC_RMSNORM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[58].backends[2])); - _register_command_CCV_NNC_RMSNORM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[59].backends[2])); - _register_command_CCV_NNC_PAD_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[122].backends[2])); - _register_command_CCV_NNC_PAD_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[123].backends[2])); - _register_command_CCV_NNC_MAX_POOL_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[128].backends[2])); - _register_command_CCV_NNC_MAX_POOL_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[129].backends[2])); - _register_command_CCV_NNC_AVERAGE_POOL_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[52].backends[2])); - _register_command_CCV_NNC_AVERAGE_POOL_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[53].backends[2])); - _register_command_CCV_NNC_RANDOM_UNIFORM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[48].backends[2])); - _register_command_CCV_NNC_RANDOM_UNIFORM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[49].backends[2])); - _register_command_CCV_NNC_RANDOM_NORMAL_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[20].backends[2])); - _register_command_CCV_NNC_RANDOM_NORMAL_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[21].backends[2])); - _register_command_CCV_NNC_REDUCE_SUM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[110].backends[2])); - _register_command_CCV_NNC_REDUCE_SUM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[111].backends[2])); - _register_command_CCV_NNC_REDUCE_MEAN_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[6].backends[2])); - _register_command_CCV_NNC_REDUCE_MEAN_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[7].backends[2])); - _register_command_CCV_NNC_REDUCE_MAX_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[12].backends[2])); - _register_command_CCV_NNC_REDUCE_MAX_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[13].backends[2])); - _register_command_CCV_NNC_REDUCE_MIN_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[8].backends[2])); - _register_command_CCV_NNC_REDUCE_MIN_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[9].backends[2])); - _register_command_CCV_NNC_REDUCE_NORM2_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[78].backends[2])); - _register_command_CCV_NNC_REDUCE_NORM2_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[79].backends[2])); - _register_command_CCV_NNC_ARGMAX_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[134].backends[2])); - _register_command_CCV_NNC_ARGMAX_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[135].backends[2])); - _register_command_CCV_NNC_ARGMIN_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[4].backends[2])); - _register_command_CCV_NNC_ARGMIN_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[5].backends[2])); - _register_command_CCV_NNC_RELU_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[76].backends[2])); - _register_command_CCV_NNC_RELU_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[77].backends[2])); - _register_command_CCV_NNC_RMSPROP_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[42].backends[2])); - _register_command_CCV_NNC_RMSPROP_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[43].backends[2])); - _register_command_CCV_NNC_ROI_ALIGN_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[34].backends[2])); - _register_command_CCV_NNC_ROI_ALIGN_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[35].backends[2])); - _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[28].backends[2])); - _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[29].backends[2])); - _register_command_CCV_NNC_SGD_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[44].backends[2])); - _register_command_CCV_NNC_SGD_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[45].backends[2])); - _register_command_CCV_NNC_SIGMOID_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[84].backends[2])); - _register_command_CCV_NNC_SIGMOID_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[85].backends[2])); - _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[120].backends[2])); - _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[121].backends[2])); - _register_command_CCV_NNC_SOFTMAX_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[100].backends[2])); - _register_command_CCV_NNC_SOFTMAX_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[101].backends[2])); - _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[116].backends[2])); - _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[117].backends[2])); - _register_command_CCV_NNC_SWISH_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[32].backends[2])); - _register_command_CCV_NNC_SWISH_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[33].backends[2])); - _register_command_CCV_NNC_TANH_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[106].backends[2])); - _register_command_CCV_NNC_TANH_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[107].backends[2])); - _register_command_CCV_NNC_UPSAMPLE_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[70].backends[2])); - _register_command_CCV_NNC_UPSAMPLE_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[71].backends[2])); - _register_command_CCV_NNC_SET_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[26].backends[2])); - _register_command_CCV_NNC_SET_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[27].backends[2])); - _register_command_CCV_NNC_MASKED_FILL_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[40].backends[2])); - _register_command_CCV_NNC_MASKED_FILL_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[41].backends[2])); - _register_command_CCV_NNC_DATA_TRANSFER_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[90].backends[2])); - _register_command_CCV_NNC_DATA_TRANSFER_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[91].backends[2])); - _register_command_CCV_NNC_FORMAT_TRANSFORM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[16].backends[2])); - _register_command_CCV_NNC_FORMAT_TRANSFORM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[17].backends[2])); - _register_command_CCV_NNC_TRANSPOSE_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[36].backends[2])); - _register_command_CCV_NNC_TRANSPOSE_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[37].backends[2])); - _register_command_CCV_NNC_DATATYPE_CONVERSION_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[18].backends[2])); - _register_command_CCV_NNC_DATATYPE_CONVERSION_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[19].backends[2])); + _register_command_CCV_NNC_LAYER_NORM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[136].backends[2])); + _register_command_CCV_NNC_LAYER_NORM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[137].backends[2])); + _register_command_CCV_NNC_GROUP_NORM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[100].backends[2])); + _register_command_CCV_NNC_GROUP_NORM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[101].backends[2])); + _register_command_CCV_NNC_RMSNORM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[86].backends[2])); + _register_command_CCV_NNC_RMSNORM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[87].backends[2])); + _register_command_CCV_NNC_PAD_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[12].backends[2])); + _register_command_CCV_NNC_PAD_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[13].backends[2])); + _register_command_CCV_NNC_MAX_POOL_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[92].backends[2])); + _register_command_CCV_NNC_MAX_POOL_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[93].backends[2])); + _register_command_CCV_NNC_AVERAGE_POOL_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[104].backends[2])); + _register_command_CCV_NNC_AVERAGE_POOL_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[105].backends[2])); + _register_command_CCV_NNC_RANDOM_UNIFORM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[58].backends[2])); + _register_command_CCV_NNC_RANDOM_UNIFORM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[59].backends[2])); + _register_command_CCV_NNC_RANDOM_NORMAL_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[56].backends[2])); + _register_command_CCV_NNC_RANDOM_NORMAL_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[57].backends[2])); + _register_command_CCV_NNC_REDUCE_SUM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[68].backends[2])); + _register_command_CCV_NNC_REDUCE_SUM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[69].backends[2])); + _register_command_CCV_NNC_REDUCE_MEAN_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[102].backends[2])); + _register_command_CCV_NNC_REDUCE_MEAN_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[103].backends[2])); + _register_command_CCV_NNC_REDUCE_MAX_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[10].backends[2])); + _register_command_CCV_NNC_REDUCE_MAX_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[11].backends[2])); + _register_command_CCV_NNC_REDUCE_MIN_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[134].backends[2])); + _register_command_CCV_NNC_REDUCE_MIN_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[135].backends[2])); + _register_command_CCV_NNC_REDUCE_NORM2_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[54].backends[2])); + _register_command_CCV_NNC_REDUCE_NORM2_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[55].backends[2])); + _register_command_CCV_NNC_ARGMAX_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[82].backends[2])); + _register_command_CCV_NNC_ARGMAX_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[83].backends[2])); + _register_command_CCV_NNC_ARGMIN_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[124].backends[2])); + _register_command_CCV_NNC_ARGMIN_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[125].backends[2])); + _register_command_CCV_NNC_RELU_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[20].backends[2])); + _register_command_CCV_NNC_RELU_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[21].backends[2])); + _register_command_CCV_NNC_RMSPROP_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[80].backends[2])); + _register_command_CCV_NNC_RMSPROP_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[81].backends[2])); + _register_command_CCV_NNC_ROI_ALIGN_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[72].backends[2])); + _register_command_CCV_NNC_ROI_ALIGN_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[73].backends[2])); + _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[18].backends[2])); + _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[19].backends[2])); + _register_command_CCV_NNC_SGD_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[16].backends[2])); + _register_command_CCV_NNC_SGD_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[17].backends[2])); + _register_command_CCV_NNC_SIGMOID_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[96].backends[2])); + _register_command_CCV_NNC_SIGMOID_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[97].backends[2])); + _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[74].backends[2])); + _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[75].backends[2])); + _register_command_CCV_NNC_SOFTMAX_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[50].backends[2])); + _register_command_CCV_NNC_SOFTMAX_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[51].backends[2])); + _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[42].backends[2])); + _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[43].backends[2])); + _register_command_CCV_NNC_SWISH_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[90].backends[2])); + _register_command_CCV_NNC_SWISH_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[91].backends[2])); + _register_command_CCV_NNC_TANH_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[28].backends[2])); + _register_command_CCV_NNC_TANH_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[29].backends[2])); + _register_command_CCV_NNC_UPSAMPLE_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[48].backends[2])); + _register_command_CCV_NNC_UPSAMPLE_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[49].backends[2])); + _register_command_CCV_NNC_SET_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[4].backends[2])); + _register_command_CCV_NNC_SET_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[5].backends[2])); + _register_command_CCV_NNC_MASKED_FILL_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[76].backends[2])); + _register_command_CCV_NNC_MASKED_FILL_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[77].backends[2])); + _register_command_CCV_NNC_DATA_TRANSFER_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[6].backends[2])); + _register_command_CCV_NNC_DATA_TRANSFER_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[7].backends[2])); + _register_command_CCV_NNC_FORMAT_TRANSFORM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[98].backends[2])); + _register_command_CCV_NNC_FORMAT_TRANSFORM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[99].backends[2])); + _register_command_CCV_NNC_TRANSPOSE_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[52].backends[2])); + _register_command_CCV_NNC_TRANSPOSE_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[53].backends[2])); + _register_command_CCV_NNC_DATATYPE_CONVERSION_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[32].backends[2])); + _register_command_CCV_NNC_DATATYPE_CONVERSION_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(&(init_map[33].backends[2])); #ifdef HAVE_CUDA - _register_command_CCV_NNC_ADAM_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[126].backends[5])); - _register_command_CCV_NNC_ADAM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[127].backends[5])); - _register_command_CCV_NNC_ADAMW_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[92].backends[5])); - _register_command_CCV_NNC_ADAMW_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[93].backends[5])); - _register_command_CCV_NNC_GEMM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUBLAS(&(init_map[64].backends[0])); - _register_command_CCV_NNC_GEMM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUBLAS(&(init_map[65].backends[0])); - _register_command_CCV_NNC_ADD_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[68].backends[3])); - _register_command_CCV_NNC_ADD_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[69].backends[3])); - _register_command_CCV_NNC_MUL_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[62].backends[3])); - _register_command_CCV_NNC_MUL_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[63].backends[3])); - _register_command_CCV_NNC_SCALAR_MUL_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[24].backends[3])); - _register_command_CCV_NNC_SCALAR_MUL_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[25].backends[3])); - _register_command_CCV_NNC_CMUL_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[14].backends[5])); - _register_command_CCV_NNC_CMUL_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[15].backends[5])); - _register_command_CCV_NNC_COMM_ALLREDUCE_FORWARD_backend_CCV_NNC_BACKEND_GPU_NCCL(&(init_map[2].backends[1])); - _register_command_CCV_NNC_COMM_ALLREDUCE_BACKWARD_backend_CCV_NNC_BACKEND_GPU_NCCL(&(init_map[3].backends[1])); - _register_command_CCV_NNC_COMM_BROADCAST_FORWARD_backend_CCV_NNC_BACKEND_GPU_NCCL(&(init_map[112].backends[1])); - _register_command_CCV_NNC_COMM_BROADCAST_BACKWARD_backend_CCV_NNC_BACKEND_GPU_NCCL(&(init_map[113].backends[1])); - _register_command_CCV_NNC_COMM_REDUCE_FORWARD_backend_CCV_NNC_BACKEND_GPU_NCCL(&(init_map[104].backends[1])); - _register_command_CCV_NNC_COMM_REDUCE_BACKWARD_backend_CCV_NNC_BACKEND_GPU_NCCL(&(init_map[105].backends[1])); - _register_command_CCV_NNC_MIN_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[60].backends[5])); - _register_command_CCV_NNC_MIN_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[61].backends[5])); - _register_command_CCV_NNC_MAX_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[114].backends[5])); - _register_command_CCV_NNC_MAX_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[115].backends[5])); - _register_command_CCV_NNC_COMPRESSION_LSSC_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[124].backends[5])); - _register_command_CCV_NNC_COMPRESSION_LSSC_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[125].backends[5])); - _register_command_CCV_NNC_CONVOLUTION_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[132].backends[3])); - _register_command_CCV_NNC_CONVOLUTION_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[133].backends[3])); - _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[102].backends[3])); - _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[103].backends[3])); - _register_command_CCV_NNC_DROPOUT_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[94].backends[3])); - _register_command_CCV_NNC_DROPOUT_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[95].backends[3])); - _register_command_CCV_NNC_EWSUM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[22].backends[3])); - _register_command_CCV_NNC_EWSUM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[23].backends[3])); - _register_command_CCV_NNC_EWDIV_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[96].backends[5])); - _register_command_CCV_NNC_EWDIV_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[97].backends[5])); - _register_command_CCV_NNC_EWEXP_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[54].backends[5])); - _register_command_CCV_NNC_EWEXP_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[55].backends[5])); - _register_command_CCV_NNC_EWLOG_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[38].backends[5])); - _register_command_CCV_NNC_EWLOG_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[39].backends[5])); - _register_command_CCV_NNC_EWSQRT_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[74].backends[5])); - _register_command_CCV_NNC_EWSQRT_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[75].backends[5])); - _register_command_CCV_NNC_CLAMP_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[82].backends[5])); - _register_command_CCV_NNC_CLAMP_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[83].backends[5])); - _register_command_CCV_NNC_GELU_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[98].backends[5])); - _register_command_CCV_NNC_GELU_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[99].backends[5])); - _register_command_CCV_NNC_INDEX_SELECT_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[50].backends[5])); - _register_command_CCV_NNC_INDEX_SELECT_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[51].backends[5])); - _register_command_CCV_NNC_REDUCE_ISNAN_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[136].backends[3])); - _register_command_CCV_NNC_REDUCE_ISNAN_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[137].backends[3])); + _register_command_CCV_NNC_ADAM_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[114].backends[5])); + _register_command_CCV_NNC_ADAM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[115].backends[5])); + _register_command_CCV_NNC_ADAMW_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[36].backends[5])); + _register_command_CCV_NNC_ADAMW_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[37].backends[5])); + _register_command_CCV_NNC_GEMM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUBLAS(&(init_map[60].backends[0])); + _register_command_CCV_NNC_GEMM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUBLAS(&(init_map[61].backends[0])); + _register_command_CCV_NNC_ADD_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[116].backends[3])); + _register_command_CCV_NNC_ADD_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[117].backends[3])); + _register_command_CCV_NNC_MUL_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[112].backends[3])); + _register_command_CCV_NNC_MUL_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[113].backends[3])); + _register_command_CCV_NNC_SCALAR_MUL_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[108].backends[3])); + _register_command_CCV_NNC_SCALAR_MUL_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[109].backends[3])); + _register_command_CCV_NNC_CMUL_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[132].backends[5])); + _register_command_CCV_NNC_CMUL_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[133].backends[5])); + _register_command_CCV_NNC_COMM_ALLREDUCE_FORWARD_backend_CCV_NNC_BACKEND_GPU_NCCL(&(init_map[38].backends[1])); + _register_command_CCV_NNC_COMM_ALLREDUCE_BACKWARD_backend_CCV_NNC_BACKEND_GPU_NCCL(&(init_map[39].backends[1])); + _register_command_CCV_NNC_COMM_BROADCAST_FORWARD_backend_CCV_NNC_BACKEND_GPU_NCCL(&(init_map[126].backends[1])); + _register_command_CCV_NNC_COMM_BROADCAST_BACKWARD_backend_CCV_NNC_BACKEND_GPU_NCCL(&(init_map[127].backends[1])); + _register_command_CCV_NNC_COMM_REDUCE_FORWARD_backend_CCV_NNC_BACKEND_GPU_NCCL(&(init_map[128].backends[1])); + _register_command_CCV_NNC_COMM_REDUCE_BACKWARD_backend_CCV_NNC_BACKEND_GPU_NCCL(&(init_map[129].backends[1])); + _register_command_CCV_NNC_MIN_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[34].backends[5])); + _register_command_CCV_NNC_MIN_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[35].backends[5])); + _register_command_CCV_NNC_MAX_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[64].backends[5])); + _register_command_CCV_NNC_MAX_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[65].backends[5])); + _register_command_CCV_NNC_COMPRESSION_LSSC_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[122].backends[5])); + _register_command_CCV_NNC_COMPRESSION_LSSC_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[123].backends[5])); + _register_command_CCV_NNC_CONVOLUTION_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[110].backends[3])); + _register_command_CCV_NNC_CONVOLUTION_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[111].backends[3])); + _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[62].backends[3])); + _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[63].backends[3])); + _register_command_CCV_NNC_DROPOUT_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[24].backends[3])); + _register_command_CCV_NNC_DROPOUT_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[25].backends[3])); + _register_command_CCV_NNC_EWSUM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[120].backends[3])); + _register_command_CCV_NNC_EWSUM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[121].backends[3])); + _register_command_CCV_NNC_EWDIV_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[84].backends[5])); + _register_command_CCV_NNC_EWDIV_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[85].backends[5])); + _register_command_CCV_NNC_EWEXP_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[70].backends[5])); + _register_command_CCV_NNC_EWEXP_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[71].backends[5])); + _register_command_CCV_NNC_EWLOG_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[106].backends[5])); + _register_command_CCV_NNC_EWLOG_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[107].backends[5])); + _register_command_CCV_NNC_EWSQRT_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[26].backends[5])); + _register_command_CCV_NNC_EWSQRT_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[27].backends[5])); + _register_command_CCV_NNC_CLAMP_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[46].backends[5])); + _register_command_CCV_NNC_CLAMP_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[47].backends[5])); + _register_command_CCV_NNC_GELU_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[88].backends[5])); + _register_command_CCV_NNC_GELU_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[89].backends[5])); + _register_command_CCV_NNC_INDEX_SELECT_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[40].backends[5])); + _register_command_CCV_NNC_INDEX_SELECT_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[41].backends[5])); + _register_command_CCV_NNC_REDUCE_ISNAN_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[30].backends[3])); + _register_command_CCV_NNC_REDUCE_ISNAN_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[31].backends[3])); _register_command_CCV_NNC_LAMB_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[66].backends[5])); _register_command_CCV_NNC_LAMB_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[67].backends[5])); - _register_command_CCV_NNC_LEAKY_RELU_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[72].backends[5])); - _register_command_CCV_NNC_LEAKY_RELU_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[73].backends[5])); - _register_command_CCV_NNC_BINARY_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[80].backends[5])); - _register_command_CCV_NNC_BINARY_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[81].backends[5])); - _register_command_CCV_NNC_CATEGORICAL_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[88].backends[5])); - _register_command_CCV_NNC_CATEGORICAL_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[89].backends[5])); - _register_command_CCV_NNC_MSE_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[108].backends[5])); - _register_command_CCV_NNC_MSE_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[109].backends[5])); - _register_command_CCV_NNC_SMOOTH_L1_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[86].backends[5])); - _register_command_CCV_NNC_SMOOTH_L1_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[87].backends[5])); - _register_command_CCV_NNC_NMS_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[56].backends[5])); - _register_command_CCV_NNC_NMS_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[57].backends[5])); + _register_command_CCV_NNC_LEAKY_RELU_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[130].backends[5])); + _register_command_CCV_NNC_LEAKY_RELU_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[131].backends[5])); + _register_command_CCV_NNC_BINARY_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[78].backends[5])); + _register_command_CCV_NNC_BINARY_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[79].backends[5])); + _register_command_CCV_NNC_CATEGORICAL_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[118].backends[5])); + _register_command_CCV_NNC_CATEGORICAL_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[119].backends[5])); + _register_command_CCV_NNC_MSE_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[44].backends[5])); + _register_command_CCV_NNC_MSE_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[45].backends[5])); + _register_command_CCV_NNC_SMOOTH_L1_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[94].backends[5])); + _register_command_CCV_NNC_SMOOTH_L1_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[95].backends[5])); + _register_command_CCV_NNC_NMS_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[22].backends[5])); + _register_command_CCV_NNC_NMS_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[23].backends[5])); _register_command_CCV_NNC_BATCH_NORM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[0].backends[3])); _register_command_CCV_NNC_BATCH_NORM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[1].backends[3])); - _register_command_CCV_NNC_LAYER_NORM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[30].backends[3])); - _register_command_CCV_NNC_LAYER_NORM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[31].backends[3])); - _register_command_CCV_NNC_GROUP_NORM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[46].backends[3])); - _register_command_CCV_NNC_GROUP_NORM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[47].backends[3])); - _register_command_CCV_NNC_RMSNORM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[58].backends[3])); - _register_command_CCV_NNC_RMSNORM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[59].backends[3])); - _register_command_CCV_NNC_PAD_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[122].backends[5])); - _register_command_CCV_NNC_PAD_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[123].backends[5])); - _register_command_CCV_NNC_MAX_POOL_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[128].backends[3])); - _register_command_CCV_NNC_MAX_POOL_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[129].backends[3])); - _register_command_CCV_NNC_AVERAGE_POOL_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[52].backends[3])); - _register_command_CCV_NNC_AVERAGE_POOL_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[53].backends[3])); - _register_command_CCV_NNC_RANDOM_UNIFORM_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[48].backends[5])); - _register_command_CCV_NNC_RANDOM_UNIFORM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[49].backends[5])); - _register_command_CCV_NNC_RANDOM_NORMAL_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[20].backends[5])); - _register_command_CCV_NNC_RANDOM_NORMAL_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[21].backends[5])); - _register_command_CCV_NNC_REDUCE_SUM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[110].backends[3])); - _register_command_CCV_NNC_REDUCE_SUM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[111].backends[3])); - _register_command_CCV_NNC_REDUCE_MEAN_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[6].backends[3])); - _register_command_CCV_NNC_REDUCE_MEAN_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[7].backends[3])); - _register_command_CCV_NNC_REDUCE_NORM2_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[78].backends[3])); - _register_command_CCV_NNC_REDUCE_NORM2_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[79].backends[3])); - _register_command_CCV_NNC_ARGMAX_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[134].backends[5])); - _register_command_CCV_NNC_ARGMAX_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[135].backends[5])); - _register_command_CCV_NNC_ARGMIN_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[4].backends[5])); - _register_command_CCV_NNC_ARGMIN_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[5].backends[5])); - _register_command_CCV_NNC_RELU_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[76].backends[3])); - _register_command_CCV_NNC_RELU_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[77].backends[3])); - _register_command_CCV_NNC_RMSPROP_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[42].backends[5])); - _register_command_CCV_NNC_RMSPROP_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[43].backends[5])); - _register_command_CCV_NNC_LSTM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[10].backends[3])); - _register_command_CCV_NNC_LSTM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[11].backends[3])); - _register_command_CCV_NNC_ROI_ALIGN_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[34].backends[5])); - _register_command_CCV_NNC_ROI_ALIGN_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[35].backends[5])); - _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[28].backends[5])); - _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[29].backends[5])); - _register_command_CCV_NNC_SGD_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[44].backends[5])); - _register_command_CCV_NNC_SGD_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[45].backends[5])); - _register_command_CCV_NNC_SIGMOID_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[84].backends[3])); - _register_command_CCV_NNC_SIGMOID_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[85].backends[3])); - _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[120].backends[5])); - _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[121].backends[5])); - _register_command_CCV_NNC_SOFTMAX_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[100].backends[3])); - _register_command_CCV_NNC_SOFTMAX_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[101].backends[3])); - _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[116].backends[3])); - _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[117].backends[3])); - _register_command_CCV_NNC_SWISH_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[32].backends[5])); - _register_command_CCV_NNC_SWISH_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[33].backends[5])); - _register_command_CCV_NNC_TANH_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[106].backends[3])); - _register_command_CCV_NNC_TANH_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[107].backends[3])); - _register_command_CCV_NNC_UPSAMPLE_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[70].backends[5])); - _register_command_CCV_NNC_UPSAMPLE_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[71].backends[5])); - _register_command_CCV_NNC_SET_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[26].backends[3])); - _register_command_CCV_NNC_SET_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[27].backends[3])); - _register_command_CCV_NNC_MASKED_FILL_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[40].backends[5])); - _register_command_CCV_NNC_MASKED_FILL_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[41].backends[5])); - _register_command_CCV_NNC_DATA_TRANSFER_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[90].backends[5])); - _register_command_CCV_NNC_DATA_TRANSFER_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[91].backends[5])); - _register_command_CCV_NNC_FORMAT_TRANSFORM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[16].backends[3])); - _register_command_CCV_NNC_FORMAT_TRANSFORM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[17].backends[3])); - _register_command_CCV_NNC_TRANSPOSE_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[36].backends[3])); - _register_command_CCV_NNC_TRANSPOSE_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[37].backends[3])); - _register_command_CCV_NNC_DATATYPE_CONVERSION_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[18].backends[5])); - _register_command_CCV_NNC_DATATYPE_CONVERSION_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[19].backends[5])); + _register_command_CCV_NNC_LAYER_NORM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[136].backends[3])); + _register_command_CCV_NNC_LAYER_NORM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[137].backends[3])); + _register_command_CCV_NNC_GROUP_NORM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[100].backends[3])); + _register_command_CCV_NNC_GROUP_NORM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[101].backends[3])); + _register_command_CCV_NNC_RMSNORM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[86].backends[3])); + _register_command_CCV_NNC_RMSNORM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[87].backends[3])); + _register_command_CCV_NNC_PAD_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[12].backends[5])); + _register_command_CCV_NNC_PAD_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[13].backends[5])); + _register_command_CCV_NNC_MAX_POOL_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[92].backends[3])); + _register_command_CCV_NNC_MAX_POOL_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[93].backends[3])); + _register_command_CCV_NNC_AVERAGE_POOL_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[104].backends[3])); + _register_command_CCV_NNC_AVERAGE_POOL_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[105].backends[3])); + _register_command_CCV_NNC_RANDOM_UNIFORM_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[58].backends[5])); + _register_command_CCV_NNC_RANDOM_UNIFORM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[59].backends[5])); + _register_command_CCV_NNC_RANDOM_NORMAL_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[56].backends[5])); + _register_command_CCV_NNC_RANDOM_NORMAL_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[57].backends[5])); + _register_command_CCV_NNC_REDUCE_SUM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[68].backends[3])); + _register_command_CCV_NNC_REDUCE_SUM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[69].backends[3])); + _register_command_CCV_NNC_REDUCE_MEAN_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[102].backends[3])); + _register_command_CCV_NNC_REDUCE_MEAN_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[103].backends[3])); + _register_command_CCV_NNC_REDUCE_NORM2_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[54].backends[3])); + _register_command_CCV_NNC_REDUCE_NORM2_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[55].backends[3])); + _register_command_CCV_NNC_ARGMAX_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[82].backends[5])); + _register_command_CCV_NNC_ARGMAX_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[83].backends[5])); + _register_command_CCV_NNC_ARGMIN_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[124].backends[5])); + _register_command_CCV_NNC_ARGMIN_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[125].backends[5])); + _register_command_CCV_NNC_RELU_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[20].backends[3])); + _register_command_CCV_NNC_RELU_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[21].backends[3])); + _register_command_CCV_NNC_RMSPROP_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[80].backends[5])); + _register_command_CCV_NNC_RMSPROP_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[81].backends[5])); + _register_command_CCV_NNC_LSTM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[2].backends[3])); + _register_command_CCV_NNC_LSTM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[3].backends[3])); + _register_command_CCV_NNC_ROI_ALIGN_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[72].backends[5])); + _register_command_CCV_NNC_ROI_ALIGN_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[73].backends[5])); + _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[18].backends[5])); + _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[19].backends[5])); + _register_command_CCV_NNC_SGD_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[16].backends[5])); + _register_command_CCV_NNC_SGD_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[17].backends[5])); + _register_command_CCV_NNC_SIGMOID_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[96].backends[3])); + _register_command_CCV_NNC_SIGMOID_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[97].backends[3])); + _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[74].backends[5])); + _register_command_CCV_NNC_SIGMOID_BINARY_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[75].backends[5])); + _register_command_CCV_NNC_SOFTMAX_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[50].backends[3])); + _register_command_CCV_NNC_SOFTMAX_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[51].backends[3])); + _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[42].backends[3])); + _register_command_CCV_NNC_SOFTMAX_CROSSENTROPY_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[43].backends[3])); + _register_command_CCV_NNC_SWISH_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[90].backends[5])); + _register_command_CCV_NNC_SWISH_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[91].backends[5])); + _register_command_CCV_NNC_TANH_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[28].backends[3])); + _register_command_CCV_NNC_TANH_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[29].backends[3])); + _register_command_CCV_NNC_UPSAMPLE_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[48].backends[5])); + _register_command_CCV_NNC_UPSAMPLE_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[49].backends[5])); + _register_command_CCV_NNC_SET_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[4].backends[3])); + _register_command_CCV_NNC_SET_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[5].backends[3])); + _register_command_CCV_NNC_MASKED_FILL_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[76].backends[5])); + _register_command_CCV_NNC_MASKED_FILL_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[77].backends[5])); + _register_command_CCV_NNC_DATA_TRANSFER_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[6].backends[5])); + _register_command_CCV_NNC_DATA_TRANSFER_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[7].backends[5])); + _register_command_CCV_NNC_FORMAT_TRANSFORM_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[98].backends[3])); + _register_command_CCV_NNC_FORMAT_TRANSFORM_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[99].backends[3])); + _register_command_CCV_NNC_TRANSPOSE_FORWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[52].backends[3])); + _register_command_CCV_NNC_TRANSPOSE_BACKWARD_backend_CCV_NNC_BACKEND_GPU_CUDNN(&(init_map[53].backends[3])); + _register_command_CCV_NNC_DATATYPE_CONVERSION_FORWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[32].backends[5])); + _register_command_CCV_NNC_DATATYPE_CONVERSION_BACKWARD_backend_CCV_NNC_BACKEND_GPU_REF(&(init_map[33].backends[5])); #endif #ifdef HAVE_MPS - _register_command_CCV_NNC_ADAM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[126].backends[6])); - _register_command_CCV_NNC_ADAMW_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[92].backends[6])); - _register_command_CCV_NNC_GEMM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[64].backends[6])); - _register_command_CCV_NNC_GEMM_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[65].backends[6])); - _register_command_CCV_NNC_ADD_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[68].backends[6])); - _register_command_CCV_NNC_ADD_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[69].backends[6])); - _register_command_CCV_NNC_MUL_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[62].backends[6])); - _register_command_CCV_NNC_MUL_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[63].backends[6])); - _register_command_CCV_NNC_SCALAR_MUL_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[24].backends[6])); - _register_command_CCV_NNC_SCALAR_MUL_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[25].backends[6])); - _register_command_CCV_NNC_CMUL_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[14].backends[6])); - _register_command_CCV_NNC_CONVOLUTION_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[132].backends[6])); - _register_command_CCV_NNC_CONVOLUTION_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[133].backends[6])); - _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[102].backends[6])); - _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[103].backends[6])); - _register_command_CCV_NNC_EWSUM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[22].backends[6])); - _register_command_CCV_NNC_EWDIV_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[96].backends[6])); - _register_command_CCV_NNC_EWEXP_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[54].backends[6])); - _register_command_CCV_NNC_EWLOG_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[38].backends[6])); - _register_command_CCV_NNC_EWSQRT_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[74].backends[6])); - _register_command_CCV_NNC_CLAMP_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[82].backends[6])); - _register_command_CCV_NNC_GELU_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[98].backends[6])); - _register_command_CCV_NNC_GELU_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[99].backends[6])); - _register_command_CCV_NNC_INDEX_SELECT_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[50].backends[6])); - _register_command_CCV_NNC_REDUCE_ISNAN_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[136].backends[6])); - _register_command_CCV_NNC_LEAKY_RELU_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[72].backends[6])); - _register_command_CCV_NNC_LEAKY_RELU_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[73].backends[6])); - _register_command_CCV_NNC_MSE_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[108].backends[6])); - _register_command_CCV_NNC_MSE_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[109].backends[6])); - _register_command_CCV_NNC_LAYER_NORM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[30].backends[6])); - _register_command_CCV_NNC_LAYER_NORM_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[31].backends[6])); - _register_command_CCV_NNC_GROUP_NORM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[46].backends[6])); - _register_command_CCV_NNC_GROUP_NORM_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[47].backends[6])); - _register_command_CCV_NNC_RMSNORM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[58].backends[6])); - _register_command_CCV_NNC_RMSNORM_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[59].backends[6])); - _register_command_CCV_NNC_PAD_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[122].backends[6])); - _register_command_CCV_NNC_PAD_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[123].backends[6])); - _register_command_CCV_NNC_MAX_POOL_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[128].backends[6])); - _register_command_CCV_NNC_AVERAGE_POOL_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[52].backends[6])); - _register_command_CCV_NNC_RANDOM_UNIFORM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[48].backends[6])); - _register_command_CCV_NNC_RANDOM_NORMAL_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[20].backends[6])); - _register_command_CCV_NNC_REDUCE_SUM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[110].backends[6])); - _register_command_CCV_NNC_REDUCE_MEAN_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[6].backends[6])); - _register_command_CCV_NNC_REDUCE_MEAN_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[7].backends[6])); - _register_command_CCV_NNC_REDUCE_MAX_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[12].backends[6])); - _register_command_CCV_NNC_REDUCE_MIN_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[8].backends[6])); - _register_command_CCV_NNC_REDUCE_NORM2_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[78].backends[6])); - _register_command_CCV_NNC_REDUCE_NORM2_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[79].backends[6])); - _register_command_CCV_NNC_ARGMAX_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[134].backends[6])); - _register_command_CCV_NNC_ARGMIN_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[4].backends[6])); - _register_command_CCV_NNC_RELU_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[76].backends[6])); - _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[28].backends[6])); - _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[29].backends[6])); - _register_command_CCV_NNC_SIGMOID_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[84].backends[6])); - _register_command_CCV_NNC_SIGMOID_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[85].backends[6])); - _register_command_CCV_NNC_SOFTMAX_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[100].backends[6])); - _register_command_CCV_NNC_SOFTMAX_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[101].backends[6])); - _register_command_CCV_NNC_SWISH_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[32].backends[6])); - _register_command_CCV_NNC_SWISH_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[33].backends[6])); - _register_command_CCV_NNC_UPSAMPLE_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[70].backends[6])); - _register_command_CCV_NNC_UPSAMPLE_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[71].backends[6])); - _register_command_CCV_NNC_SET_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[26].backends[6])); - _register_command_CCV_NNC_SET_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[27].backends[6])); - _register_command_CCV_NNC_DATA_TRANSFER_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[90].backends[6])); - _register_command_CCV_NNC_DATA_TRANSFER_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[91].backends[6])); - _register_command_CCV_NNC_FORMAT_TRANSFORM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[16].backends[6])); - _register_command_CCV_NNC_FORMAT_TRANSFORM_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[17].backends[6])); - _register_command_CCV_NNC_TRANSPOSE_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[36].backends[6])); - _register_command_CCV_NNC_TRANSPOSE_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[37].backends[6])); - _register_command_CCV_NNC_DATATYPE_CONVERSION_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[18].backends[6])); - _register_command_CCV_NNC_DATATYPE_CONVERSION_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[19].backends[6])); + _register_command_CCV_NNC_ADAM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[114].backends[6])); + _register_command_CCV_NNC_ADAMW_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[36].backends[6])); + _register_command_CCV_NNC_GEMM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[60].backends[6])); + _register_command_CCV_NNC_GEMM_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[61].backends[6])); + _register_command_CCV_NNC_ADD_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[116].backends[6])); + _register_command_CCV_NNC_ADD_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[117].backends[6])); + _register_command_CCV_NNC_MUL_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[112].backends[6])); + _register_command_CCV_NNC_MUL_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[113].backends[6])); + _register_command_CCV_NNC_SCALAR_MUL_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[108].backends[6])); + _register_command_CCV_NNC_SCALAR_MUL_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[109].backends[6])); + _register_command_CCV_NNC_CMUL_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[132].backends[6])); + _register_command_CCV_NNC_CMUL_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[133].backends[6])); + _register_command_CCV_NNC_CONVOLUTION_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[110].backends[6])); + _register_command_CCV_NNC_CONVOLUTION_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[111].backends[6])); + _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[62].backends[6])); + _register_command_CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[63].backends[6])); + _register_command_CCV_NNC_EWSUM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[120].backends[6])); + _register_command_CCV_NNC_EWDIV_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[84].backends[6])); + _register_command_CCV_NNC_EWEXP_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[70].backends[6])); + _register_command_CCV_NNC_EWLOG_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[106].backends[6])); + _register_command_CCV_NNC_EWSQRT_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[26].backends[6])); + _register_command_CCV_NNC_CLAMP_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[46].backends[6])); + _register_command_CCV_NNC_GELU_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[88].backends[6])); + _register_command_CCV_NNC_GELU_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[89].backends[6])); + _register_command_CCV_NNC_INDEX_SELECT_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[40].backends[6])); + _register_command_CCV_NNC_REDUCE_ISNAN_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[30].backends[6])); + _register_command_CCV_NNC_LEAKY_RELU_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[130].backends[6])); + _register_command_CCV_NNC_LEAKY_RELU_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[131].backends[6])); + _register_command_CCV_NNC_MSE_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[44].backends[6])); + _register_command_CCV_NNC_MSE_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[45].backends[6])); + _register_command_CCV_NNC_LAYER_NORM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[136].backends[6])); + _register_command_CCV_NNC_LAYER_NORM_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[137].backends[6])); + _register_command_CCV_NNC_GROUP_NORM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[100].backends[6])); + _register_command_CCV_NNC_GROUP_NORM_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[101].backends[6])); + _register_command_CCV_NNC_RMSNORM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[86].backends[6])); + _register_command_CCV_NNC_RMSNORM_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[87].backends[6])); + _register_command_CCV_NNC_PAD_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[12].backends[6])); + _register_command_CCV_NNC_PAD_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[13].backends[6])); + _register_command_CCV_NNC_MAX_POOL_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[92].backends[6])); + _register_command_CCV_NNC_AVERAGE_POOL_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[104].backends[6])); + _register_command_CCV_NNC_RANDOM_UNIFORM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[58].backends[6])); + _register_command_CCV_NNC_RANDOM_NORMAL_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[56].backends[6])); + _register_command_CCV_NNC_REDUCE_SUM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[68].backends[6])); + _register_command_CCV_NNC_REDUCE_MEAN_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[102].backends[6])); + _register_command_CCV_NNC_REDUCE_MEAN_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[103].backends[6])); + _register_command_CCV_NNC_REDUCE_MAX_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[10].backends[6])); + _register_command_CCV_NNC_REDUCE_MIN_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[134].backends[6])); + _register_command_CCV_NNC_REDUCE_NORM2_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[54].backends[6])); + _register_command_CCV_NNC_REDUCE_NORM2_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[55].backends[6])); + _register_command_CCV_NNC_ARGMAX_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[82].backends[6])); + _register_command_CCV_NNC_ARGMIN_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[124].backends[6])); + _register_command_CCV_NNC_RELU_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[20].backends[6])); + _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[18].backends[6])); + _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[19].backends[6])); + _register_command_CCV_NNC_SIGMOID_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[96].backends[6])); + _register_command_CCV_NNC_SIGMOID_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[97].backends[6])); + _register_command_CCV_NNC_SOFTMAX_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[50].backends[6])); + _register_command_CCV_NNC_SOFTMAX_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[51].backends[6])); + _register_command_CCV_NNC_SWISH_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[90].backends[6])); + _register_command_CCV_NNC_SWISH_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[91].backends[6])); + _register_command_CCV_NNC_UPSAMPLE_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[48].backends[6])); + _register_command_CCV_NNC_UPSAMPLE_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[49].backends[6])); + _register_command_CCV_NNC_SET_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[4].backends[6])); + _register_command_CCV_NNC_SET_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[5].backends[6])); + _register_command_CCV_NNC_DATA_TRANSFER_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[6].backends[6])); + _register_command_CCV_NNC_DATA_TRANSFER_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[7].backends[6])); + _register_command_CCV_NNC_FORMAT_TRANSFORM_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[98].backends[6])); + _register_command_CCV_NNC_FORMAT_TRANSFORM_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[99].backends[6])); + _register_command_CCV_NNC_TRANSPOSE_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[52].backends[6])); + _register_command_CCV_NNC_TRANSPOSE_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[53].backends[6])); + _register_command_CCV_NNC_DATATYPE_CONVERSION_FORWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[32].backends[6])); + _register_command_CCV_NNC_DATATYPE_CONVERSION_BACKWARD_backend_CCV_NNC_BACKEND_MPS(&(init_map[33].backends[6])); #endif } diff --git a/lib/nnc/mfa/ccv_nnc_mfa_cmul.cpp b/lib/nnc/mfa/ccv_nnc_mfa_cmul.cpp index 171a5e20a..b575ff531 100644 --- a/lib/nnc/mfa/ccv_nnc_mfa_cmul.cpp +++ b/lib/nnc/mfa/ccv_nnc_mfa_cmul.cpp @@ -26,6 +26,7 @@ void ccv_nnc_mfa_encode_cmul(ccv_nnc_mfa_context_t* context, ccv_nnc_mfa_cmul_pa CCV_NNC_MFA_PRECONDITION(num_tensors == 3); CMulDescriptor descriptor; + descriptor.conjugate = params.conjugate ? 1 : 0; descriptor.memoryPrecision = (params.data_type == MTL::DataTypeFloat) ? GEMMOperandPrecision::FP32 : GEMMOperandPrecision::FP16; descriptor.stridesA[0] = params.astride[0]; descriptor.stridesA[1] = params.astride[1]; diff --git a/lib/nnc/mfa/ccv_nnc_mfa_cmul.hpp b/lib/nnc/mfa/ccv_nnc_mfa_cmul.hpp index f65810f29..e22234efc 100644 --- a/lib/nnc/mfa/ccv_nnc_mfa_cmul.hpp +++ b/lib/nnc/mfa/ccv_nnc_mfa_cmul.hpp @@ -2,6 +2,7 @@ #define GUARD_ccv_nnc_mfa_cmul_hpp typedef struct { + uint8_t conjugate; uint64_t data_type; uint32_t astride[3]; uint32_t bstride[3]; diff --git a/lib/nnc/mfa/v2/CMulDescriptor.cpp b/lib/nnc/mfa/v2/CMulDescriptor.cpp index f54fcd61d..f9852b6f1 100644 --- a/lib/nnc/mfa/v2/CMulDescriptor.cpp +++ b/lib/nnc/mfa/v2/CMulDescriptor.cpp @@ -6,6 +6,7 @@ bool CMulDescriptor::operator==(const CMulDescriptor& rhs) const { return memoryPrecision == rhs.memoryPrecision && + conjugate == rhs.conjugate && value == rhs.value && simd_all(stridesA == rhs.stridesA) && simd_all(stridesB == rhs.stridesB) && @@ -43,8 +44,9 @@ std::pair *> CMulDescriptor::fin }; CMulKernelDescriptor kernelDesc; - kernelDesc.memoryPrecision = memoryPrecision; + kernelDesc.conjugate = conjugate; kernelDesc.value = value; + kernelDesc.memoryPrecision = memoryPrecision; // WARNING: The owner must explicitly retain the compute pipeline. auto createPipeline = diff --git a/lib/nnc/mfa/v2/CMulDescriptor.hpp b/lib/nnc/mfa/v2/CMulDescriptor.hpp index e82937c59..57a7da0f5 100644 --- a/lib/nnc/mfa/v2/CMulDescriptor.hpp +++ b/lib/nnc/mfa/v2/CMulDescriptor.hpp @@ -8,9 +8,10 @@ #include "GEMMOperandPrecision.hpp" struct CMulKernelDescriptor { + uint8_t conjugate; + uint8_t value; GEMMOperandPrecision memoryPrecision; - unsigned int value; - constexpr bool operator==(const CMulKernelDescriptor &rhs) const { return value == rhs.value && memoryPrecision == rhs.memoryPrecision; } + constexpr bool operator==(const CMulKernelDescriptor &rhs) const { return value == rhs.value && memoryPrecision == rhs.memoryPrecision && conjugate == rhs.conjugate; } }; template<> @@ -22,7 +23,9 @@ struct std::hash struct CMulKernel; struct CMulDescriptor { - unsigned int value; + uint8_t conjugate; + + uint8_t value; GEMMOperandPrecision memoryPrecision; diff --git a/lib/nnc/mfa/v2/CMulKernel.cpp b/lib/nnc/mfa/v2/CMulKernel.cpp index b84b6e2fd..ac0f32b15 100644 --- a/lib/nnc/mfa/v2/CMulKernel.cpp +++ b/lib/nnc/mfa/v2/CMulKernel.cpp @@ -5,10 +5,12 @@ CMulKernel::CMulKernel(CMulKernelDescriptor descriptor, MTL::Device *const device) { - memoryPrecision = descriptor.memoryPrecision; + conjugate = descriptor.conjugate; value = descriptor.value; + memoryPrecision = descriptor.memoryPrecision; + source = createSource(); threadgroupMemoryAllocation = createThreadgroupMemoryAllocation(); @@ -38,8 +40,119 @@ unsigned short CMulKernel::createThreadgroupMemoryAllocation() const noexcept { std::string CMulKernel::createSource() const noexcept { std::string shader = createConstants() + "\n"; - if (value == 0) { - shader += R"( + if (conjugate) { + if (value == 0) { + shader += R"( +#include +using namespace metal; + +kernel void cmul( + device real *src0 [[buffer(0)]], + device real *src1 [[buffer(1)]], + device real *destination [[buffer(2)]], + + uint3 tpig [[thread_position_in_grid]] +) { + const uint idx = tpig.x; + if (idx >= dim0) + return; + const float a0 = (float)src0[idx * 2]; + const float a1 = (float)src0[idx * 2 + 1]; + const float b0 = (float)src1[idx * 2]; + const float b1 = (float)src1[idx * 2 + 1]; + destination[idx * 2] = (real)(a0 * b0 + a1 * b1); + destination[idx * 2 + 1] = (real)(-a0 * b1 + a1 * b0); +} + )"; + } else if (value == 1) { + shader += R"( +#include +using namespace metal; + +kernel void cmul( + device real *src0 [[buffer(0)]], + device real *src1 [[buffer(1)]], + device real *destination [[buffer(2)]], + + uint3 tpig [[thread_position_in_grid]] +) { + const uint x = tpig.x; + const uint y = tpig.y; + if (y >= dim1 || x >= dim0) + return; + const uint ida = y * astride0 + x * 2; + const uint idb = y * bstride0 + x * 2; + const uint idc = y * cstride0 + x * 2; + const float a0 = (float)src0[ida]; + const float a1 = (float)src0[ida + 1]; + const float b0 = (float)src1[idb]; + const float b1 = (float)src1[idb + 1]; + destination[idc] = (real)(a0 * b0 + a1 * b1); + destination[idc + 1] = (real)(-a0 * b1 + a1 * b0); +} + )"; + } else if (value == 2) { + shader += R"( +#include +using namespace metal; + +kernel void cmul( + device real *src0 [[buffer(0)]], + device real *src1 [[buffer(1)]], + device real *destination [[buffer(2)]], + + uint3 tpig [[thread_position_in_grid]] +) { + const uint x = tpig.x; + const uint y = tpig.y; + const uint z = tpig.z; + if (y >= dim1 || x >= dim0) + return; + const uint ida = z * astride1 + y * astride0 + x * 2; + const uint idb = z * bstride1 + y * bstride0 + x * 2; + const uint idc = z * cstride1 + y * cstride0 + x * 2; + const float a0 = (float)src0[ida]; + const float a1 = (float)src0[ida + 1]; + const float b0 = (float)src1[idb]; + const float b1 = (float)src1[idb + 1]; + destination[idc] = (real)(a0 * b0 + a1 * b1); + destination[idc + 1] = (real)(-a0 * b1 + a1 * b0); +} + )"; + } else { + shader += R"( +#include +using namespace metal; + +kernel void cmul( + device real *src0 [[buffer(0)]], + device real *src1 [[buffer(1)]], + device real *destination [[buffer(2)]], + + uint3 tpig [[thread_position_in_grid]] +) { + const uint x = tpig.x; + const uint y = tpig.y; + const uint z = tpig.z; + if (y >= dim1 || x >= dim0) + return; + const int u = z % dim2; + const int v = z / dim2; + const uint ida = v * astride2 + u * astride1 + y * astride0 + x * 2; + const uint idb = v * bstride2 + u * bstride1 + y * bstride0 + x * 2; + const uint idc = v * cstride2 + u * cstride1 + y * cstride0 + x * 2; + const float a0 = (float)src0[ida]; + const float a1 = (float)src0[ida + 1]; + const float b0 = (float)src1[idb]; + const float b1 = (float)src1[idb + 1]; + destination[idc] = (real)(a0 * b0 + a1 * b1); + destination[idc + 1] = (real)(-a0 * b1 + a1 * b0); +} + )"; + } + } else { + if (value == 0) { + shader += R"( #include using namespace metal; @@ -61,8 +174,8 @@ kernel void cmul( destination[idx * 2 + 1] = (real)(a0 * b1 + a1 * b0); } )"; - } else if (value == 1) { - shader += R"( + } else if (value == 1) { + shader += R"( #include using namespace metal; @@ -88,8 +201,8 @@ kernel void cmul( destination[idc + 1] = (real)(a0 * b1 + a1 * b0); } )"; - } else if (value == 2) { - shader += R"( + } else if (value == 2) { + shader += R"( #include using namespace metal; @@ -116,8 +229,8 @@ kernel void cmul( destination[idc + 1] = (real)(a0 * b1 + a1 * b0); } )"; - } else { - shader += R"( + } else { + shader += R"( #include using namespace metal; @@ -146,6 +259,7 @@ kernel void cmul( destination[idc + 1] = (real)(a0 * b1 + a1 * b0); } )"; + } } return shader; } diff --git a/lib/nnc/mfa/v2/CMulKernel.hpp b/lib/nnc/mfa/v2/CMulKernel.hpp index 9ce015117..cdbf0b912 100644 --- a/lib/nnc/mfa/v2/CMulKernel.hpp +++ b/lib/nnc/mfa/v2/CMulKernel.hpp @@ -15,9 +15,11 @@ struct CMulKernel { /// The number of threads per group. MTL::Size threadgroupSize; - GEMMOperandPrecision memoryPrecision; + uint8_t conjugate; + + uint8_t value; - unsigned int value; + GEMMOperandPrecision memoryPrecision; CMulKernel(CMulKernelDescriptor descriptor, MTL::Device *const device); diff --git a/test/int/nnc/cublas.tests.c b/test/int/nnc/cublas.tests.c index baf853f33..4ecd0d6a1 100644 --- a/test/int/nnc/cublas.tests.c +++ b/test/int/nnc/cublas.tests.c @@ -3098,7 +3098,7 @@ TEST_CASE("cmul in float, broadcast semantics") TEST_CASE("cmul gradient in float") { - GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_GPU_REF)); + GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_MPS)); ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new(); ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "a"); ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 20, 10), "b"); @@ -3157,7 +3157,7 @@ TEST_CASE("cmul gradient in float") TEST_CASE("cmul gradient in half precision") { - GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_GPU_REF)); + GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_MPS)); ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new(); ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "a"); ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 16F, 20, 10), "b");